# ScoreTask


 * `ScoreTask`: Uses the trained model to compute the sentiment for each city.
    * Use the trained model to predict, for each city, the probability/score of
      the negative, neutral and positive sentiment.
    * Output a list of cities, sorted by the predicted positive sentiment score, to the output file.

""" Uses the scored model to compute the sentiment for each city.

        Output file should be a four column CSV with columns:
        - city name
        - negative probability
        - neutral probability
        - positive probability
    """

In [17]:
import sys
from IPython.display import clear_output
clear_output(wait=True)
import pandas as pd
import numpy as np
from modules.helpers import read_in_dataset
import inspect
from sklearn.metrics.pairwise import euclidean_distances
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier

# Show the source of the imported helper so the notebook documents how the
# dataset is read (the printed source appears as this cell's output).
print(inspect.getsource(read_in_dataset))

def read_in_dataset(dset, verbose=False):
    """Load the CSV file ``<dset>.csv`` into a pandas DataFrame.

    Keyword arguments:
    dset -- name (or path) of the dataset without the ``.csv`` extension
    verbose -- when True, print the shape, the columns and the first rows

    Returns:
    a pandas dataframe read with ISO-8859-1 encoding
    """
    csv_path = '{0}.csv'.format(dset)
    frame = pd.read_csv(csv_path, encoding="ISO-8859-1")

    # Quiet mode: nothing to report, hand the frame straight back.
    if not verbose:
        return frame

    n_rows, n_cols = frame.shape
    print('\n{0:*^80}'.format(' Reading in the {0} dataset '.format(dset)))
    print("\nit has {0} rows and {1} columns".format(n_rows, n_cols))
    print('\n{0:*^80}\n'.format(' It has the following columns '))
    print(frame.columns)
    print('\n{0:*^80}\n'.format(' The first 5 rows look like this '))
    print(frame.head())

    return frame



Using TensorFlow backend.


In [24]:
# Load training_data.csv and print a short summary of its contents.
training_data_df = read_in_dataset(dset='training_data', verbose=True)



********************* Reading in the training_data dataset *********************

it has 855 rows and 22 columns

************************* It has the following columns *************************

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'airline_sentiment',
       'airline_sentiment:confidence', 'negativereason',
       'negativereason:confidence', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone',
       'closest_cities', 'sentiment'],
      dtype='object')

*********************** The first 5 rows look like this ************************

    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  681448197    False   finalized                   3      2/25/15 2:26   
1  681448213    False   finalized                   3      2/25/15 9:04   
2  681448214    False   finalized      

# Determine X and y and the training set
- The "y" variable will be the multi-class sentiment (0, 1, 2 for negative, neutral and positive respectively).
- The "X" variable will be the closest city to the "tweet_coord" using Euclidean distance.

In [25]:
# Feature: closest city of each tweet; target: the multi-class sentiment label.
X = training_data_df['closest_cities']
y = training_data_df['sentiment']

# Hold out 20% of the rows for scoring; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [29]:
# One-hot encode the city names: learn the set of cities from the training
# split, then turn each training row into an indicator vector.
encoder = LabelBinarizer()
encoder.fit(X_train)
X_train_lb = encoder.transform(X_train)
X_train_lb.shape

(684, 253)

In [32]:
# Encode the held-out cities with the encoder that was fitted on the training
# split, so the test matrix uses the same columns as the training matrix.
X_test_lb = encoder.transform(X_test)

# Compute probabilities with softmax logistic regression

### Load the model

In [2]:
import pickle

# Path of the pickled classifier that this notebook scores with.
filename = 'model.pkl'

# Open the file in a context manager so the handle is closed as soon as the
# model has been deserialised (the bare open(...) call left it open).
# NOTE: pickle.load can execute arbitrary code while loading; only use model
# files that come from a trusted source.
with open(filename, 'rb') as model_file:
    softmax_clf = pickle.load(model_file)

In [40]:
# Score the encoded test cities with the loaded model.
# NOTE(review): the later cells label the columns negative / neutral / positive
# in that order — presumably this matches the model's class order; confirm.
probabilities = softmax_clf.predict_proba(X_test_lb)


# Merge the city names with the predicted sentiment probabilities

In [35]:
# Pair every test-set city with its list of predicted class probabilities.
city_name = X_test.tolist()
new_col = probabilities.tolist()
data_tuples = [(city, scores) for city, scores in zip(city_name, new_col)]

# 'newcol' temporarily holds the whole probability list for each row.
df = pd.DataFrame(data_tuples, columns=['city_name', 'newcol'])
df.head()

Unnamed: 0,city_name,newcol
0,Newark,"[0.7766233837170984, 0.15829126945162666, 0.06..."
1,Bowie,"[0.6582538859242353, 0.1635033921677491, 0.178..."
2,Franklin Park,"[0.7046701569351638, 0.09846966682839016, 0.19..."
3,Dubai,"[0.471176832517254, 0.35672426312421485, 0.172..."
4,Miami Beach,"[0.7856497243006918, 0.10336717367645293, 0.11..."


In [36]:
# Peek at the raw probability lists before they are split into columns.
df.loc[:, 'newcol'].head()

0    [0.7766233837170984, 0.15829126945162666, 0.06...
1    [0.6582538859242353, 0.1635033921677491, 0.178...
2    [0.7046701569351638, 0.09846966682839016, 0.19...
3    [0.471176832517254, 0.35672426312421485, 0.172...
4    [0.7856497243006918, 0.10336717367645293, 0.11...
Name: newcol, dtype: object

In [37]:
# Split each probability list into one column per sentiment class.
probability_columns = ['negative probability', 'neutral probability', 'positive sentiment']
expanded = pd.DataFrame(df['newcol'].tolist(), index=df.index)
df[probability_columns] = expanded

# Preview without the raw list column (this does not modify df itself).
df.drop(columns='newcol').head()

Unnamed: 0,city_name,negative probability,neutral probability,positive sentiment
0,Newark,0.776623,0.158291,0.065085
1,Bowie,0.658254,0.163503,0.178243
2,Franklin Park,0.70467,0.09847,0.19686
3,Dubai,0.471177,0.356724,0.172099
4,Miami Beach,0.78565,0.103367,0.110983


In [None]:
# Write the scores to disk as a four-column CSV: the city name plus the three
# class probabilities.
# - 'newcol' still holds the raw probability lists at this point, because the
#   earlier drop(...) was only displayed and never assigned. It is removed here
#   so it does not leak into the file; errors='ignore' keeps the cell working
#   if the column is already gone.
# - Rows are ordered by the positive score, highest first.
#   NOTE(review): descending order is assumed — confirm the required direction.
scores_df = (
    df.drop(columns='newcol', errors='ignore')
      .sort_values('positive sentiment', ascending=False)
)
scores_df.to_csv('scores.csv', index=False)