In [1]:
#Import Libraries
from wordcloud import WordCloud,STOPWORDS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neural_network import MLPClassifier
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import seaborn as sns
from numpy import nan
from bs4 import BeautifulSoup    
from nltk.stem import WordNetLemmatizer
from math import sqrt
%matplotlib inline



In [2]:
#Import Training set
training_set= pd.read_csv('dataset.tsv', sep='\t')

In [3]:
#Training set Summary
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.2+ MB


In [4]:
#Snippet of training set
training_set.head()


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# As a sanity check, we print out the size of the training data.
print('Training set shape: ', training_set.shape)


('Training set shape: ', (156060, 4))


In [6]:
# Subsample the data for more efficient code execution in this exercise
num_train=50000
training_set = training_set[0:num_train]
print('Training set shape: ', training_set.shape)
training_set.head()

('Training set shape: ', (5000, 4))


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
#Create a new column that transforms sentiment (integer) to a phrase 
Sentiment_words=[]
for row in training_set['Sentiment']:
    if row ==0:
        Sentiment_words.append('negative')
    elif row == 1:
        Sentiment_words.append('somewhat negative')
    elif row == 2:
        Sentiment_words.append('neutral')
    elif row == 3:
        Sentiment_words.append('somewhat positive')
    elif row == 4:
        Sentiment_words.append('positive')
        
training_set['Sentiment_words'] = Sentiment_words
training_set.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Sentiment_words
0,1,1,A series of escapades demonstrating the adage ...,1,somewhat negative
1,2,1,A series of escapades demonstrating the adage ...,2,neutral
2,3,1,A series,2,neutral
3,4,1,A,2,neutral
4,5,1,series,2,neutral


In [8]:
#Count appearances of each of the 5 Sentiment Phrases
word_count=pd.value_counts(training_set['Sentiment_words'].values, sort=False)
word_count


somewhat negative     777
somewhat positive     898
positive              251
negative              155
neutral              2919
dtype: int64

In [9]:
#Preprocessing steps
def review_to_words(raw_review): 
    review =raw_review
    review = re.sub('[^a-zA-Z]', ' ',review)
    review = review.lower()
    review = review.split()
    lemmatizer = WordNetLemmatizer()
    review = [lemmatizer.lemmatize(w) for w in review if not w in set(stopwords.words('english'))]
    return (' '.join(review))

In [10]:
#Transforms column phrase of the training data into a list. Each element of the list is a phrase which is preprocessed 
#by the review_to_words function
vocabulary= []
for i in range(0, training_set.shape[0]):
    vocabulary.append(review_to_words(training_set['Phrase'][i]))
  

In [11]:
#We create a new column in our training set, called new_Phrase, which is our previous preprocessed list.
training_set['new_Phrase']=vocabulary
training_set.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Sentiment_words,new_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,somewhat negative,series escapade demonstrating adage good goose...
1,2,1,A series of escapades demonstrating the adage ...,2,neutral,series escapade demonstrating adage good goose
2,3,1,A series,2,neutral,series
3,4,1,A,2,neutral,
4,5,1,series,2,neutral,series


In [12]:
# We delete the previous column "Phrase" from our training set.
training_set.drop(['Phrase'],axis=1,inplace=True)
training_set.head()

Unnamed: 0,PhraseId,SentenceId,Sentiment,Sentiment_words,new_Phrase
0,1,1,1,somewhat negative,series escapade demonstrating adage good goose...
1,2,1,2,neutral,series escapade demonstrating adage good goose
2,3,1,2,neutral,series
3,4,1,2,neutral,
4,5,1,2,neutral,series


In [13]:
#get the tokens for each phrase
phrases_tokens = []
for phrase in training_set['new_Phrase'][:num_train]:
    phrases_tokens.append(nltk.word_tokenize(phrase) )


In [14]:
#get the tag for each token
tags = []
for token in phrases_tokens:
    tags.append(nltk.pos_tag(token))

In [15]:
from nltk.corpus import sentiwordnet as swn
scores = []
# in each loop the words of each sentence are analyzed
for index, sentence in enumerate(tags):
    scores.append([])

    for tagged_word in sentence:

        #create new tag (a:adjective or v:verb)
        ntag = None

        if tagged_word[1].startswith('JJ'):
            ntag = 'a'
        elif tagged_word[1].startswith('NN'):
            ntag = 'n'
        elif tagged_word[1].startswith('V'):
            ntag = 'v'
        elif tagged_word[1].startswith('R'):
            ntag = 'r'
        else:
            ntag = ''

        if (ntag != None):
            # Getting average of all possible synsets
            synsets = swn.senti_synsets(tagged_word[0], ntag)
            
            #score keeps sum(pos-neg) for each synset
            score = 0
            if (len(synsets) > 0):
                for s in synsets:
                    score += s.pos_score() - s.neg_score()
                #the final score for a word is the average score of all its synsets
                scores[index].append(score / len(synsets))

In [16]:
#sentiments for each sentence
sentence_sent = []

#sentiment for each sentence is the average sentiment of its words
for score in scores:
    if len(score)==0:
        #if sentnece has no words then append it as neutral
        sentence_sent.append(0)
    else:
        #the sentiment of a sentence is the average sentiment of its words
        sentence_sent.append(sum([word_score for word_score in score]) / len(score))

In [17]:
# get the predictions
pred = []
for sen in sentence_sent:
    if sen <-0.6:
        pred.append('negative')
    elif sen<-0.2:
        pred.append('somewhat negative')
    elif sen<0.2:
        pred.append('neutral')
    elif sen<0.6:
        pred.append('somewhat positive')
    else:
        pred.append('positive')

In [18]:
#compute the confusion matrix
from sklearn.metrics import confusion_matrix

conf =  confusion_matrix( training_set['Sentiment_words'][:num_train] , pred,
                  labels=["negative", "somewhat negative", "neutral", "somewhat positive", "positive"])

print conf

[[   2   13  133    6    1]
 [   7   66  661   42    1]
 [   5   91 2667  149    7]
 [   0   14  760  116    8]
 [   1    2  203   43    2]]


In [19]:
#compute TP, TN, FP, FN
import numpy as np
FP = conf.sum(axis=1) - np.diag(conf)
FN = conf.sum(axis=0) - np.diag(conf)
TP = np.diag(conf)
TN = np.sum(conf) - (FP + FN + TP)
print "TP = %s \n FN = %s \n FP = %s \n TN = %s" %(TP, FN, FP, TN)

TP = [   2   66 2667  116    2] 
 FN = [  13  120 1757  240   17] 
 FP = [153 711 252 782 249] 
 TN = [4832 4103  324 3862 4732]


In [20]:
#function to calculate metrics for each label
def calc_measures(TP, FN, FP, TN, index):
    Prec = float(TP[index]) / (TP[index] + FP[index])
    Acc = float(TP[index]) / (TP[index] + FP[index] + FN[index] + TN[index])
    Rec = float(TP[index]) / (TP[index] + FN[index])
    F1 = 2 * float(Rec * Prec) / (Rec + Prec)
    return Prec, Acc, Rec, F1

In [21]:
#calculate average precission, accuracy, recall, F1 for each class
avgPrec = avgAcc = avgRec = avgF1 = 0
for idx, sent in enumerate(["negative", "somewhat negative", "neutral", "somewhat positive", "positive"]):
    print "Metrics for class: %s" %(sent)
    Prec, Acc, Rec, F1 = calc_measures(TP, FN, FP, TN, idx)
    print "Prec = %s \n Acc = %s \n Rec = %s \n F1 = %s" %(Prec*100, Acc*100, Rec*100, F1*100)
    avgPrec = avgPrec+Prec
    avgAcc = avgAcc+Acc
    avgRec = avgRec+Rec
    avgF1 = avgF1+F1

avgPrec = avgAcc = avgRec = avgF1 = 0
for idx, sent in enumerate(["negative", "somewhat negative", "neutral", "somewhat positive", "positive"]):
    print "Measure for class: %s" %(sent)
    Prec, Acc, Rec, F1 = calc_measures(TP, FN, FP, TN, idx)
    print "Prec = %s \n Acc = %s \n Rec = %s \n F1 = %s" %(Prec*100, Acc*100, Rec*100, F1*100)
    avgPrec = avgPrec+Prec
    avgAcc = avgAcc+Acc
    avgRec = avgRec+Rec
    avgF1 = avgF1+F1





Metrics for class: negative
Prec = 1.29032258065 
 Acc = 0.04 
 Rec = 13.3333333333 
 F1 = 2.35294117647
Metrics for class: somewhat negative
Prec = 8.49420849421 
 Acc = 1.32 
 Rec = 35.4838709677 
 F1 = 13.707165109
Metrics for class: neutral
Prec = 91.3669064748 
 Acc = 53.34 
 Rec = 60.2848101266 
 F1 = 72.6406101049
Metrics for class: somewhat positive
Prec = 12.9175946548 
 Acc = 2.32 
 Rec = 32.5842696629 
 F1 = 18.5007974482
Metrics for class: positive
Prec = 0.796812749004 
 Acc = 0.04 
 Rec = 10.5263157895 
 F1 = 1.48148148148
Measure for class: negative
Prec = 1.29032258065 
 Acc = 0.04 
 Rec = 13.3333333333 
 F1 = 2.35294117647
Measure for class: somewhat negative
Prec = 8.49420849421 
 Acc = 1.32 
 Rec = 35.4838709677 
 F1 = 13.707165109
Measure for class: neutral
Prec = 91.3669064748 
 Acc = 53.34 
 Rec = 60.2848101266 
 F1 = 72.6406101049
Measure for class: somewhat positive
Prec = 12.9175946548 
 Acc = 2.32 
 Rec = 32.5842696629 
 F1 = 18.5007974482
Measure for class: p

In [22]:
#compute average metrics
avgPrec = float(avgPrec)/5
avgAcc = float(avgAcc)
avgRec = float(avgRec)/5
avgF1 = float(avgF1)/5
print "Average statistics"
print "Average Prec = %s \n Average Acc = %s \n Average Rec = %s \n Average F1 = %s" %(avgPrec*100, avgAcc*100, avgRec*100, avgF1*100)


Average statistics
Average Prec = 22.9731689907 
 Average Acc = 57.06 
 Average Rec = 30.442519976 
 Average F1 = 21.736599064


In [23]:
#Feature Extraction
#Split train-test set
cv = CountVectorizer(max_features = 3000,ngram_range=(1, 2))
x__train = x__test=cv.fit_transform(vocabulary).toarray()
y = training_set.iloc[:, 2].values
X_train, X_test, y_train, y_test = train_test_split(x__train, y, test_size = 0.20, random_state = 0)
print(cv.get_feature_names())
print(x__train.shape)

[u'abandoned', u'abandoned still', u'abel', u'abel ferrara', u'aborted', u'aborted attempt', u'absolute', u'absolute joy', u'absorbing', u'absorbing character', u'absorbing documentary', u'accent', u'accent rrb', u'accurate', u'accurate depiction', u'achieved', u'achieved lottery', u'achievement', u'achievement director', u'achieving', u'achieving honest', u'across', u'across relic', u'acting', u'acting workshop', u'action', u'action flick', u'action scene', u'action screenwriter', u'action speed', u'actor', u'actor exercise', u'actor never', u'actor performance', u'actor well', u'actress', u'actress trying', u'actually', u'actually rather', u'adage', u'adage good', u'adaptation', u'adaptation mary', u'adolescent', u'adolescent audience', u'adrenaline', u'adrenaline jolt', u'affecting', u'affecting portrait', u'affection', u'affection original', u'age', u'age first', u'agers', u'agers stumble', u'aggressive', u'aggressive self', u'aid', u'aid wisecracking', u'aim', u'aim funny', u'aim 

In [24]:
#Feature Selection
X_new = SelectKBest(chi2, k=2000).fit_transform(x__train, y)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)

(5000, 2000)


In [25]:
#Find best k
for k in range(1000,3000,200):
 X_new = SelectKBest(chi2, k).fit_transform(x__train, y)
 X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)
 classifier = svm.LinearSVC()
 classifier.fit(X_train, y_train)
 y_pred = classifier.predict(X_test)
 print("SVM Classifier")
 print( 'Accuracy:', accuracy_score(y_pred,y_test))
 print( 'F1 score:', f1_score(y_pred,y_test,average="macro"))
 print ('Recall:', recall_score(y_pred,y_test,average="macro"))
 print ('Precision:', precision_score(y_pred,y_test,average="macro"))
 print(k)

SVM Classifier
('Accuracy:', 0.70499999999999996)
('F1 score:', 0.52445614382346795)
('Recall:', 0.61804500569601883)
('Precision:', 0.4822423195047511)
1000


SVM Classifier
('Accuracy:', 0.69499999999999995)
('F1 score:', 0.51916498866443717)
('Recall:', 0.6051939508875348)
('Precision:', 0.4785509107375624)
1200


SVM Classifier
('Accuracy:', 0.69799999999999995)
('F1 score:', 0.51829395889944896)
('Recall:', 0.59095890310006394)
('Precision:', 0.48201911883004794)
1400


SVM Classifier
('Accuracy:', 0.68799999999999994)
('F1 score:', 0.50449635501327172)
('Recall:', 0.57108197505659264)
('Precision:', 0.46988635847850679)
1600


SVM Classifier
('Accuracy:', 0.68400000000000005)
('F1 score:', 0.50313086943845353)
('Recall:', 0.56619871921323528)
('Precision:', 0.46938624302696963)
1800


SVM Classifier
('Accuracy:', 0.68200000000000005)
('F1 score:', 0.50202168488294008)
('Recall:', 0.56366164097154847)
('Precision:', 0.4687364037385498)
2000


SVM Classifier
('Accuracy:', 0.68000000000000005)
('F1 score:', 0.50121952263372582)
('Recall:', 0.55767340557646361)
('Precision:', 0.46955934432213831)
2200


SVM Classifier
('Accuracy:', 0.67400000000000004)
('F1 score:', 0.49786053941765795)
('Recall:', 0.54731218107920043)
('Precision:', 0.46889308676099162)
2400


SVM Classifier
('Accuracy:', 0.67800000000000005)
('F1 score:', 0.50403643487178074)
('Recall:', 0.55265227734651867)
('Precision:', 0.47499200845883688)
2600


SVM Classifier
('Accuracy:', 0.68200000000000005)
('F1 score:', 0.50732019922178073)
('Recall:', 0.55632195180965893)
('Precision:', 0.4780171596648789)
2800


In [None]:
#Feature Weighting
vectorizer = TfidfVectorizer(max_df=0.003,min_df=0.001,ngram_range=(1,2))
x__train = x__test=vectorizer.fit_transform(vocabulary).toarray()
y = training_set.iloc[:, 2].values
X_train, X_test, y_train, y_test = train_test_split(x__train, y, test_size = 0.20, random_state = 0)
print(vectorizer.get_feature_names())
print(x__train.shape)

In [26]:
#Bayes Classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Bayes Classifier")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

Bayes Classifier
('Accuracy:', 63.800000000000004)
('F1 score:', 51.590254570902857)
('Recall:', 51.869273239596517)
('Precision:', 51.677976748791579)


In [27]:
#Decision tree
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Decision tree")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

Decision tree
('Accuracy:', 65.900000000000006)
('F1 score:', 49.767081897599056)
('Recall:', 52.625833668350971)
('Precision:', 47.875072291435636)


In [28]:
#Neural Network
classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Neural Network")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

Neural Network
('Accuracy:', 58.199999999999996)
('F1 score:', 23.991962002192182)
('Recall:', 21.059624904675118)
('Precision:', 29.143508372052253)


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [29]:
#3-NN Classifier
classifier= KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("3-NN Classifier")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

3-NN Classifier
('Accuracy:', 65.5)
('F1 score:', 45.763639217650024)
('Recall:', 63.891771149253941)
('Precision:', 41.343685869892525)


In [30]:
#SVM Classifier
classifier = svm.LinearSVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("SVM Classifier")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

SVM Classifier
('Accuracy:', 68.200000000000003)
('F1 score:', 50.732019922178075)
('Recall:', 55.632195180965894)
('Precision:', 47.80171596648789)


In [31]:
#Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Logistic Regression Classifier")
print( 'Accuracy:', accuracy_score(y_pred,y_test)*100)
print( 'F1 score:', f1_score(y_pred,y_test,average="macro")*100)
print ('Recall:', recall_score(y_pred,y_test,average="macro")*100)
print ('Precision:', precision_score(y_pred,y_test,average="macro")*100)

Logistic Regression Classifier
('Accuracy:', 68.0)
('F1 score:', 50.018117244943817)
('Recall:', 67.307912845496944)
('Precision:', 44.3327868780068)
