In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [2]:
# Read the combined emotion corpus from the working directory and display it.
data = pd.read_csv(filepath_or_buffer="combined_data.csv")
data

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
# Number of rows per emotion label.
data.loc[:, "sentiment"].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
# Strip @mentions, URLs and every character that is not an ASCII letter,
# a space or a tab (this removes punctuation and digits as well).
import re

# Raw string so that \w, \/ and \S reach the regex engine untouched instead of
# relying on Python passing unknown escape sequences through (which warns on
# recent interpreters). Compiled once rather than on every row.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

# Missing values (NaN) are left untouched here so that the dropna() in the
# next cell can remove them; calling re.sub on a float would raise TypeError.
data['Text'] = data['Text'].apply(
    lambda x: noise_pattern.sub('', x) if isinstance(x, str) else x
)


In [5]:
# Remove, in place, every row that has a missing value in any column.
data.dropna(axis=0, how="any", inplace=True)

In [6]:
# Sanity check after cleaning: frame dimensions, then the last five rows.
print(data.shape, data.tail(), sep="\n")

(61459, 2)
      sentiment                                               Text
61454      fear               Melissa stared at her friend in dism
61455     happy  Successive state elections have seen the gover...
61456      fear               Vincent was irritated but not dismay
61457     happy  KendallHume turned back to face the dismayed coup
61458     happy                     I am dismayed  but not surpris


In [None]:
import snowballstemmer

# English Snowball stemmer shared by replace() below.
ss = snowballstemmer.stemmer('english')


def replace(x):
    """Return ``x`` with every whitespace-separated token stemmed.

    Each stemmed token is followed by one space, so a non-empty result ends
    with a trailing space and an empty or blank input yields ''.
    """
    return ''.join(ss.stemWord(token) + " " for token in x.split())


# Stem the whole text column and preview the result.
data['Text'] = data['Text'].apply(replace)
data.head()

In [None]:
def stratify(data, N, random_state=None, labels=None):
    """Build a class-balanced dataset by sampling each sentiment with replacement.

    For every label, ``N`` texts are drawn uniformly at random (with
    replacement) from the rows of ``data`` carrying that label. The result
    therefore holds ``N * len(labels)`` rows, ordered in rounds: one row per
    label, in ``labels`` order, repeated ``N`` times.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'sentiment'`` column and a ``'Text'`` column.
        The index is not used, so duplicate index labels are fine.
    N : int
        Number of samples to draw per label. ``N <= 0`` gives an empty frame.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from NumPy's global generator, so a prior
        ``np.random.seed(...)`` still controls the outcome. An int seeds a
        private generator; an existing ``RandomState`` is used as is.
    labels : sequence of str, optional
        Sentiment values to balance over. Defaults to the eight emotions of
        the combined corpus.

    Returns
    -------
    pandas.DataFrame
        Columns ``'sentiment'`` and ``'Text'`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``N > 0`` and some label has no rows in ``data``.
    """
    if labels is None:
        labels = ['fear', 'happy', 'sadness', 'neutral',
                  'love', 'anger', 'surprise', 'relief']
    else:
        labels = list(labels)

    # Nothing to draw: same empty two-column frame the row loop used to give.
    if N <= 0:
        return pd.DataFrame({'sentiment': [], 'Text': []})

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    sampled = []
    for label in labels:
        # Work on plain arrays and positions: one vectorised draw per class
        # instead of one label-based .loc lookup per sampled row.
        pool = data.loc[data['sentiment'] == label, 'Text'].to_numpy()
        if pool.size == 0:
            raise ValueError(
                "cannot sample class {!r}: no rows with that sentiment".format(label)
            )
        sampled.append(pool[rng.randint(0, pool.size, size=N)])

    # Interleave the per-class draws so each round holds one row per label.
    texts = [text for draw in zip(*sampled) for text in draw]
    sentiments = labels * N
    return pd.DataFrame({'sentiment': sentiments, 'Text': texts})
    



In [None]:

# Left disabled: would resample the whole corpus through stratify().
# strat_df = stratify(data, 10000)

# Rows per emotion label in the current frame.
data.sentiment.value_counts()

#### Split Train and Test sets:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split identical between runs. Labels stay as strings (no integer cast).
split_parts = train_test_split(
    data['Text'],
    data['sentiment'],
    test_size=.1,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts



In [None]:
# X_train_strat,y_train_strat = stratify(X_train, y_train, test_size*10000)

# X_test_strat, y_train_strat, y_test_strat = train_test_split(strat_df['Text'], 
#                                                     strat_df['sentiment'],test_size=0.10, 
#                                                     random_state=1)
#y_train_strat = y_train_strat.astype('int')
#y_test_strat = y_test_strat.astype('int')

Stratify the Dataset

In [None]:
# Seed NumPy's global generator so the random resampling below is repeatable,
# in line with the fixed random_state used for the train/test split.
# Without it every run trains and scores on different balanced samples.
np.random.seed(1)

# Re-join each split's texts and labels into one frame, the input shape
# stratify() expects ('Text' and 'sentiment' columns).
data_train = pd.concat([X_train, y_train], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

# Resample with replacement: 8000 rows per class for training and
# 2000 rows per class for testing.
train_balanced = stratify(data_train, 8000)
test_balanced = stratify(data_test, 2000)

# X and y vectors for the balanced training set
X_train_strat = train_balanced["Text"]
y_train_strat = train_balanced["sentiment"]

# X and y vectors for the balanced test set
X_test_strat = test_balanced["Text"]
y_test_strat = test_balanced["sentiment"]

#### Bag of Words: Preprocessing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the vocabulary ONCE, on the original training split, and only
# transform the balanced set with it. Fitting the same vectorizer a second
# time would replace its vocabulary, so the columns of X_train_counts would
# no longer line up with what count_vect.transform() produces later for the
# test sets and the unseen data.
count_vect = CountVectorizer(stop_words='english', max_features=8000)
X_train_counts = count_vect.fit_transform(X_train)
X_train_strat_counts = count_vect.transform(X_train_strat)

# Same rule for the IDF weights: learn them once, then reuse them, so both
# classifiers and every later transform() call share one feature space.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_strat_tfidf = tfidf_transformer.transform(X_train_strat_counts)


### Multinomial Naive Bayes

Turn the dataset into per-word sentiment probabilities using Bayes' rule: $P(\text{sentiment} \mid \text{word}) = \dfrac{P(\text{word} \mid \text{sentiment})\,P(\text{sentiment})}{P(\text{word})}$

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Two independent classifiers with default settings: one for the original
# (skewed) training set and one for the balanced training set.
naive_bayes_unbalanced, naive_bayes_balanced = MultinomialNB(), MultinomialNB()

In [None]:
# Train each classifier on its own TF-IDF matrix and label vector.
naive_bayes_unbalanced.fit(X=X_train_tfidf, y=y_train)
naive_bayes_balanced.fit(X=X_train_strat_tfidf, y=y_train_strat)

In [None]:
# Turn both held-out sets into term counts with the fitted vectorizer ...
X_test_counts, X_test_strat_counts = (
    count_vect.transform(documents) for documents in (X_test, X_test_strat)
)

# ... then weight those counts with the fitted TF-IDF transformer.
X_test_tfidf, X_test_strat_tfidf = (
    tfidf_transformer.transform(counts)
    for counts in (X_test_counts, X_test_strat_counts)
)


In [None]:
# Every model is scored on both test sets:
#   prediction1: skewed model   on skewed test set
#   prediction2: balanced model on balanced test set
#   prediction3: skewed model   on balanced test set
#   prediction4: balanced model on skewed test set
prediction1, prediction3 = (
    naive_bayes_unbalanced.predict(features)
    for features in (X_test_tfidf, X_test_strat_tfidf)
)
prediction4, prediction2 = (
    naive_bayes_balanced.predict(features)
    for features in (X_test_tfidf, X_test_strat_tfidf)
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (printed label, scoring function, extra keyword arguments).
# Precision, recall and F1 are macro-averaged over the classes.
metric_table = (
    ('Accuracy score', accuracy_score, {}),
    ('precision score', precision_score, {'average': 'macro'}),
    ('recall score', recall_score, {'average': 'macro'}),
    ('f1 score', f1_score, {'average': 'macro'}),
)

# (heading, true labels, predicted labels) for each train/test combination.
scenarios = (
    ('Skewed training on skewed data:', y_test, prediction1),
    ('Balanced training on balanced data:', y_test_strat, prediction2),
    ('Skewed training on balanced data:', y_test_strat, prediction3),
    ('Balanced training on skewed data:', y_test, prediction4),
)

for position, (heading, truth, guess) in enumerate(scenarios):
    if position > 0:
        print()  # blank line between consecutive reports
    print(heading)
    for metric_label, scorer, options in metric_table:
        print('{}: {}'.format(metric_label, scorer(truth, guess, **options)))

# Unseen Data

In [None]:
real = pd.read_csv('realWorldEmotions.csv')

# Same cleaning as for the training text: strip @mentions, URLs and every
# character that is not an ASCII letter, a space or a tab.
# The pattern is a raw string so \w, \/ and \S reach the regex engine as
# written instead of relying on Python passing unknown escapes through.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

# Non-string cells (e.g. NaN) are passed through unchanged so that dropna()
# below can remove them; re.sub on a float would raise TypeError.
real['Text'] = real['Text'].apply(
    lambda x: noise_pattern.sub('', x) if isinstance(x, str) else x
)
real.dropna(inplace=True)


real.head()


In [None]:
# Rows per label in the unseen data set.
real.Sentiment.value_counts()

In [None]:
# Rows per label in the training corpus, for comparison with the unseen data.
pd.Series.value_counts(data["sentiment"])

In [None]:
# Rename the 'joy' label to 'happy', the name used in the training corpus.
# Done as one vectorised replace on the column's values. Using enumerate()
# positions as .loc labels is only correct while the index is exactly
# 0..n-1; after dropna() removes rows it edits the wrong rows or appends
# new ones.
real['Sentiment'] = real['Sentiment'].replace('joy', 'happy')

In [None]:
# Label counts of the unseen data after the relabelling step.
real.loc[:, 'Sentiment'].value_counts()


In [None]:
# Stem the unseen texts with the same replace() helper used on the training text.
real['Text'] = real['Text'].apply(replace)
real.head()

In [None]:
# Documents and their reference labels from the unseen data set
X, y = real["Text"], real["Sentiment"]

# Term counts from the fitted vectorizer, then TF-IDF weights from the
# fitted transformer (neither is re-fitted here).
X_count = count_vect.transform(X)
X_tfidf = tfidf_transformer.transform(X_count)

In [None]:
# Label the unseen documents with both trained classifiers.
pred_unbalanced = naive_bayes_unbalanced.predict(X_tfidf)
pred_balanced = naive_bayes_balanced.predict(X_tfidf)


In [None]:
# (printed label, scoring function, extra keyword arguments).
# Precision, recall and F1 are macro-averaged over the classes.
real_metric_table = (
    ('Accuracy score', accuracy_score, {}),
    ('precision score', precision_score, {'average': 'macro'}),
    ('recall score', recall_score, {'average': 'macro'}),
    ('f1 score', f1_score, {'average': 'macro'}),
)

# (heading, predicted labels) for each classifier on the unseen data.
real_scenarios = (
    ('Skewed training on real data:', pred_unbalanced),
    ('Balanced training on real data:', pred_balanced),
)

for position, (heading, guess) in enumerate(real_scenarios):
    if position > 0:
        print('\n\n')  # visual gap between the two reports
    print(heading)
    for metric_label, scorer, options in real_metric_table:
        print('{}: {}'.format(metric_label, scorer(y, guess, **options)))