In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [2]:
data = pd.read_csv("combined_data.csv")
data

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
data["sentiment"].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
# Removing punctuation, URL, and tags
import re
data['Text'] = data['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))

In [5]:
data.dropna(inplace=True)

In [6]:
print(data.shape)
data.head()

(61459, 2)


Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier a...
1,sadness,Layin n bed with a headache ughhhhwaitin on y...
2,sadness,Funeral ceremonygloomy friday
3,happy,wants to hang out with friends SOON
4,neutral,We want to trade with someone who has Houston...


In [7]:
from nltk.stem import SnowballStemmer

In [8]:
stemmer = SnowballStemmer("english") 
# Split string on each row into a list of words
data['Text'] = data.apply(lambda x : x['Text'].split(" "),axis=1)
# Apply the stemmer on each word
data['Text'] = data['Text'].apply(lambda x: [stemmer.stem(y) for y in x])
# Join list of words into a single string for each row
data['Text'] = data['Text'].apply(' '.join)   # join list of strings into one string for each row
data

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier a...
1,sadness,layin n bed with a headach ughhhhwaitin on yo...
2,sadness,funer ceremonygloomi friday
3,happy,want to hang out with friend soon
4,neutral,we want to trade with someon who has houston ...
...,...,...
61454,fear,melissa stare at her friend in dism
61455,happy,success state elect have seen the govern parti...
61456,fear,vincent was irrit but not dismay
61457,happy,kendallhum turn back to face the dismay coup


In [9]:
#build new non-skewed dataset

#sample with replacement
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [10]:
strat_df = stratify(data, 10000)
strat_df['sentiment'].value_counts()

fear        10000
sadness     10000
surprise    10000
anger       10000
love        10000
neutral     10000
happy       10000
relief      10000
Name: sentiment, dtype: int64

#### Split Train and Test sets:

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['Text'], 
                                                    data['sentiment'],test_size=0.10, 
                                                    random_state=1)


In [12]:
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(strat_df['Text'], 
                                                    strat_df['sentiment'],test_size=0.10, 
                                                    random_state=1)

#### Word of Bags: Preprocessing

In [13]:
# Applying the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', max_features = 10000)
X_train_counts = count_vect.fit_transform(X_train)
X_train_strat_counts = count_vect.fit_transform(X_train_strat)

In [14]:
# Applying the TFidf vectorizer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_strat_tfidf = tfidf_transformer.fit_transform(X_train_strat_counts)

### Multinomial Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()

Training for unbalanced data set

In [17]:
clf1 = naive_bayes.fit(X_train_tfidf, y_train)

Training the balanced (stratified) dataset

In [18]:
clf2 = naive_bayes.fit(X_train_strat_tfidf, y_train_strat)

#### processing the test set with count and TFidf veectorizer

In [19]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

X_test_strat_counts = count_vect.transform(X_test_strat)
X_test_strat_tfidf = tfidf_transformer.transform(X_test_strat_counts)

#### Applying the models:

In [20]:
prediction1 = clf1.predict(X_test_tfidf)
prediction2 = clf2.predict(X_test_strat_tfidf)
prediction3 = clf1.predict(X_test_strat_tfidf)
prediction4 = clf2.predict(X_test_tfidf)

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Skewed training on skewed data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction1)))
print('precision score: {}'.format(precision_score(y_test, prediction1, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction1,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction1,average='macro')))

print('Balanced training on balanced data:')
print('Accuracy score: {}'.format(accuracy_score(y_test_strat, prediction2)))
print('precision score: {}'.format(precision_score(y_test_strat, prediction2, average='macro')))
print('recall score: {}'.format(recall_score(y_test_strat, prediction2,average='macro')))
print('f1 score: {}'.format(f1_score(y_test_strat, prediction2,average='macro')))

print('Skewed training on balanced data:')
print('Accuracy score: {}'.format(accuracy_score(y_test_strat, prediction3)))
print('precision score: {}'.format(precision_score(y_test_strat, prediction3, average='macro')))
print('recall score: {}'.format(recall_score(y_test_strat, prediction3,average='macro')))
print('f1 score: {}'.format(f1_score(y_test_strat, prediction3,average='macro')))

print('Balanced training on skewed data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction4)))
print('precision score: {}'.format(precision_score(y_test, prediction4, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction4,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction4,average='macro')))

Skewed training on skewed data:
Accuracy score: 0.4253172795314025
precision score: 0.40609656326742327
recall score: 0.5108284151410327
f1 score: 0.4173620090015593
Balanced training on balanced data:
Accuracy score: 0.515
precision score: 0.5071567638884378
recall score: 0.516122648163598
f1 score: 0.5085886421462789
Skewed training on balanced data:
Accuracy score: 0.515
precision score: 0.5071567638884378
recall score: 0.516122648163598
f1 score: 0.5085886421462789
Balanced training on skewed data:
Accuracy score: 0.4253172795314025
precision score: 0.40609656326742327
recall score: 0.5108284151410327
f1 score: 0.4173620090015593
