# Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [2]:
data = pd.read_csv("combined_data.csv")
data

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
data["sentiment"].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
# Removing punctuation, URL, and tags
import re
data['Text'] = data['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))


In [5]:
data.dropna(inplace=True)

In [6]:
print(data.shape)
data.head()
print(data.tail())

(61459, 2)
      sentiment                                               Text
61454      fear               Melissa stared at her friend in dism
61455     happy  Successive state elections have seen the gover...
61456      fear               Vincent was irritated but not dismay
61457     happy  KendallHume turned back to face the dismayed coup
61458     happy                     I am dismayed  but not surpris


In [7]:
import snowballstemmer
ss = snowballstemmer.stemmer('english')
def replace(x):
    words = x.split()
    newtext = ''
    for w in words:
        n = ss.stemWord(w)
        newtext += n
        newtext += " "
    return newtext
data['Text'] = data['Text'].apply(lambda x: replace(x))
data.head()

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier and...
1,sadness,Layin n bed with a headach ughhhhwaitin on you...
2,sadness,Funer ceremonygloomi friday
3,happy,want to hang out with friend SOON
4,neutral,We want to trade with someon who has Houston t...


In [8]:
data['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [9]:
#function to build a new non-skewed dataset

#sample with replacement
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)


#### Split Train and Test sets:

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['Text'], 
                                                    data['sentiment'],test_size=0.10, 
                                                    random_state=1)


In [11]:
y_train.value_counts()

fear        14543
happy       12239
sadness      8795
neutral      8040
love         4278
anger        3669
surprise     2374
relief       1375
Name: sentiment, dtype: int64

In [12]:
y_test.value_counts()

fear        1698
happy       1269
sadness     1001
neutral      920
love         442
anger        400
surprise     265
relief       151
Name: sentiment, dtype: int64

#### Stratification (oversampling) of train and test sets
Creation of balanced train and test sets

In [13]:
# join test and train X and y's together to stratify
data_train = pd.concat([X_train,y_train],axis=1)
data_test =  pd.concat([X_test,y_test],axis=1)
train_balanced = stratify(data_train, 8000)
test_balanced  = stratify(data_test, 2000)

In [14]:
train_balanced["sentiment"].value_counts()

fear        8000
neutral     8000
love        8000
relief      8000
anger       8000
sadness     8000
surprise    8000
happy       8000
Name: sentiment, dtype: int64

In [15]:
test_balanced["sentiment"].value_counts()

relief      2000
neutral     2000
love        2000
anger       2000
fear        2000
sadness     2000
surprise    2000
happy       2000
Name: sentiment, dtype: int64

In [16]:
# create X and y vectors for balanced training set
strat_X_train = train_balanced["Text"]
strat_y_train = train_balanced["sentiment"]

In [17]:
# create X and y vectors for balanced test set
strat_X_test = test_balanced["Text"]
strat_y_test = test_balanced["sentiment"]

## Creating a document matrix

#### Training set

In [18]:
# Applying the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', max_features = 10000)
# unbalanced train
X_train_counts = count_vect.fit_transform(X_train)
# balanced train
X_train_strat_counts = count_vect.fit_transform(strat_X_train)

In [19]:
# Applying the TFidf vectorizer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# unbalanced train
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# balanced train
X_train_strat_tfidf = tfidf_transformer.fit_transform(X_train_strat_counts)

#### Test set

In [20]:
# unbalanced testset
# count vectorizer
X_test_counts = count_vect.transform(X_test)
# Tfidf transformer
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# balanced testset
# count vectorizer
X_test_strat_counts = count_vect.transform(strat_X_test)
# Tfidf transformer
X_test_strat_tfidf = tfidf_transformer.transform(X_test_strat_counts)

## Multinomial Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB

#### Training on unbalanced data set

In [22]:
clf1 = MultinomialNB().fit(X_train_tfidf, y_train)

#### Training on a balanced dataset

In [23]:
clf2 = MultinomialNB().fit(X_train_strat_tfidf, strat_y_train)

#### Applying the models

In [24]:
prediction1 = clf1.predict(X_test_tfidf)
prediction2 = clf2.predict(X_test_strat_tfidf)
prediction3 = clf1.predict(X_test_strat_tfidf)
prediction4 = clf2.predict(X_test_tfidf)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Skewed training and skewed test data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction1)))
print('precision score: {}'.format(precision_score(y_test, prediction1, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction1,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction1,average='macro')))

print('Balanced training and balanced test data:')
print('Accuracy score: {}'.format(accuracy_score(strat_y_test, prediction2)))
print('precision score: {}'.format(precision_score(strat_y_test, prediction2, average='macro')))
print('recall score: {}'.format(recall_score(strat_y_test, prediction2,average='macro')))
print('f1 score: {}'.format(f1_score(strat_y_test, prediction2,average='macro')))

print('Skewed training and balanced test data:')
print('Accuracy score: {}'.format(accuracy_score(strat_y_test, prediction3)))
print('precision score: {}'.format(precision_score(strat_y_test, prediction3, average='macro')))
print('recall score: {}'.format(recall_score(strat_y_test, prediction3,average='macro')))
print('f1 score: {}'.format(f1_score(strat_y_test, prediction3,average='macro')))

print('Balanced training and skewed test data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction4)))
print('precision score: {}'.format(precision_score(y_test, prediction4, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction4,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction4,average='macro')))

Skewed training and skewed test data:
Accuracy score: 0.18044256426944355
precision score: 0.11842648469090852
recall score: 0.11556072064069563
f1 score: 0.11154905310474224
Balanced training and balanced test data:
Accuracy score: 0.3133125
precision score: 0.31156870825388017
recall score: 0.3133125
f1 score: 0.310713851951226
Skewed training and balanced test data:
Accuracy score: 0.1135625
precision score: 0.1037259848007758
recall score: 0.1135625
f1 score: 0.0898300402135295
Balanced training and skewed test data:
Accuracy score: 0.30068337129840544
precision score: 0.2773026968106364
recall score: 0.31553148402287734
f1 score: 0.2754856095399232
