In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [2]:
data = pd.read_csv("combined_data.csv")
data

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
data["sentiment"].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
# Removing punctuation, URL, and tags
import re
data['Text'] = data['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))


In [5]:
data.dropna(inplace=True)

In [6]:
print(data.shape)
data.head()
print(data.tail())

(61459, 2)
      sentiment                                               Text
61454      fear               Melissa stared at her friend in dism
61455     happy  Successive state elections have seen the gover...
61456      fear               Vincent was irritated but not dismay
61457     happy  KendallHume turned back to face the dismayed coup
61458     happy                     I am dismayed  but not surpris


In [7]:
import snowballstemmer
ss = snowballstemmer.stemmer('english')
def replace(x):
    words = x.split()
    newtext = ''
    for w in words:
        n = ss.stemWord(w)
        newtext += n
        newtext += " "
    return newtext
data['Text'] = data['Text'].apply(lambda x: replace(x))
data.head()

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier and...
1,sadness,Layin n bed with a headach ughhhhwaitin on you...
2,sadness,Funer ceremonygloomi friday
3,happy,want to hang out with friend SOON
4,neutral,We want to trade with someon who has Houston t...


In [8]:

data['sentiment'].value_counts()



fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [9]:
#build new non-skewed dataset

#sample with replacement
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)
    



In [10]:

dstrat10000 = stratify(data, 10000)
dstrat10000['sentiment'].value_counts()

love        10000
surprise    10000
fear        10000
relief      10000
sadness     10000
anger       10000
happy       10000
neutral     10000
Name: sentiment, dtype: int64

#### Split Train and Test sets:

In [16]:
from sklearn.model_selection import train_test_split

rawX_train, rawy_train, rawX_test, rawX_test = train_test_split(data['Text'], 
                                                    data['sentiment'],test_size=0.10, 
                                                    random_state=1)
s5X_train, s5y_test, s5X_test, s5y_test = train_test_split(dstrat5000['Text'], 
                                                    dstrat5000['sentiment'],test_size=0.10, 
                                                    random_state=1)
s10X_train, s10y_tes, s10X_test, s10y_test = train_test_split(dstrat10000['Text'], 
                                                    dstrat10000['sentiment'],test_size=0.10, 
                                                    random_state=1)


#### Word of Bags: Preprocessing

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Instantiate the CountVectorizer method

count_vector = CountVectorizer(stop_words='english', max_features=10000)

tfid_vector = TfidfVectorizer(stop_words = 'english')


# Fit the training data and then return the matrix
training_rawdata = tfid_vector.fit_transform(rawX_train)
training_s5data = tfid_vector.fit_transform(s5X_train)
training_s10data = tfid_vector.fit_transform(s10X_train)

# Transform testing data and return the matrix. 
testing_rawdata = tfid_vector.fit_transform(rawX_test)
testing_s5data = tfid_vector.fit_transform(s5X_test)
testing_s10data = tfid_vector.fit_transform(s10X_test)


### Multinomial Naive Bayes

Turn the Dataset into a list of P(sentiment | word) = P(word | sentiment) * p(sentiment) / p(word)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
naive_bayes = MultinomialNB()

In [19]:
print(training_rawdata.shape)
print(testing_rawdata.shape)

naive_bayes.fit(training_rawdata,rawy_train)
predictions = naive_bayes.predict(testing_rawdata)

(55313, 8)
(6146, 8)


In [20]:

print('Accuracy score: {}'.format(accuracy_score(rawy_test, predictions)))
print('precision score: {}'.format(precision_score(rawy_test, predictions, average='macro')))
print('recall score: {}'.format(recall_score(rawy_test, predictions,average='macro')))
print('f1 score: {}'.format(f1_score(rawy_test, predictions,average='macro')))



Accuracy score: 0.0
precision score: 0.0
recall score: 0.0
f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

naive_bayes.fit(training_s5data,s5y_train)
predictions =  naive_bayes.predict(testing_s5data)

In [None]:
print("train on balanaced test on balanced")
print('Accuracy score: {}'.format(accuracy_score(s5y_test, predictions)))
print('precision score: {}'.format(precision_score(s5y_test, predictions, average='macro')))
print('recall score: {}'.format(recall_score(s5y_test, predictions,average='macro')))
print('f1 score: {}'.format(f1_score(s5y_test, predictions,average='macro')))

In [None]:
predictions =  naive_bayes.predict(testing_rawdata)

In [None]:
print("train on balanaced test on unbalanced")
print('Accuracy score: {}'.format(accuracy_score(rawy_test, predictions)))
print('precision score: {}'.format(precision_score(rawy_test, predictions, average='macro')))
print('recall score: {}'.format(recall_score(rawy_test, predictions,average='macro')))
print('f1 score: {}'.format(f1_score(rawy_test, predictions,average='macro')))

Below is Garbage