In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

In [2]:
data = pd.read_csv("combined_data.csv")
data

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
61454,fear,Melissa stared at her friend in dism
61455,happy,Successive state elections have seen the gover...
61456,fear,Vincent was irritated but not dismay
61457,happy,Kendall-Hume turned back to face the dismayed ...


In [3]:
data["sentiment"].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
# Removing punctuation, URL, and tags
import re
data['Text'] = data['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))

In [5]:
data.dropna(inplace=True)

In [6]:
print(data.shape)
data.head()

(61459, 2)


Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier a...
1,sadness,Layin n bed with a headache ughhhhwaitin on y...
2,sadness,Funeral ceremonygloomy friday
3,happy,wants to hang out with friends SOON
4,neutral,We want to trade with someone who has Houston...


In [7]:
from nltk.stem import SnowballStemmer

In [8]:
stemmer = SnowballStemmer("english") 
# Split string on each row into a list of words
data['Text'] = data.apply(lambda x : x['Text'].split(" "),axis=1)
# Apply the stemmer on each word
data['Text'] = data['Text'].apply(lambda x: [stemmer.stem(y) for y in x])
# Join list of words into a single string for each row
data['Text'] = data['Text'].apply(' '.join)   # join list of strings into one string for each row
data

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier a...
1,sadness,layin n bed with a headach ughhhhwaitin on yo...
2,sadness,funer ceremonygloomi friday
3,happy,want to hang out with friend soon
4,neutral,we want to trade with someon who has houston ...
...,...,...
61454,fear,melissa stare at her friend in dism
61455,happy,success state elect have seen the govern parti...
61456,fear,vincent was irrit but not dismay
61457,happy,kendallhum turn back to face the dismay coup


In [None]:
#build new non-skewed dataset

#sample with replacement
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [None]:
data = stratify(data, 10000)
data['sentiment'].value_counts()

#### Split Train and Test sets:

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['Text'], 
                                                    data['sentiment'],test_size=0.20, 
                                                    random_state=1)

#### Word of Bags: Preprocessing

In [10]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# # Instantiate the CountVectorizer method
# tf_idf = TfidfVectorizer(stop_words='english',max_features=5000, min_df = 10, max_df = 0.1)
# tf_idf.fit(data['Text'])
# # Fit the training data and then return the matrix
# training_data = tf_idf.transform(X_train)
# # Transform testing data and return the matrix. 
# testing_data = tf_idf.transform(X_test)

In [11]:
training_data.shape

(49167, 3917)

In [12]:
# count_vector.get_feature_names()
tf_idf.get_feature_names()

['abandon',
 'abil',
 'abit',
 'abl',
 'abov',
 'absolut',
 'abt',
 'abus',
 'ac',
 'academ',
 'accent',
 'accept',
 'access',
 'accid',
 'accident',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accus',
 'ace',
 'ach',
 'achiev',
 'acknowledg',
 'act',
 'action',
 'activ',
 'actor',
 'actual',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adjust',
 'admir',
 'admit',
 'adopt',
 'ador',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'advertis',
 'advic',
 'affair',
 'affect',
 'affection',
 'afford',
 'afraid',
 'african',
 'afternoon',
 'afterward',
 'age',
 'agent',
 'aggrav',
 'aggress',
 'agit',
 'ago',
 'agre',
 'ah',
 'aha',
 'ahaha',
 'ahead',
 'ahh',
 'ahhh',
 'ahhhh',
 'ahhhhh',
 'aid',
 'aim',
 'aint',
 'air',
 'airport',
 'aka',
 'al',
 'ala',
 'alarm',
 'album',
 'alcohol',
 'alex',
 'ali',
 'alic',
 'alien',
 'aliv',
 'allen',
 'allergi',
 'allow',
 'alon',
 'alot',
 'alreadi',
 'alright',
 'altern',
 'altogeth',
 'alway',
 'amaz',
 'amazon'

### Multinomial Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)
predictions = naive_bayes.predict(testing_data)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))

Accuracy score: 0.3651155222909209
