In [1]:
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd
import nltk

In [2]:
df = pd.read_csv('../MasterProject/combined_data.csv')
df.head()

Unnamed: 0,sentiment,Text
0,sadness,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,happy,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [3]:
df['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
import re
df['Text'] = df['Text'].apply(lambda x: re.sub("(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))
df.dropna(inplace=True)

In [5]:
import snowballstemmer
ss = snowballstemmer.stemmer('english')
def replace(x):
    words = x.split()
    newtext = ''
    for w in words:
        n = ss.stemWord(w)
        newtext += n
        newtext += " "
    return newtext
df['Text'] = df['Text'].apply(lambda x: replace(x))
df.head()

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier and...
1,sadness,Layin n bed with a headach ughhhhwaitin on you...
2,sadness,Funer ceremonygloomi friday
3,happy,want to hang out with friend SOON
4,neutral,We want to trade with someon who has Houston t...


In [6]:
#sample with replacement
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [7]:
strat_df = stratify(df, 10000)
strat_df['sentiment'].value_counts()

relief      10000
anger       10000
surprise    10000
neutral     10000
love        10000
happy       10000
sadness     10000
fear        10000
Name: sentiment, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Text'], 
                                                    df['sentiment'],test_size=0.10, 
                                                    random_state=1)
#y_train = y_train.astype('int')
#y_test = y_test.astype('int')


In [9]:
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(strat_df['Text'], 
                                                    strat_df['sentiment'],test_size=0.10, 
                                                    random_state=1)
#y_train_strat = y_train_strat.astype('int')
#y_test_strat = y_test_strat.astype('int')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', max_features = 10000)
X_train_counts = count_vect.fit_transform(X_train)
X_train_strat_counts = count_vect.fit_transform(X_train_strat)

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_strat_tfidf = tfidf_transformer.fit_transform(X_train_strat_counts)

In [12]:
clf1 = MLPClassifier(hidden_layer_sizes = (500, 250, 100), 
                    random_state = 1, max_iter = 12, verbose = True).fit(X_train_tfidf, y_train)

Iteration 1, loss = 1.67147535
Iteration 2, loss = 1.37292172
Iteration 3, loss = 1.13082937
Iteration 4, loss = 0.78915891
Iteration 5, loss = 0.44465552
Iteration 6, loss = 0.26126840
Iteration 7, loss = 0.18415429
Iteration 8, loss = 0.15064752
Iteration 9, loss = 0.13036244
Iteration 10, loss = 0.11846081
Iteration 11, loss = 0.11154856
Iteration 12, loss = 0.10485336




In [13]:
clf2 = MLPClassifier(hidden_layer_sizes = (500, 250, 100), 
                    random_state = 1, max_iter = 12, verbose = True).fit(X_train_strat_tfidf, y_train_strat)

Iteration 1, loss = 1.43602223
Iteration 2, loss = 0.73513930
Iteration 3, loss = 0.38431949
Iteration 4, loss = 0.20408084
Iteration 5, loss = 0.13600897
Iteration 6, loss = 0.10956713
Iteration 7, loss = 0.09603075
Iteration 8, loss = 0.08870600
Iteration 9, loss = 0.08493155
Iteration 10, loss = 0.07956781
Iteration 11, loss = 0.07494371
Iteration 12, loss = 0.07375123




In [14]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

X_test_strat_counts = count_vect.transform(X_test_strat)
X_test_strat_tfidf = tfidf_transformer.transform(X_test_strat_counts)

In [15]:
prediction1 = clf1.predict(X_test_tfidf)
prediction2 = clf2.predict(X_test_strat_tfidf)
prediction3 = clf1.predict(X_test_strat_tfidf)
prediction4 = clf2.predict(X_test_tfidf)

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Skewed training on skewed data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction1)))
print('precision score: {}'.format(precision_score(y_test, prediction1, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction1,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction1,average='macro')))

print('Balanced training on balanced data:')
print('Accuracy score: {}'.format(accuracy_score(y_test_strat, prediction2)))
print('precision score: {}'.format(precision_score(y_test_strat, prediction2, average='macro')))
print('recall score: {}'.format(recall_score(y_test_strat, prediction2,average='macro')))
print('f1 score: {}'.format(f1_score(y_test_strat, prediction2,average='macro')))

print('Skewed training on balanced data:')
print('Accuracy score: {}'.format(accuracy_score(y_test_strat, prediction3)))
print('precision score: {}'.format(precision_score(y_test_strat, prediction3, average='macro')))
print('recall score: {}'.format(recall_score(y_test_strat, prediction3,average='macro')))
print('f1 score: {}'.format(f1_score(y_test_strat, prediction3,average='macro')))

print('Balanced training on skewed data:')
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction4)))
print('precision score: {}'.format(precision_score(y_test, prediction4, average='macro')))
print('recall score: {}'.format(recall_score(y_test, prediction4,average='macro')))
print('f1 score: {}'.format(f1_score(y_test, prediction4,average='macro')))

Skewed training on skewed data:
Accuracy score: 0.1863000325414904
precision score: 0.1253310747670907
recall score: 0.12687842107499817
f1 score: 0.12218320722189495
Balanced training on balanced data:
Accuracy score: 0.803
precision score: 0.7983138177147763
recall score: 0.8048331491489811
f1 score: 0.7996250368938298
Skewed training on balanced data:
Accuracy score: 0.135875
precision score: 0.1330193634231403
recall score: 0.13561061864789453
f1 score: 0.10807107504822941
Balanced training on skewed data:
Accuracy score: 0.7092417832736739
precision score: 0.6958688987767336
recall score: 0.8016811323857247
f1 score: 0.7337439578596887
