In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [54]:
df = pd.read_csv('combined_data.csv')
df.head()
df['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stopWords = stopwords.words("english")

def cleanText(text):
    
    output = ""
    
    
    for word in text.split():
        cleanWord = re.sub("[^a-zA-Z]",  # Search for all non-letters
                          " ",          # Replace all non-letters with spaces
                          str(word))
        if cleanWord in stopWords:
            pass
        else:
            output = output + " " + cleanWord
    return output

In [5]:
df.Text = df.Text.apply(lambda x: cleanText(x))

In [22]:
from sklearn.model_selection import train_test_split

In [6]:
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [8]:
#Creating a balanced dataset with 10000 records for each emotion
balancedDf = stratify(df, 10000)
balancedDf['sentiment'].value_counts()

surprise    10000
neutral     10000
sadness     10000
relief      10000
love        10000
anger       10000
happy       10000
fear        10000
Name: sentiment, dtype: int64

In [15]:
happy10k = balancedDf[balancedDf['sentiment'] == 'happy']
nonHappyDf = balancedDf[balancedDf['sentiment'] != 'happy']
nonHappy10k = nonHappyDf.sample(n=10000)
print(nonHappy10k.shape)
print(nonHappy10k['sentiment'].value_counts())

(10000, 2)
anger       1496
sadness     1456
surprise    1432
love        1418
neutral     1408
fear        1407
relief      1383
Name: sentiment, dtype: int64


In [20]:
balancedHappyVsRest = pd.concat([happy10k, nonHappy10k], axis=0)

balancedHappyVsRest['sentiment'] = np.where((balancedHappyVsRest.sentiment != 'happy'),'not happy', balancedHappyVsRest.sentiment)
print(balancedHappyVsRest['sentiment'].value_counts())

happy        10000
not happy    10000
Name: sentiment, dtype: int64


In [23]:
import time

#Balanced Happy Vs Rest
bhr_train, bhr_test = train_test_split(balancedHappyVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bhr_train['Text'])
test_vectors = vectorizer.transform(bhr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bhr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bhr_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 45.677044s; Prediction time: 6.466196s
not happy:  {'precision': 0.6845703125, 'recall': 0.7077233720343261, 'f1-score': 0.6959543310995284, 'support': 1981}
happy:  {'precision': 0.7033811475409836, 'recall': 0.6800396235760278, 'f1-score': 0.6915134726769077, 'support': 2019}


In [26]:
fear10k = balancedDf[balancedDf['sentiment'] == 'fear']
nonFearDf = balancedDf[balancedDf['sentiment'] != 'fear']
nonFear10k = nonFearDf.sample(n=10000)
print(nonFear10k.shape)
print(nonFear10k['sentiment'].value_counts())
print(fear10k['sentiment'].value_counts())

balancedFearVsRest = pd.concat([fear10k, nonFear10k], axis=0)

balancedFearVsRest['sentiment'] = np.where((balancedFearVsRest.sentiment != 'fear'),'not fear', balancedFearVsRest.sentiment)
print(balancedFearVsRest['sentiment'].value_counts())

(10000, 2)
anger       1491
neutral     1451
happy       1432
surprise    1425
sadness     1422
love        1413
relief      1366
Name: sentiment, dtype: int64
fear    10000
Name: sentiment, dtype: int64
not fear    10000
fear        10000
Name: sentiment, dtype: int64


In [28]:
#Balanced Fear Vs Rest
bfr_train, bfr_test = train_test_split(balancedFearVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bfr_train['Text'])
test_vectors = vectorizer.transform(bfr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bfr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bfr_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 40.611597s; Prediction time: 6.254709s
not fear:  {'precision': 0.6623762376237624, 'recall': 0.6754164563351842, 'f1-score': 0.6688327918020495, 'support': 1981}
fear:  {'precision': 0.6752525252525252, 'recall': 0.6622090143635463, 'f1-score': 0.6686671667916978, 'support': 2019}


In [29]:
sadness10k = balancedDf[balancedDf['sentiment'] == 'sadness']
nonSadnessDf = balancedDf[balancedDf['sentiment'] != 'sadness']
nonSadness10k = nonSadnessDf.sample(n=10000)
print(nonSadness10k.shape)
print(nonSadness10k['sentiment'].value_counts())
print(sadness10k['sentiment'].value_counts())

balancedSadVsRest = pd.concat([sadness10k, nonSadness10k], axis=0)

balancedSadVsRest['sentiment'] = np.where((balancedSadVsRest.sentiment != 'sadness'),'not sadness', balancedSadVsRest.sentiment)
print(balancedSadVsRest['sentiment'].value_counts())

(10000, 2)
surprise    1484
love        1473
happy       1456
anger       1454
relief      1386
fear        1383
neutral     1364
Name: sentiment, dtype: int64
sadness    10000
Name: sentiment, dtype: int64
not sadness    10000
sadness        10000
Name: sentiment, dtype: int64


In [31]:
#Balanced Sad Vs Rest
bsr_train, bsr_test = train_test_split(balancedSadVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bsr_train['Text'])
test_vectors = vectorizer.transform(bsr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bsr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bsr_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 29.018380s; Prediction time: 4.301528s
not sadness:  {'precision': 0.7164750957854407, 'recall': 0.7551741544674407, 'f1-score': 0.7353158024084543, 'support': 1981}
sadness:  {'precision': 0.7463389121338913, 'recall': 0.7067855373947499, 'f1-score': 0.7260239124904605, 'support': 2019}


In [50]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'], 
                                                    df['sentiment'],test_size=0.20, 
                                                    random_state=42)

X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(balancedDf['Text'], 
                                                    balancedDf['sentiment'],test_size=0.20, 
                                                    random_state=42)

In [53]:
#Train balanced, test unbalanced for Sad vs rest
sadVsRestUnbalancedDf = df
sadVsRestUnbalancedDf['sentiment'] = np.where((sadVsRestUnbalancedDf.sentiment != 'sadness'),'not sadness', sadVsRestUnbalancedDf.sentiment)
sadU_train, sadU_test, ySadU_train, ySadU_test = train_test_split(sadVsRestUnbalancedDf['Text'], 
                                                    sadVsRestUnbalancedDf['sentiment'],test_size=0.20, 
                                                    random_state=42)

# vectorizer = TfidfVectorizer(min_df = 5,
#                              max_df = 0.8,
#                              sublinear_tf = True,
#                              use_idf = True)
# train_vectors = vectorizer.fit_transform(bsr_train['Text'])
# test_vectors = vectorizer.transform(sadU_test['Text'])

# # Perform classification with SVM, kernel=linear
# classifier_linear = svm.SVC(kernel='linear')
# t0 = time.time()
# classifier_linear.fit(train_vectors, bsr_train['sentiment'])
# t1 = time.time()
# prediction_linear = classifier_linear.predict(test_vectors)
# t2 = time.time()
# time_linear_train = t1-t0
# time_linear_predict = t2-t1

# # results
# print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# report = classification_report(sadU_test['sentiment'], prediction_linear, output_dict=True)
# print('not sadness: ', report['not sadness'])
# print('sadness: ', report['sadness'])

print(bsr_test)

         sentiment                                               Text
27700  not sadness    smiley   thanks reviews today feel like idk ...
16330      sadness   still feeling pretty lousy allergy induced st...
69346      sadness   secretary called fran landed dublin whim much...
8914       sadness    Skewp    We miss babies together  haha  I m ...
10096  not sadness        Really wish I could gone  bikeradar weekend
...            ...                                                ...
32586      sadness   ive feeling bit remorseful decision kicking c...
59538      sadness   i m still full buffet palms  stomach actually...
79994      sadness                                   feel fake lashes
14962      sadness                         aw lose guy ten days  lt  
47525  not sadness   grappled guilt relatives friends usually comm...

[4000 rows x 2 columns]


In [18]:
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report


clf = svm.SVC(kernel='linear')

In [7]:
happyVsRest = pd.read_csv('combined_data.csv')

happyVsRest['sentiment'] = np.where((happyVsRest.sentiment != 'happy'),'0', happyVsRest.sentiment)
happyVsRest['sentiment'] = np.where((happyVsRest.sentiment == 'happy'),'1', happyVsRest.sentiment)

happyVsRest.head()

Unnamed: 0,sentiment,Text
0,0,@tiffanylue i know i was listenin to bad habi...
1,0,Layin n bed with a headache ughhhh...waitin o...
2,0,Funeral ceremony...gloomy friday...
3,1,wants to hang out with friends SOON!
4,0,@dannycastillo We want to trade with someone w...


In [13]:
#TRAIN UNBALANCED, TEST UNBALANCED SCENARIOS ARE OUTLINED BELOW
import time
#Non Happy vs Happy df
hnh_train, hnh_test = train_test_split(happyVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(hnh_train['Text'])
test_vectors = vectorizer.transform(hnh_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear', probability = True)
t0 = time.time()
classifier_linear.fit(train_vectors, hnh_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(hnh_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['0'])
print('happy: ', report['1'])


results = classifier_linear.predict_proba(test_vectors)
classes = classifier_linear.classes_

# results_ordered_by_probability = map(lambda x: x[0], sorted(zip(classifier_linear.classes_, results), key=lambda x: x[1], reverse=True))

print("Probability estimates")
# print(results)
for class_name, result in zip(classes, results):
    print(f"{class_name}: {result}")

Training time: 1906.666283s; Prediction time: 36.344294s
not happy:  {'precision': 0.7950749829584185, 'recall': 0.9780922431865828, 'f1-score': 0.8771385598796766, 'support': 9540}
happy:  {'precision': 0.6241007194244604, 'recall': 0.12609011627906977, 'f1-score': 0.2097944377267231, 'support': 2752}
Probability estimates
0: [0.82846148 0.17153852]
1: [0.80542524 0.19457476]


In [17]:
results2 = results[1]
for class_name, result in zip(classes, results2):
    print(f"{class_name}: {result}")

0: 0.8054252356817724
1: 0.19457476431822754


In [15]:
fearVsRest = pd.read_csv('combined_data.csv')

fearVsRest['sentiment'] = np.where((fearVsRest.sentiment != 'fear'),'not fear', fearVsRest.sentiment)

fearVsRest.head()

Unnamed: 0,sentiment,Text
0,not fear,@tiffanylue i know i was listenin to bad habi...
1,not fear,Layin n bed with a headache ughhhh...waitin o...
2,not fear,Funeral ceremony...gloomy friday...
3,not fear,wants to hang out with friends SOON!
4,not fear,@dannycastillo We want to trade with someone w...


In [16]:
#Non Fear vs Fear df
fnf_train, fnf_test = train_test_split(fearVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(fnf_train['Text'])
test_vectors = vectorizer.transform(fnf_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, fnf_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fnf_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 502.315996s; Prediction time: 34.620466s
not fear:  {'precision': 0.7567336468576314, 'recall': 0.9809354910219463, 'f1-score': 0.8543708065839647, 'support': 9022}
fear:  {'precision': 0.711892797319933, 'recall': 0.12996941896024464, 'f1-score': 0.21980863718644944, 'support': 3270}


In [19]:
sadVsRest = pd.read_csv('combined_data.csv')

sadVsRest['sadness'] = np.where((sadVsRest.sentiment != 'sadness'),'not sadness', sadVsRest.sentiment)

sadVsRest.head()

#Non Sadness vs Sadness df
sns_train, sns_test = train_test_split(sadVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(sns_train['Text'])
test_vectors = vectorizer.transform(sns_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, sns_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sns_test['sentiment'], prediction_linear, output_dict=True)

print('sadness: ', report['sadness'])

Training time: 406.767569s; Prediction time: 61.910850s
sadness:  {'precision': 0.4468814256339959, 'recall': 0.33487416538264, 'f1-score': 0.3828537874339401, 'support': 1947}


In [21]:
angerVsRest = pd.read_csv('combined_data.csv')

angerVsRest['anger'] = np.where((angerVsRest.sentiment != 'anger'),'not anger', angerVsRest.sentiment)

angerVsRest.head()

ana_train, ana_test = train_test_split(angerVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(ana_train['Text'])
test_vectors = vectorizer.transform(ana_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, ana_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(ana_test['sentiment'], prediction_linear, output_dict=True)

print('anger: ', report['anger'])

Training time: 439.986216s; Prediction time: 69.803237s
anger:  {'precision': 0.48905109489051096, 'recall': 0.2554002541296061, 'f1-score': 0.33555926544240405, 'support': 787}
