In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('combined_data.csv')
df.head()
df['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stopWords = stopwords.words("english")

def cleanText(text):
    
    output = ""
    
    
    for word in text.split():
        cleanWord = re.sub("[^a-zA-Z]",  # Search for all non-letters
                          " ",          # Replace all non-letters with spaces
                          str(word))
        if cleanWord in stopWords:
            pass
        else:
            output = output + " " + cleanWord
    return output

In [4]:
df.Text = df.Text.apply(lambda x: cleanText(x))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

In [6]:
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [7]:
#Creating a balanced dataset with 10000 records for each emotion
balancedDf = stratify(df, 10000)
balancedDf['sentiment'].value_counts()
balancedDf.Text = balancedDf.Text.apply(lambda x: cleanText(x))

In [8]:
happy10k = balancedDf[balancedDf['sentiment'] == 'happy']
nonHappyDf = balancedDf[balancedDf['sentiment'] != 'happy']
nonHappy10k = nonHappyDf.sample(n=10000)
print(nonHappy10k.shape)
print(nonHappy10k['sentiment'].value_counts())

(10000, 2)
relief      1477
fear        1457
neutral     1432
love        1426
surprise    1412
anger       1410
sadness     1386
Name: sentiment, dtype: int64


In [9]:
balancedHappyVsRest = pd.concat([happy10k, nonHappy10k], axis=0)

balancedHappyVsRest['sentiment'] = np.where((balancedHappyVsRest.sentiment != 'happy'),'not happy', balancedHappyVsRest.sentiment)
print(balancedHappyVsRest['sentiment'].value_counts())

happy        10000
not happy    10000
Name: sentiment, dtype: int64


In [10]:
import time

#Balanced Happy Vs Rest
bhr_train, bhr_test = train_test_split(balancedHappyVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bhr_train['Text'])
test_vectors = vectorizer.transform(bhr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bhr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bhr_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 26.639510s; Prediction time: 4.784918s
not happy:  {'precision': 0.6888128969223254, 'recall': 0.7117617364967188, 'f1-score': 0.7000993048659384, 'support': 1981}
happy:  {'precision': 0.7076292882744496, 'recall': 0.6844972758791481, 'f1-score': 0.6958710976837865, 'support': 2019}


In [11]:
fear10k = balancedDf[balancedDf['sentiment'] == 'fear']
nonFearDf = balancedDf[balancedDf['sentiment'] != 'fear']
nonFear10k = nonFearDf.sample(n=10000)
print(nonFear10k.shape)
print(nonFear10k['sentiment'].value_counts())
print(fear10k['sentiment'].value_counts())

balancedFearVsRest = pd.concat([fear10k, nonFear10k], axis=0)

balancedFearVsRest['sentiment'] = np.where((balancedFearVsRest.sentiment != 'fear'),'not fear', balancedFearVsRest.sentiment)
print(balancedFearVsRest['sentiment'].value_counts())

(10000, 2)
surprise    1497
sadness     1475
neutral     1433
anger       1417
love        1413
relief      1406
happy       1359
Name: sentiment, dtype: int64
fear    10000
Name: sentiment, dtype: int64
not fear    10000
fear        10000
Name: sentiment, dtype: int64


In [12]:
#Balanced Fear Vs Rest
bfr_train, bfr_test = train_test_split(balancedFearVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bfr_train['Text'])
test_vectors = vectorizer.transform(bfr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bfr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bfr_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 26.050689s; Prediction time: 4.339577s
not fear:  {'precision': 0.66008316008316, 'recall': 0.6410903584048461, 'f1-score': 0.6504481434058899, 'support': 1981}
fear:  {'precision': 0.6575144508670521, 'recall': 0.6760772659732541, 'f1-score': 0.6666666666666667, 'support': 2019}


In [13]:
sadness10k = balancedDf[balancedDf['sentiment'] == 'sadness']
nonSadnessDf = balancedDf[balancedDf['sentiment'] != 'sadness']
nonSadness10k = nonSadnessDf.sample(n=10000)
print(nonSadness10k.shape)
print(nonSadness10k['sentiment'].value_counts())
print(sadness10k['sentiment'].value_counts())

balancedSadVsRest = pd.concat([sadness10k, nonSadness10k], axis=0)

balancedSadVsRest['sentiment'] = np.where((balancedSadVsRest.sentiment != 'sadness'),'not sadness', balancedSadVsRest.sentiment)
print(balancedSadVsRest['sentiment'].value_counts())

(10000, 2)
love        1477
neutral     1459
fear        1444
relief      1416
happy       1415
anger       1406
surprise    1383
Name: sentiment, dtype: int64
sadness    10000
Name: sentiment, dtype: int64
not sadness    10000
sadness        10000
Name: sentiment, dtype: int64


In [14]:
#Balanced Sad Vs Rest
bsr_train, bsr_test = train_test_split(balancedSadVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bsr_train['Text'])
test_vectors = vectorizer.transform(bsr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bsr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bsr_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 24.625421s; Prediction time: 3.983390s
not sadness:  {'precision': 0.7275828460038987, 'recall': 0.7536597677940434, 'f1-score': 0.7403917679147037, 'support': 1981}
sadness:  {'precision': 0.7494866529774127, 'recall': 0.7231302625061912, 'f1-score': 0.7360725989412653, 'support': 2019}


In [15]:
#Train balanced, test unbalanced for Sad vs rest
sadVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
sadVsRestUnbalancedDf.Text = sadVsRestUnbalancedDf.Text.apply(lambda x: cleanText(x))
sadVsRestUnbalancedDf['sentiment'] = np.where((sadVsRestUnbalancedDf.sentiment != 'sadness'),'not sadness', sadVsRestUnbalancedDf.sentiment)
sadU_train, sadU_test = train_test_split(sadVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bsr_train['Text'])
test_vectors = vectorizer.transform(sadU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bsr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadU_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 25.704707s; Prediction time: 12.470123s
not sadness:  {'precision': 0.9301488833746898, 'recall': 0.724697921701305, 'f1-score': 0.8146699266503667, 'support': 10345}
sadness:  {'precision': 0.3270321361058601, 'recall': 0.7108371854134566, 'f1-score': 0.44796892701084323, 'support': 1947}


In [16]:
#Train balanced, test unbalanced for Happy vs rest
happyVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
happyVsRestUnbalancedDf.Text = happyVsRestUnbalancedDf.Text.apply(lambda x: cleanText(x))
happyVsRestUnbalancedDf['sentiment'] = np.where((happyVsRestUnbalancedDf.sentiment != 'happy'),'not happy', happyVsRestUnbalancedDf.sentiment)
happyU_train, happyU_test = train_test_split(happyVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bhr_train['Text'])
test_vectors = vectorizer.transform(happyU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bhr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyU_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 26.567516s; Prediction time: 13.091756s
not happy:  {'precision': 0.8769433465085639, 'recall': 0.6976939203354298, 'f1-score': 0.7771161704611791, 'support': 9540}
happy:  {'precision': 0.38664398128455973, 'recall': 0.6606104651162791, 'f1-score': 0.48779178964314457, 'support': 2752}


In [17]:
#Train balanced, test unbalanced for Fear vs rest
fearVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
fearVsRestUnbalancedDf.Text = fearVsRestUnbalancedDf.Text.apply(lambda x: cleanText(x))

fearVsRestUnbalancedDf['sentiment'] = np.where((fearVsRestUnbalancedDf.sentiment != 'fear'),'not fear', fearVsRestUnbalancedDf.sentiment)
fearU_train, fearU_test = train_test_split(fearVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bfr_train['Text'])
test_vectors = vectorizer.transform(fearU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bfr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearU_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])




Training time: 25.871855s; Prediction time: 18.864490s
not fear:  {'precision': 0.8371115448130911, 'recall': 0.6180447794280647, 'f1-score': 0.7110884397117898, 'support': 9022}
fear:  {'precision': 0.38803054519623514, 'recall': 0.6681957186544343, 'f1-score': 0.4909560723514212, 'support': 3270}


In [18]:
print(fearU_test)

      sentiment                                               Text
43114      fear   guess since im feeling bit less shitty random...
26628      fear   Bank holiday   rain  Superb  But great excuse...
44118  not fear   feel whack messed know psychologically fucked...
14300      fear   Sad Gmail chat died  I can t help  natalidelc...
44596  not fear              feel bitter lot wish mother son place
...         ...                                                ...
29972      fear   Hehe nah watching Ace Cakes  jlsegarra   legg...
21203  not fear   Happy Birthday tooo meeee    Happy Birthday t...
24112  not fear    Sazchik I read somewhere restoring name  hop...
49774  not fear   feel little like traitor beloved oppies said ...
38586  not fear                                   ruthieor THANKS 

[12292 rows x 2 columns]


In [19]:
#Train unbalanced, test balanced for Sad vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(sadU_train['Text'])
test_vectors = vectorizer.transform(bsr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, sadU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bsr_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 367.708798s; Prediction time: 7.451621s
not sadness:  {'precision': 0.5211155378486055, 'recall': 0.9904088844018173, 'f1-score': 0.6829098503306648, 'support': 1981}
sadness:  {'precision': 0.9191489361702128, 'recall': 0.10698365527488855, 'f1-score': 0.191659272404614, 'support': 2019}


In [20]:
#Train unbalanced, test balanced for Happy vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(happyU_train['Text'])
test_vectors = vectorizer.transform(bhr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, happyU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bhr_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 334.423110s; Prediction time: 9.299474s
not happy:  {'precision': 0.5423162583518931, 'recall': 0.98334174659263, 'f1-score': 0.6990848734972188, 'support': 1981}
happy:  {'precision': 0.9191176470588235, 'recall': 0.18573551263001487, 'f1-score': 0.3090234857849196, 'support': 2019}


In [21]:
#Train unbalanced, test balanced for Fear vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(fearU_train['Text'])
test_vectors = vectorizer.transform(bfr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, fearU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bfr_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 535.196153s; Prediction time: 12.024399s
not fear:  {'precision': 0.5323819978046103, 'recall': 0.9793033821302373, 'f1-score': 0.6897777777777778, 'support': 1981}
fear:  {'precision': 0.8848314606741573, 'recall': 0.15601783060921248, 'f1-score': 0.2652631578947368, 'support': 2019}


In [22]:
#Testing on real world data

dfReal = pd.read_csv('realWorldEmotions.csv')
dfReal.Text = dfReal.Text.apply(lambda x: cleanText(x))
dfReal.head()
dfReal['Sentiment'].value_counts()

joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: Sentiment, dtype: int64

In [27]:
#Checking performance of fear vs. rest on real world data?
dfRealFear = pd.read_csv('realWorldEmotions.csv')
dfRealFear.Text = dfRealFear.Text.apply(lambda x: cleanText(x))

dfRealFear['Sentiment'] = np.where((dfRealFear.Sentiment != 'fear'),'not fear', dfRealFear.Sentiment)

dfRealFear_train, dfRealFear_test = train_test_split(dfRealFear, test_size=0.2, random_state=42)

test_vectors_real = vectorizer.transform(dfRealFear_test['Text'])
prediction_linear_fear = classifier_linear.predict(test_vectors_real)
report = classification_report(dfRealFear_test['Sentiment'], prediction_linear_fear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])



not fear:  {'precision': 0.9886039886039886, 'recall': 0.9719887955182073, 'f1-score': 0.9802259887005651, 'support': 357}
fear:  {'precision': 0.7959183673469388, 'recall': 0.9069767441860465, 'f1-score': 0.8478260869565216, 'support': 43}
