In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [44]:
df = pd.read_csv('combined_data.csv')
df.head()
df['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [45]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stopWords = stopwords.words("english")

def cleanText(text):
    
    output = ""
    
    
    for word in text.split():
        cleanWord = re.sub("[^a-zA-Z]",  # Search for all non-letters
                          " ",          # Replace all non-letters with spaces
                          str(word))
        if cleanWord in stopWords:
            pass
        else:
            output = output + " " + cleanWord
    return output

In [46]:
df.Text = df.Text.apply(lambda x: cleanText(x))

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

In [48]:
def stratify(data, N):
    rows = []
    fear = data[data['sentiment'] == 'fear']
    happy = data[data['sentiment'] == 'happy']
    sad = data[data['sentiment'] == 'sadness']
    neutral = data[data['sentiment'] == 'neutral']
    love = data[data['sentiment'] == 'love']
    anger = data[data['sentiment'] == 'anger']
    surprise = data[data['sentiment'] == 'surprise']
    relief = data[data['sentiment'] == 'relief']
    
    for i in range(N):
        #print(fear.loc[np.random.choice(fear.index)])
        rows.append(fear.loc[np.random.choice(fear.index)])
        rows.append(happy.loc[np.random.choice(happy.index)])
        rows.append(sad.loc[np.random.choice(sad.index)])
        rows.append(neutral.loc[np.random.choice(neutral.index)])
        rows.append(love.loc[np.random.choice(love.index)])
        rows.append(anger.loc[np.random.choice(anger.index)])
        rows.append(surprise.loc[np.random.choice(surprise.index)])
        rows.append(relief.loc[np.random.choice(relief.index)])
    sentiments = [x['sentiment'] for x in rows]
    texts = [x['Text'] for x in rows]
    d = {'sentiment': sentiments, 'Text': texts}
    return pd.DataFrame(d)

In [49]:
#Creating a balanced dataset with 10000 records for each emotion
balancedDf = stratify(df, 10000)
balancedDf['sentiment'].value_counts()

fear        10000
relief      10000
anger       10000
happy       10000
neutral     10000
sadness     10000
surprise    10000
love        10000
Name: sentiment, dtype: int64

In [50]:
happy10k = balancedDf[balancedDf['sentiment'] == 'happy']
nonHappyDf = balancedDf[balancedDf['sentiment'] != 'happy']
nonHappy10k = nonHappyDf.sample(n=10000)
print(nonHappy10k.shape)
print(nonHappy10k['sentiment'].value_counts())

(10000, 2)
sadness     1489
relief      1452
love        1440
fear        1423
anger       1421
surprise    1420
neutral     1355
Name: sentiment, dtype: int64


In [51]:
balancedHappyVsRest = pd.concat([happy10k, nonHappy10k], axis=0)

balancedHappyVsRest['sentiment'] = np.where((balancedHappyVsRest.sentiment != 'happy'),'not happy', balancedHappyVsRest.sentiment)
print(balancedHappyVsRest['sentiment'].value_counts())

not happy    10000
happy        10000
Name: sentiment, dtype: int64


In [52]:
import time

#Balanced Happy Vs Rest
bhr_train, bhr_test = train_test_split(balancedHappyVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bhr_train['Text'])
test_vectors = vectorizer.transform(bhr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bhr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bhr_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 14.645315s; Prediction time: 2.371131s
not happy:  {'precision': 0.6789940828402367, 'recall': 0.6951034830893488, 'f1-score': 0.6869543527064106, 'support': 1981}
happy:  {'precision': 0.6937119675456389, 'recall': 0.6775631500742942, 'f1-score': 0.6855424705587572, 'support': 2019}


In [53]:
fear10k = balancedDf[balancedDf['sentiment'] == 'fear']
nonFearDf = balancedDf[balancedDf['sentiment'] != 'fear']
nonFear10k = nonFearDf.sample(n=10000)
print(nonFear10k.shape)
print(nonFear10k['sentiment'].value_counts())
print(fear10k['sentiment'].value_counts())

balancedFearVsRest = pd.concat([fear10k, nonFear10k], axis=0)

balancedFearVsRest['sentiment'] = np.where((balancedFearVsRest.sentiment != 'fear'),'not fear', balancedFearVsRest.sentiment)
print(balancedFearVsRest['sentiment'].value_counts())

(10000, 2)
sadness     1493
surprise    1458
love        1454
neutral     1441
anger       1436
happy       1386
relief      1332
Name: sentiment, dtype: int64
fear    10000
Name: sentiment, dtype: int64
fear        10000
not fear    10000
Name: sentiment, dtype: int64


In [54]:
#Balanced Fear Vs Rest
bfr_train, bfr_test = train_test_split(balancedFearVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bfr_train['Text'])
test_vectors = vectorizer.transform(bfr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bfr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bfr_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 15.614387s; Prediction time: 2.689634s
not fear:  {'precision': 0.6539245667686034, 'recall': 0.6476527006562343, 'f1-score': 0.6507735226984529, 'support': 1981}
fear:  {'precision': 0.6575073601570167, 'recall': 0.6636948984645864, 'f1-score': 0.660586640374661, 'support': 2019}


In [55]:
sadness10k = balancedDf[balancedDf['sentiment'] == 'sadness']
nonSadnessDf = balancedDf[balancedDf['sentiment'] != 'sadness']
nonSadness10k = nonSadnessDf.sample(n=10000)
print(nonSadness10k.shape)
print(nonSadness10k['sentiment'].value_counts())
print(sadness10k['sentiment'].value_counts())

balancedSadVsRest = pd.concat([sadness10k, nonSadness10k], axis=0)

balancedSadVsRest['sentiment'] = np.where((balancedSadVsRest.sentiment != 'sadness'),'not sadness', balancedSadVsRest.sentiment)
print(balancedSadVsRest['sentiment'].value_counts())

(10000, 2)
relief      1466
anger       1453
neutral     1453
happy       1441
surprise    1411
fear        1400
love        1376
Name: sentiment, dtype: int64
sadness    10000
Name: sentiment, dtype: int64
not sadness    10000
sadness        10000
Name: sentiment, dtype: int64


In [56]:
#Balanced Sad Vs Rest
bsr_train, bsr_test = train_test_split(balancedSadVsRest, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bsr_train['Text'])
test_vectors = vectorizer.transform(bsr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bsr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bsr_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 14.189356s; Prediction time: 2.277876s
not sadness:  {'precision': 0.7353664535169699, 'recall': 0.7546693589096416, 'f1-score': 0.744892874937718, 'support': 1981}
sadness:  {'precision': 0.7529232333502797, 'recall': 0.733531451213472, 'f1-score': 0.743100852985449, 'support': 2019}


In [61]:
#Train balanced, test unbalanced for Sad vs rest
sadVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
sadVsRestUnbalancedDf['sentiment'] = np.where((sadVsRestUnbalancedDf.sentiment != 'sadness'),'not sadness', sadVsRestUnbalancedDf.sentiment)
sadU_train, sadU_test = train_test_split(sadVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bsr_train['Text'])
test_vectors = vectorizer.transform(sadU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bsr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadU_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 13.909613s; Prediction time: 8.757547s
not sadness:  {'precision': 0.9246455097906819, 'recall': 0.6618656355727405, 'f1-score': 0.7714929577464789, 'support': 10345}
sadness:  {'precision': 0.2842234499693063, 'recall': 0.7134052388289677, 'f1-score': 0.4064969271290606, 'support': 1947}


In [62]:
#Train balanced, test unbalanced for Happy vs rest
happyVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
happyVsRestUnbalancedDf['sentiment'] = np.where((happyVsRestUnbalancedDf.sentiment != 'happy'),'not happy', happyVsRestUnbalancedDf.sentiment)
happyU_train, happyU_test = train_test_split(happyVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bhr_train['Text'])
test_vectors = vectorizer.transform(happyU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bhr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyU_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 14.661261s; Prediction time: 9.324991s
not happy:  {'precision': 0.8438431805632247, 'recall': 0.8009433962264151, 'f1-score': 0.8218338262973917, 'support': 9540}
happy:  {'precision': 0.41334569045412417, 'recall': 0.48619186046511625, 'f1-score': 0.4468191684755385, 'support': 2752}


In [63]:
#Train balanced, test unbalanced for Fear vs rest
fearVsRestUnbalancedDf = pd.read_csv('combined_data.csv')
fearVsRestUnbalancedDf['sentiment'] = np.where((fearVsRestUnbalancedDf.sentiment != 'fear'),'not fear', fearVsRestUnbalancedDf.sentiment)
fearU_train, fearU_test = train_test_split(fearVsRestUnbalancedDf, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(bfr_train['Text'])
test_vectors = vectorizer.transform(fearU_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, bfr_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearU_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 17.898834s; Prediction time: 11.710220s
not fear:  {'precision': 0.8018028846153846, 'recall': 0.7394147639104411, 'f1-score': 0.7693460961826779, 'support': 9022}
fear:  {'precision': 0.4081067472306143, 'recall': 0.4957186544342508, 'f1-score': 0.44766639049986195, 'support': 3270}


In [64]:
#Train unbalanced, test balanced for Sad vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(sadU_train['Text'])
test_vectors = vectorizer.transform(bsr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, sadU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bsr_test['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])

Training time: 333.817491s; Prediction time: 6.473613s
not sadness:  {'precision': 0.5249399839957322, 'recall': 0.9934376577486118, 'f1-score': 0.6869109947643979, 'support': 1981}
sadness:  {'precision': 0.9482071713147411, 'recall': 0.1178801386825161, 'f1-score': 0.20969162995594712, 'support': 2019}


In [65]:
#Train unbalanced, test balanced for Happy vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(happyU_train['Text'])
test_vectors = vectorizer.transform(bhr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, happyU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bhr_test['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 431.555881s; Prediction time: 9.125947s
not happy:  {'precision': 0.5425383542538355, 'recall': 0.9818273599192328, 'f1-score': 0.6988860941430112, 'support': 1981}
happy:  {'precision': 0.9132530120481928, 'recall': 0.18771669143140168, 'f1-score': 0.3114215283483977, 'support': 2019}


In [66]:
#Train unbalanced, test balanced for Fear vs rest

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(fearU_train['Text'])
test_vectors = vectorizer.transform(bfr_test['Text'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, fearU_train['sentiment'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(bfr_test['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 604.992728s; Prediction time: 10.971755s
not fear:  {'precision': 0.5326797385620915, 'recall': 0.9873801110550228, 'f1-score': 0.6920219352556165, 'support': 1981}
fear:  {'precision': 0.9237804878048781, 'recall': 0.150074294205052, 'f1-score': 0.2582019599488709, 'support': 2019}
