In [28]:
#Importing necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import re
import snowballstemmer
import time

In [13]:
# Load the combined emotion dataset and show how many rows carry each sentiment label.
df = pd.read_csv('combined_data.csv')
# Only the last expression of a cell is rendered, so the bare `df.head()` that
# used to sit here produced no output; it has been removed as dead code.
df['sentiment'].value_counts()

fear        16241
happy       13508
sadness      9796
neutral      8960
love         4720
anger        4069
surprise     2639
relief       1526
Name: sentiment, dtype: int64

In [14]:
# Clean the raw text: strip @mentions, URLs and every character that is not an
# ASCII letter, space or tab. Stemming is applied further down in this cell.
# NOTE(review): nothing in this cell removes stop words, although the original
# comment said so — add that step explicitly if it is wanted.
# Incomplete rows are dropped *before* the regex runs, because re.sub raises
# TypeError when it is handed NaN instead of a string.
df.dropna(inplace=True)
# Raw string so that \w, \/ and \S reach the regex engine as written instead of
# being parsed as (invalid) Python string escapes.
df['Text'] = df['Text'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))


# English Snowball stemmer used by replace() below.
ss = snowballstemmer.stemmer('english')
def replace(x):
    """Return `x` with every whitespace-separated word replaced by its stem.

    Each stem is followed by a single space, so a non-empty result always ends
    with a trailing space; input without any words yields ''.
    Stemming is delegated to the module-level stemmer `ss`.
    """
    stems = (ss.stemWord(token) for token in x.split())
    return ''.join(stem + " " for stem in stems)
# Stem every cleaned text with replace(), then preview the first rows.
df['Text'] = df['Text'].apply(replace)
df.head()

Unnamed: 0,sentiment,Text
0,sadness,i know i was listenin to bad habit earlier and...
1,sadness,Layin n bed with a headach ughhhhwaitin on you...
2,sadness,Funer ceremonygloomi friday
3,happy,want to hang out with friend SOON
4,neutral,We want to trade with someon who has Houston t...


In [41]:
# Load the real-world evaluation set and give it the same cleaning and stemming as df.
realWorldTest = pd.read_csv('realWorldEmotions.csv')
# Drop incomplete rows before the regex runs: re.sub raises TypeError on NaN.
realWorldTest.dropna(inplace=True)
# Raw string so that \w, \/ and \S reach the regex engine as written instead of
# being parsed as (invalid) Python string escapes.
realWorldTest['Text'] = realWorldTest['Text'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)",'',x))

realWorldTest['Text'] = realWorldTest['Text'].apply(lambda x: replace(x))
# This file calls the emotion 'joy'; rename it to 'happy', the label used in df.
realWorldTest['Sentiment'] = np.where((realWorldTest.Sentiment == 'joy'),'happy', realWorldTest.Sentiment)
realWorldTest['Sentiment'].value_counts()

happy       695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: Sentiment, dtype: int64

In [23]:
def stratify(data, N, OVR, random_state=None):
    """Build a class-balanced one-vs-rest sample by drawing rows with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Text' column and a 'sentiment' column that has already
        been collapsed to the two labels of the chosen task
        (e.g. 'happy' / 'not happy').
    N : int
        Number of rows to draw from EACH of the two classes; the result has
        2 * N rows. Values <= 0 give an empty frame.
    OVR : str
        One of 'Happy vs Rest', 'Fear vs Rest' or 'Sadness vs Rest'.
    random_state : int, optional
        Seed for a private RandomState, making the sample reproducible. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'sentiment' and 'Text', a fresh 0..2N-1 index, and rows that
        alternate rest-class, target-class.

    Raises
    ------
    ValueError
        If `OVR` is not a supported task, or if one of the two classes has no
        rows in `data` while N > 0.
    """
    # (rest label, target label) per task. The three former branches were
    # copies of each other that differed only in these two strings.
    label_pairs = {
        'Happy vs Rest': ('not happy', 'happy'),
        'Fear vs Rest': ('not fear', 'fear'),
        'Sadness vs Rest': ('not sadness', 'sadness'),
    }
    if OVR not in label_pairs:
        # Previously an unknown task fell through and returned None, which only
        # surfaced later as an unrelated error at the caller.
        raise ValueError(
            "Unknown OVR %r; expected one of %s" % (OVR, sorted(label_pairs))
        )
    rest_label, target_label = label_pairs[OVR]

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_draws = max(N, 0)

    def draw(label, first_slot):
        """Sample n_draws rows of `label`, indexed first_slot, first_slot + 2, ..."""
        pool = data.loc[data['sentiment'] == label, ['sentiment', 'Text']]
        if n_draws and pool.empty:
            raise ValueError("No rows with sentiment %r to sample from" % label)
        # One vectorised positional draw instead of N separate .loc lookups.
        picks = rng.randint(0, len(pool), size=n_draws) if n_draws else []
        sample = pool.iloc[picks]
        sample.index = range(first_slot, 2 * n_draws, 2)
        return sample

    # Even slots hold the rest class, odd slots the target class; sorting on
    # that slot number interleaves the two samples.
    balanced = pd.concat([draw(rest_label, 0), draw(target_label, 1)])
    return balanced.sort_index().reset_index(drop=True)

## Analyzing Happy Vs Rest (all scenarios) below 

In [26]:
#Create train and test data for the happy vs rest analysis
happyVsRest = df.copy()

# Collapse every label other than 'happy' into a single 'not happy' class.
happyVsRest['sentiment'] = np.where((happyVsRest.sentiment != 'happy'),'not happy', happyVsRest.sentiment)

# Seeded random 90/10 split; no stratification, so class proportions are left
# as they occur in the data (the "unbalanced" variants).
happyVsRest_train_unbalanced, happyVsRest_test_unbalanced = train_test_split(happyVsRest, test_size=0.1, random_state=1)


# stratify() samples through NumPy's global random state. Seed it here so the
# balanced frames are identical on every run, just like the seeded split above.
np.random.seed(1)
# Balanced variants: 10000 rows per class, drawn with replacement by stratify().
happyVsRest_train_balanced = stratify(happyVsRest_train_unbalanced, 10000, 'Happy vs Rest')
happyVsRest_test_balanced = stratify(happyVsRest_test_unbalanced, 10000, 'Happy vs Rest')


not happy    10000
happy        10000
Name: sentiment, dtype: int64

##### Balanced Train, Balanced Test for Happy vs Rest shown below

In [34]:
#Happy Vs Rest (balanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
hvr_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = hvr_balanced_vectorizer.fit_transform(happyVsRest_train_balanced['Text'])
test_vectors = hvr_balanced_vectorizer.transform(happyVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
bhvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
bhvr_classifier_linear.fit(train_vectors, happyVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = bhvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# Binary copy of the real-world set: 'joy' is mapped to 'happy' (a no-op when
# the load cell has already done so), then every other label to 'not happy'.
dfRealHappy = realWorldTest.copy()
dfRealHappy['Sentiment'] = np.where((dfRealHappy.Sentiment == 'joy'),'happy', dfRealHappy.Sentiment)
dfRealHappy['Sentiment'] = np.where((dfRealHappy.Sentiment != 'happy'),'not happy', dfRealHappy.Sentiment)


# Score the already-fitted vectorizer/classifier on the real-world text.
test_vectors_bhvr_real = hvr_balanced_vectorizer.transform(dfRealHappy['Text'])
prediction_linear_bhvr_real = bhvr_classifier_linear.predict(test_vectors_bhvr_real)
report = classification_report(dfRealHappy['Sentiment'], prediction_linear_bhvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])



Training time: 49.970367s; Prediction time: 45.858910s
not happy:  {'precision': 0.6462878007677184, 'recall': 0.6903, 'f1-score': 0.667569266476476, 'support': 10000}
happy:  {'precision': 0.6676682047429982, 'recall': 0.6222, 'f1-score': 0.6441327190848387, 'support': 10000}

Performance on real world data shown below
not happy:  {'precision': 0.9068100358422939, 'recall': 0.7754789272030651, 'f1-score': 0.8360181743081371, 'support': 1305}
happy:  {'precision': 0.668552036199095, 'recall': 0.8503597122302158, 'f1-score': 0.7485750474984167, 'support': 695}


##### Balanced Train, Unbalanced Test for Happy vs Rest shown below

In [35]:
#Happy Vs Rest (balanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the balanced/balanced cell; only the held-out test frame differs.
hvr_bu_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = hvr_bu_vectorizer.fit_transform(happyVsRest_train_balanced['Text'])
test_vectors = hvr_bu_vectorizer.transform(happyVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
buhvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
buhvr_classifier_linear.fit(train_vectors, happyVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = buhvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW
# dfRealHappy is the binary real-world frame built in the balanced/balanced cell.
test_vectors_buhvr_real = hvr_bu_vectorizer.transform(dfRealHappy['Text'])
prediction_linear_buhvr_real = buhvr_classifier_linear.predict(test_vectors_buhvr_real)
report = classification_report(dfRealHappy['Sentiment'], prediction_linear_buhvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 47.136967s; Prediction time: 10.136422s
not happy:  {'precision': 0.8740198640878202, 'recall': 0.6856674184949764, 'f1-score': 0.7684706423072504, 'support': 4877}
happy:  {'precision': 0.3392241379310345, 'recall': 0.6201733648542159, 'f1-score': 0.43856227361382005, 'support': 1269}

Performance on real world data shown below
not happy:  {'precision': 0.9068100358422939, 'recall': 0.7754789272030651, 'f1-score': 0.8360181743081371, 'support': 1305}
happy:  {'precision': 0.668552036199095, 'recall': 0.8503597122302158, 'f1-score': 0.7485750474984167, 'support': 695}


##### Unbalanced Train, Balanced Test for Happy vs Rest shown below

In [36]:
#Happy Vs Rest (unbalanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
hvr_unbalanced_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = hvr_unbalanced_balanced_vectorizer.fit_transform(happyVsRest_train_unbalanced['Text'])
test_vectors = hvr_unbalanced_balanced_vectorizer.transform(happyVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
unbalanced_balanced_hvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
unbalanced_balanced_hvr_classifier_linear.fit(train_vectors, happyVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = unbalanced_balanced_hvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW
# dfRealHappy is the binary real-world frame built in the balanced/balanced cell.
test_vectors_unbalanced_balanced_hvr_real = hvr_unbalanced_balanced_vectorizer.transform(dfRealHappy['Text'])
prediction_linear_unbalanced_balanced_hvr_real = unbalanced_balanced_hvr_classifier_linear.predict(test_vectors_unbalanced_balanced_hvr_real)
report = classification_report(dfRealHappy['Sentiment'], prediction_linear_unbalanced_balanced_hvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 521.538046s; Prediction time: 72.239591s
not happy:  {'precision': 0.5225954117208708, 'recall': 0.9818, 'f1-score': 0.682113453989648, 'support': 10000}
happy:  {'precision': 0.8499587798845837, 'recall': 0.1031, 'f1-score': 0.18389369481851423, 'support': 10000}

Performance on real world data shown below
not happy:  {'precision': 0.7096069868995634, 'recall': 0.9961685823754789, 'f1-score': 0.8288173414089894, 'support': 1305}
happy:  {'precision': 0.9702380952380952, 'recall': 0.23453237410071942, 'f1-score': 0.37775202780996525, 'support': 695}


##### Unbalanced Train, Unbalanced Test for Happy vs Rest shown below

In [38]:
#Happy Vs Rest (unbalanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the unbalanced/balanced cell; only the held-out test frame differs.
hvr_unbalanced_unbalanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = hvr_unbalanced_unbalanced_vectorizer.fit_transform(happyVsRest_train_unbalanced['Text'])
test_vectors = hvr_unbalanced_unbalanced_vectorizer.transform(happyVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
unbalanced_unbalanced_hvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
unbalanced_unbalanced_hvr_classifier_linear.fit(train_vectors, happyVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = unbalanced_unbalanced_hvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(happyVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW
# dfRealHappy is the binary real-world frame built in the balanced/balanced cell.
test_vectors_unbalanced_unbalanced_hvr_real = hvr_unbalanced_unbalanced_vectorizer.transform(dfRealHappy['Text'])
prediction_linear_unbalanced_unbalanced_hvr_real = unbalanced_unbalanced_hvr_classifier_linear.predict(test_vectors_unbalanced_unbalanced_hvr_real)
report = classification_report(dfRealHappy['Sentiment'], prediction_linear_unbalanced_unbalanced_hvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not happy: ', report['not happy'])
print('happy: ', report['happy'])

Training time: 527.741826s; Prediction time: 20.152330s
not happy:  {'precision': 0.8086148648648649, 'recall': 0.9815460323969654, 'f1-score': 0.8867277947578031, 'support': 4877}
happy:  {'precision': 0.6017699115044248, 'recall': 0.10717100078802207, 'f1-score': 0.18193979933110366, 'support': 1269}

Performance on real world data shown below
not happy:  {'precision': 0.7096069868995634, 'recall': 0.9961685823754789, 'f1-score': 0.8288173414089894, 'support': 1305}
happy:  {'precision': 0.9702380952380952, 'recall': 0.23453237410071942, 'f1-score': 0.37775202780996525, 'support': 695}


## Analyzing Fear Vs Rest (all scenarios) below 

In [39]:
#Create train and test data (for fear vs rest analysis)
fearVsRest = df.copy()

# Collapse every label other than 'fear' into a single 'not fear' class.
fearVsRest['sentiment'] = np.where((fearVsRest.sentiment != 'fear'),'not fear', fearVsRest.sentiment)

# Seeded random 90/10 split; no stratification, so class proportions are left
# as they occur in the data (the "unbalanced" variants).
fearVsRest_train_unbalanced, fearVsRest_test_unbalanced = train_test_split(fearVsRest, test_size=0.1, random_state=1)


# stratify() samples through NumPy's global random state. Seed it here so the
# balanced frames are identical on every run, just like the seeded split above.
np.random.seed(1)
# Balanced variants: 10000 rows per class, drawn with replacement by stratify().
fearVsRest_train_balanced = stratify(fearVsRest_train_unbalanced, 10000, 'Fear vs Rest')
fearVsRest_test_balanced = stratify(fearVsRest_test_unbalanced, 10000, 'Fear vs Rest')


##### Balanced Train, Balanced Test for Fear vs Rest shown below

In [48]:
#Fear Vs Rest (balanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
fvr_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = fvr_balanced_vectorizer.fit_transform(fearVsRest_train_balanced['Text'])
test_vectors = fvr_balanced_vectorizer.transform(fearVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
bfvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
bfvr_classifier_linear.fit(train_vectors, fearVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = bfvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# Binary copy of the real-world set: every label other than 'fear' becomes 'not fear'.
dfRealFear = realWorldTest.copy()
dfRealFear['Sentiment'] = np.where((dfRealFear.Sentiment != 'fear'),'not fear', dfRealFear.Sentiment)


# Score the already-fitted vectorizer/classifier on the real-world text.
test_vectors_bfvr_real = fvr_balanced_vectorizer.transform(dfRealFear['Text'])
prediction_linear_bfvr_real = bfvr_classifier_linear.predict(test_vectors_bfvr_real)
report = classification_report(dfRealFear['Sentiment'], prediction_linear_bfvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])



Training time: 45.711342s; Prediction time: 34.902468s
not fear:  {'precision': 0.6020854021847071, 'recall': 0.6063, 'f1-score': 0.6041853512705531, 'support': 10000}
fear:  {'precision': 0.6035246727089627, 'recall': 0.5993, 'f1-score': 0.6014049172102358, 'support': 10000}

Performance on real world data shown below
not fear:  {'precision': 0.9935379644588045, 'recall': 0.34628378378378377, 'f1-score': 0.5135699373695197, 'support': 1776}
fear:  {'precision': 0.15930485155684287, 'recall': 0.9821428571428571, 'f1-score': 0.27414330218068533, 'support': 224}


##### Balanced Train, Unbalanced Test for Fear vs Rest shown below

In [49]:
#Fear Vs Rest (balanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the balanced/balanced cell; only the held-out test frame differs.
fvr_balanced_unbalanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = fvr_balanced_unbalanced_vectorizer.fit_transform(fearVsRest_train_balanced['Text'])
test_vectors = fvr_balanced_unbalanced_vectorizer.transform(fearVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
balanced_unbalanced_fvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
balanced_unbalanced_fvr_classifier_linear.fit(train_vectors, fearVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = balanced_unbalanced_fvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealFear is the binary real-world frame built in the balanced/balanced cell.
test_vectors_balanced_unbalanced_fvr_real = fvr_balanced_unbalanced_vectorizer.transform(dfRealFear['Text'])
prediction_linear_balanced_unbalanced_fvr_real = balanced_unbalanced_fvr_classifier_linear.predict(test_vectors_balanced_unbalanced_fvr_real)
report = classification_report(dfRealFear['Sentiment'], prediction_linear_balanced_unbalanced_fvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 46.023930s; Prediction time: 9.357774s
not fear:  {'precision': 0.7972893341190336, 'recall': 0.608363309352518, 'f1-score': 0.6901300688599847, 'support': 4448}
fear:  {'precision': 0.36700581395348836, 'recall': 0.5948174322732627, 'f1-score': 0.4539325842696629, 'support': 1698}

Performance on real world data shown below
not fear:  {'precision': 0.9935379644588045, 'recall': 0.34628378378378377, 'f1-score': 0.5135699373695197, 'support': 1776}
fear:  {'precision': 0.15930485155684287, 'recall': 0.9821428571428571, 'f1-score': 0.27414330218068533, 'support': 224}


##### Unbalanced Train, Balanced Test for Fear vs Rest shown below

In [50]:
#Fear Vs Rest (unbalanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
fvr_unbalanced_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = fvr_unbalanced_balanced_vectorizer.fit_transform(fearVsRest_train_unbalanced['Text'])
test_vectors = fvr_unbalanced_balanced_vectorizer.transform(fearVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
unbalanced_balanced_fvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
unbalanced_balanced_fvr_classifier_linear.fit(train_vectors, fearVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = unbalanced_balanced_fvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealFear is the binary real-world frame built in the balanced/balanced cell.
test_vectors_unbalanced_balanced_fvr_real = fvr_unbalanced_balanced_vectorizer.transform(dfRealFear['Text'])
prediction_linear_unbalanced_balanced_fvr_real = unbalanced_balanced_fvr_classifier_linear.predict(test_vectors_unbalanced_balanced_fvr_real)
report = classification_report(dfRealFear['Sentiment'], prediction_linear_unbalanced_balanced_fvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 813.404830s; Prediction time: 72.826474s
not fear:  {'precision': 0.5298511472943199, 'recall': 0.986, 'f1-score': 0.689293578943689, 'support': 10000}
fear:  {'precision': 0.899352983465133, 'recall': 0.1251, 'f1-score': 0.21964708980774292, 'support': 10000}

Performance on real world data shown below
not fear:  {'precision': 0.9730185497470489, 'recall': 0.9746621621621622, 'f1-score': 0.9738396624472573, 'support': 1776}
fear:  {'precision': 0.7963800904977375, 'recall': 0.7857142857142857, 'f1-score': 0.7910112359550562, 'support': 224}


##### Unbalanced Train, Unbalanced Test for Fear vs Rest shown below

In [51]:
#Fear Vs Rest (unbalanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the unbalanced/balanced cell; only the held-out test frame differs.
fvr_unbalanced_unbalanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = fvr_unbalanced_unbalanced_vectorizer.fit_transform(fearVsRest_train_unbalanced['Text'])
test_vectors = fvr_unbalanced_unbalanced_vectorizer.transform(fearVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
unbalanced_unbalanced_fvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
unbalanced_unbalanced_fvr_classifier_linear.fit(train_vectors, fearVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = unbalanced_unbalanced_fvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(fearVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealFear is the binary real-world frame built in the balanced/balanced cell.
test_vectors_unbalanced_unbalanced_fvr_real = fvr_unbalanced_unbalanced_vectorizer.transform(dfRealFear['Text'])
prediction_linear_unbalanced_unbalanced_fvr_real = unbalanced_unbalanced_fvr_classifier_linear.predict(test_vectors_unbalanced_unbalanced_fvr_real)
report = classification_report(dfRealFear['Sentiment'], prediction_linear_unbalanced_unbalanced_fvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not fear: ', report['not fear'])
print('fear: ', report['fear'])

Training time: 800.303915s; Prediction time: 21.587022s
not fear:  {'precision': 0.7459993190330269, 'recall': 0.9851618705035972, 'f1-score': 0.8490602596396047, 'support': 4448}
fear:  {'precision': 0.7573529411764706, 'recall': 0.12131919905771496, 'f1-score': 0.20913705583756345, 'support': 1698}

Performance on real world data shown below
not fear:  {'precision': 0.9730185497470489, 'recall': 0.9746621621621622, 'f1-score': 0.9738396624472573, 'support': 1776}
fear:  {'precision': 0.7963800904977375, 'recall': 0.7857142857142857, 'f1-score': 0.7910112359550562, 'support': 224}


## Analyzing Sadness Vs Rest (all scenarios) below 

In [52]:
#Create train and test data (for sadness vs rest analysis)
sadVsRest = df.copy()

# Collapse every label other than 'sadness' into a single 'not sadness' class.
sadVsRest['sentiment'] = np.where((sadVsRest.sentiment != 'sadness'),'not sadness', sadVsRest.sentiment)

# Seeded random 90/10 split; no stratification, so class proportions are left
# as they occur in the data (the "unbalanced" variants).
sadVsRest_train_unbalanced, sadVsRest_test_unbalanced = train_test_split(sadVsRest, test_size=0.1, random_state=1)


# stratify() samples through NumPy's global random state. Seed it here so the
# balanced frames are identical on every run, just like the seeded split above.
np.random.seed(1)
# Balanced variants: 10000 rows per class, drawn with replacement by stratify().
sadVsRest_train_balanced = stratify(sadVsRest_train_unbalanced, 10000, 'Sadness vs Rest')
sadVsRest_test_balanced = stratify(sadVsRest_test_unbalanced, 10000, 'Sadness vs Rest')


##### Balanced Train, Balanced Test for Sad vs Rest shown below

In [54]:
#Sad Vs Rest (balanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
svr_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = svr_balanced_vectorizer.fit_transform(sadVsRest_train_balanced['Text'])
test_vectors = svr_balanced_vectorizer.transform(sadVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
bsvr_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
bsvr_classifier_linear.fit(train_vectors, sadVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = bsvr_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# Binary copy of the real-world set: every label other than 'sadness' becomes 'not sadness'.
dfRealSadness = realWorldTest.copy()
dfRealSadness['Sentiment'] = np.where((dfRealSadness.Sentiment != 'sadness'),'not sadness', dfRealSadness.Sentiment)


# Score the already-fitted vectorizer/classifier on the real-world text.
test_vectors_bsvr_real = svr_balanced_vectorizer.transform(dfRealSadness['Text'])
prediction_linear_bsvr_real = bsvr_classifier_linear.predict(test_vectors_bsvr_real)
report = classification_report(dfRealSadness['Sentiment'], prediction_linear_bsvr_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


Training time: 50.934468s; Prediction time: 31.978887s
not sadness:  {'precision': 0.6491897932575899, 'recall': 0.6971, 'f1-score': 0.6722924100684733, 'support': 10000}
sadness:  {'precision': 0.6729648024184841, 'recall': 0.6233, 'f1-score': 0.6471809780915793, 'support': 10000}

Performance on real world data shown below
not sadness:  {'precision': 0.9380804953560371, 'recall': 0.854122621564482, 'f1-score': 0.8941350055330136, 'support': 1419}
sadness:  {'precision': 0.7076271186440678, 'recall': 0.8623063683304647, 'f1-score': 0.7773467804499612, 'support': 581}


##### Balanced Train, Unbalanced Test for Sad vs Rest shown below

In [55]:
#Sad Vs Rest (balanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the balanced/balanced cell; only the held-out test frame differs.
svr_balanced_unbalanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = svr_balanced_unbalanced_vectorizer.fit_transform(sadVsRest_train_balanced['Text'])
test_vectors = svr_balanced_unbalanced_vectorizer.transform(sadVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
svr_balanced_unbalanced_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
svr_balanced_unbalanced_classifier_linear.fit(train_vectors, sadVsRest_train_balanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = svr_balanced_unbalanced_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealSadness is the binary real-world frame built in the balanced/balanced cell.
test_vectors_svr_balanced_unbalanced_real = svr_balanced_unbalanced_vectorizer.transform(dfRealSadness['Text'])
prediction_linear_svr_balanced_unbalanced_real = svr_balanced_unbalanced_classifier_linear.predict(test_vectors_svr_balanced_unbalanced_real)
report = classification_report(dfRealSadness['Sentiment'], prediction_linear_svr_balanced_unbalanced_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


Training time: 46.948130s; Prediction time: 9.424777s
not sadness:  {'precision': 0.9078226208271052, 'recall': 0.7082604470359573, 'f1-score': 0.7957200567747572, 'support': 5145}
sadness:  {'precision': 0.2959662288930582, 'recall': 0.6303696303696303, 'f1-score': 0.4028088094478136, 'support': 1001}

Performance on real world data shown below
not sadness:  {'precision': 0.9380804953560371, 'recall': 0.854122621564482, 'f1-score': 0.8941350055330136, 'support': 1419}
sadness:  {'precision': 0.7076271186440678, 'recall': 0.8623063683304647, 'f1-score': 0.7773467804499612, 'support': 581}


##### Unbalanced Train, Balanced Test for Sad vs Rest shown below

In [56]:
#Sad Vs Rest (unbalanced train, balanced test)
# TF-IDF features; the vocabulary and IDF weights are fitted on the training
# text only and then reused to transform the test text.
svr_unbalanced_balanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = svr_unbalanced_balanced_vectorizer.fit_transform(sadVsRest_train_unbalanced['Text'])
test_vectors = svr_unbalanced_balanced_vectorizer.transform(sadVsRest_test_balanced['Text'])

# Perform classification with SVM, kernel=linear
svr_unbalanced_balanced_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
svr_unbalanced_balanced_classifier_linear.fit(train_vectors, sadVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = svr_unbalanced_balanced_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadVsRest_test_balanced['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealSadness is the binary real-world frame built in the balanced/balanced cell.
test_vectors_svr_unbalanced_balanced_real = svr_unbalanced_balanced_vectorizer.transform(dfRealSadness['Text'])
prediction_linear_svr_unbalanced_balanced_real = svr_unbalanced_balanced_classifier_linear.predict(test_vectors_svr_unbalanced_balanced_real)
report = classification_report(dfRealSadness['Sentiment'], prediction_linear_svr_unbalanced_balanced_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


Training time: 476.914578s; Prediction time: 49.155798s
not sadness:  {'precision': 0.5066659855953415, 'recall': 0.9919, 'f1-score': 0.6707238732799136, 'support': 10000}
sadness:  {'precision': 0.8085106382978723, 'recall': 0.0342, 'f1-score': 0.0656241005468675, 'support': 10000}

Performance on real world data shown below
not sadness:  {'precision': 0.7306900102986612, 'recall': 1.0, 'f1-score': 0.8443915501338888, 'support': 1419}
sadness:  {'precision': 1.0, 'recall': 0.09982788296041308, 'f1-score': 0.1815336463223787, 'support': 581}


##### Unbalanced Train, Unbalanced Test for Sad vs Rest shown below

In [57]:
#Sad Vs Rest (unbalanced train, unbalanced test)
# NOTE(review): vectorizer settings, classifier and training frame are the same
# as in the unbalanced/balanced cell; only the held-out test frame differs.
svr_unbalanced_unbalanced_vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
# Fit on the training text only; the test text is transformed with that fit.
train_vectors = svr_unbalanced_unbalanced_vectorizer.fit_transform(sadVsRest_train_unbalanced['Text'])
test_vectors = svr_unbalanced_unbalanced_vectorizer.transform(sadVsRest_test_unbalanced['Text'])

# Perform classification with SVM, kernel=linear
svr_unbalanced_unbalanced_classifier_linear = svm.SVC(kernel='linear')
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the wall clock and can jump when the clock is adjusted.
t0 = time.perf_counter()
svr_unbalanced_unbalanced_classifier_linear.fit(train_vectors, sadVsRest_train_unbalanced['sentiment'])
t1 = time.perf_counter()
prediction_linear = svr_unbalanced_unbalanced_classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(sadVsRest_test_unbalanced['sentiment'], prediction_linear, output_dict=True)
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


#REAL WORLD TEST PERFORMANCE SHOWN BELOW

# dfRealSadness is the binary real-world frame built in the balanced/balanced cell.
test_vectors_svr_unbalanced_unbalanced_real = svr_unbalanced_unbalanced_vectorizer.transform(dfRealSadness['Text'])
prediction_linear_svr_unbalanced_unbalanced_real = svr_unbalanced_unbalanced_classifier_linear.predict(test_vectors_svr_unbalanced_unbalanced_real)
report = classification_report(dfRealSadness['Sentiment'], prediction_linear_svr_unbalanced_unbalanced_real, output_dict=True)

print()
print("Performance on real world data shown below")
print('not sadness: ', report['not sadness'])
print('sadness: ', report['sadness'])


Training time: 488.968851s; Prediction time: 14.895323s
not sadness:  {'precision': 0.8405200789993417, 'recall': 0.9926141885325559, 'f1-score': 0.9102575528027805, 'support': 5145}
sadness:  {'precision': 0.45714285714285713, 'recall': 0.03196803196803197, 'f1-score': 0.059757236227824466, 'support': 1001}

Performance on real world data shown below
not sadness:  {'precision': 0.7306900102986612, 'recall': 1.0, 'f1-score': 0.8443915501338888, 'support': 1419}
sadness:  {'precision': 1.0, 'recall': 0.09982788296041308, 'f1-score': 0.1815336463223787, 'support': 581}
