In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time

# Load the preprocessed corpus, add a whitespace-tokenised copy of the
# preprocessed text, then discard every row that has a missing value.
preprocessed_data = (
    pd.read_csv('processed_emotions_dataset.csv', index_col=0)
    .assign(preprocessed_text_split=lambda frame: frame['preprocessed_text'].str.split())
    .dropna()
)
print(preprocessed_data.head(10))

                                                text     label  \
0      i just feel really helpless and heavy hearted      fear   
1  ive enjoyed being able to slouch about relax a...   sadness   
2  i gave up my internship with the dmrg and am f...      fear   
3                         i dont know i feel so lost   sadness   
4  i am a kindergarten teacher and i am thoroughl...      fear   
5         i was beginning to feel quite disheartened   sadness   
6  i would think that whomever would be lucky eno...      love   
7  i fear that they won t ever feel that deliciou...       joy   
8  im forever taking some time out to have a lie ...  surprise   
9  i can still lose the weight without feeling de...   sadness   

                                   preprocessed_text  \
0          just feel realli helpless and heavi heart   
1  ive enjoy be abl to slouch about relax and unw...   
2  gave up my internship with the dmrg and am fee...   
3                             dont know feel so l

In [2]:
# Function to retrieve top few number of each category
def get_top_data(preprocessed_data, top_n, labels=None):
    """Return the first ``top_n`` rows of every requested label, stacked label by label.

    Parameters
    ----------
    preprocessed_data : pandas.DataFrame
        Frame with a ``'label'`` column.
    top_n : int
        Maximum number of rows kept per label (rows are taken in their
        existing order via ``head``).
    labels : iterable of str, optional
        Labels to keep, in the order their rows should appear in the result.
        Defaults to the six emotions joy, sadness, anger, fear, love, surprise.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-label slices, keeping the original index
        values. A label with fewer than ``top_n`` rows contributes only the
        rows it has; an empty ``labels`` yields an empty frame with the same
        columns.
    """
    if labels is None:
        labels = ('joy', 'sadness', 'anger', 'fear', 'love', 'surprise')

    label_column = preprocessed_data['label']
    per_label_frames = [
        preprocessed_data[label_column == label].head(top_n) for label in labels
    ]
    if not per_label_frames:
        # pd.concat raises on an empty list, so return an empty slice instead.
        return preprocessed_data.head(0)

    data_equal_size_per_label = pd.concat(per_label_frames)
    return data_equal_size_per_label

# Keep at most the first 15000 rows of every sentiment
data_equal_size_per_label = get_top_data(preprocessed_data, 15000)

# Report the per-label row counts of the selection, then preview it
print(
    "After segregating and taking equal number of rows for each sentiment:",
    data_equal_size_per_label['label'].value_counts(),
    sep="\n",
)
data_equal_size_per_label.head(10)

After segregating and taking equal number of rows for each sentiment:
label
joy         15000
sadness     15000
anger       15000
fear        15000
love        15000
surprise    15000
Name: count, dtype: int64


Unnamed: 0,text,label,preprocessed_text,preprocessed_text_split
7,i fear that they won t ever feel that deliciou...,joy,fear that they won ever feel that delici excit...,"[fear, that, they, won, ever, feel, that, deli..."
10,i try to be nice though so if you get a bitchy...,joy,tri to be nice though so if you get bitchi per...,"[tri, to, be, nice, though, so, if, you, get, ..."
12,i have officially graduated im not feeling as ...,joy,have offici graduat im not feel a ecstat a tho...,"[have, offici, graduat, im, not, feel, a, ecst..."
14,i feel my portfolio demonstrates how eager i a...,joy,feel my portfolio demonstr how eager am to lea...,"[feel, my, portfolio, demonstr, how, eager, am..."
15,i may be more biased than the next because i h...,joy,may be more bias than the next becaus have dep...,"[may, be, more, bias, than, the, next, becaus,..."
16,i didn t feel terrific,joy,didn feel terrif,"[didn, feel, terrif]"
21,i am feeling much stronger and more confident ...,joy,am feel much stronger and more confid now and ...,"[am, feel, much, stronger, and, more, confid, ..."
22,i take a shower i feel wonderful energetic and...,joy,take shower feel wonder energet and all my pre...,"[take, shower, feel, wonder, energet, and, all..."
26,i feel like i am actually getting something us...,joy,feel like am actual get someth use out of it,"[feel, like, am, actual, get, someth, use, out..."
27,i was able to overcome this anxiousness and fe...,joy,wa abl to overcom thi anxious and feel peac a ...,"[wa, abl, to, overcom, thi, anxious, and, feel..."


In [3]:
# Word2Vec hyper-parameters; sg = 1 selects the skip-gram model
size, window, min_count, workers, sg = 1000, 3, 1, 3, 1

word2vec_model_file = f'word2vec_{size}.model'
start_time = time.time()
# One token list per document, as an object array
stemmed_tokens = data_equal_size_per_label['preprocessed_text_split'].to_numpy()
# Train the Word2Vec model
w2v_model = Word2Vec(
    stemmed_tokens,
    vector_size=size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
)
print(f"Time taken to train word2vec model: {time.time() - start_time}")

# Persist the trained model, then read it back under the name used downstream
w2v_model.save(word2vec_model_file)
sg_w2v_model = Word2Vec.load(word2vec_model_file)


Time taken to train word2vec model: 19.15617799758911


In [4]:
# Vocabulary size of the loaded skip-gram model
print(len(sg_w2v_model.wv))
# Dimensionality of a single embedding, probed with the word "nice"
print("Length of the vector generated for the word nice")
print(sg_w2v_model.wv['nice'].shape[0])

24348
Length of the vector generated for the word nice
1000


In [5]:
# Train Test Split Function
def split_train_test(data_equal_size_per_label, test_size=0.3, shuffle_state=True, random_state=15):
    """Split the dataset into train and test parts and print the label counts of each.

    Parameters
    ----------
    data_equal_size_per_label : pandas.DataFrame
        Frame providing the ``'text'`` and ``'preprocessed_text_split'``
        feature columns and the ``'label'`` target column.
    test_size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    shuffle_state : bool, default True
        Passed to ``train_test_split`` as ``shuffle``.
    random_state : int, default 15
        Passed to ``train_test_split`` as ``random_state``; the default keeps
        the split this notebook has always used.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` exactly as returned by
        ``train_test_split``; the original index values are kept.
    """
    features = data_equal_size_per_label[['text', 'preprocessed_text_split']]
    labels = data_equal_size_per_label['label']
    X_train, X_test, Y_train, Y_test = train_test_split(
        features,
        labels,
        shuffle=shuffle_state,
        test_size=test_size,
        random_state=random_state,
    )
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    return X_train, X_test, Y_train, Y_test

# Shuffled split of the balanced dataset with 30% of the rows held out for testing
X_train, X_test, Y_train, Y_test = split_train_test(
    data_equal_size_per_label, test_size=0.3, shuffle_state=True
)


Value counts for Train sentiments
label
fear        10572
love        10546
sadness     10545
joy         10508
anger       10427
surprise    10402
Name: count, dtype: int64
Value counts for Test sentiments
label
surprise    4598
anger       4573
joy         4492
sadness     4455
love        4454
fear        4428
Name: count, dtype: int64


In [6]:
# Store the vectors for train data in following file.
# Each output line is one document: the comma-separated mean of its token
# embeddings, written in the same row order as X_train (no header, no index).
word2vec_filename = 'train_word2vec.csv'
with open(word2vec_filename, 'w') as word2vec_file:
    for tokens in X_train['preprocessed_text_split']:
        # Only look up tokens the model knows, so an out-of-vocabulary token
        # (possible once min_count > 1) is skipped instead of raising KeyError.
        token_vectors = [sg_w2v_model.wv[token] for token in tokens if token in sg_w2v_model.wv]
        if token_vectors:
            # this is where we can add the probabilities from LDA to do a weighted avg instead
            model_vector = np.mean(token_vectors, axis=0).tolist()
        else:
            # No usable token: fall back to a zero vector sized from the model
            # itself rather than a hard-coded 1000, and avoid averaging an
            # empty list (which yields NaN plus a RuntimeWarning).
            model_vector = [0] * sg_w2v_model.wv.vector_size
        word2vec_file.write(",".join(str(vector_element) for vector_element in model_vector))
        word2vec_file.write('\n')

In [16]:
# Load the training document vectors (one row per training document, no header)
word2vec_df = pd.read_csv(word2vec_filename, header=None)
# Initialize the model; a fixed random_state makes the fitted tree (and the
# report below) reproducible across runs
clf_decision_word2vec = DecisionTreeClassifier(random_state=15)

start_time = time.time()
# Fit the model; rows of word2vec_df are matched to Y_train by position
clf_decision_word2vec.fit(word2vec_df, Y_train)
print(f"Time taken to fit the model with word2vec vectors: {time.time() - start_time}")

# Test model: build one averaged embedding per test document
test_features_word2vec = []
for tokens in X_test['preprocessed_text_split']:
    # Skip tokens missing from the vocabulary instead of raising KeyError
    token_vectors = [sg_w2v_model.wv[token] for token in tokens if token in sg_w2v_model.wv]
    if token_vectors:
        # this is where we can add the probabilities from LDA to do a weighted avg instead
        model_vector = np.mean(token_vectors, axis=0).tolist()
    else:
        # No usable token: zero vector sized from the model, kept as a plain
        # list so every feature row has the same type
        model_vector = [0] * sg_w2v_model.wv.vector_size
    test_features_word2vec.append(model_vector)
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(classification_report(Y_test, test_predictions_word2vec))

Time taken to fit the model with word2vec vectors: 99.00531816482544
              precision    recall  f1-score   support

       anger       0.24      0.24      0.24      4573
        fear       0.22      0.24      0.23      4428
         joy       0.25      0.25      0.25      4492
        love       0.29      0.29      0.29      4454
     sadness       0.25      0.24      0.25      4455
    surprise       0.32      0.30      0.31      4598

    accuracy                           0.26     27000
   macro avg       0.26      0.26      0.26     27000
weighted avg       0.26      0.26      0.26     27000



In [15]:
# Debugging probe: show the type of the most recently computed `model_vector`.
# NOTE(review): the recorded output (numpy.ndarray) looks stale, since the visible
# cells call .tolist() on the averaged vector. Re-run in order or remove this cell.
type(model_vector)

numpy.ndarray

In [11]:
# Display the training feature matrix read from the word2vec CSV
# (one row per document, one column per embedding dimension; pandas truncates the rendering).
word2vec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.136748,0.007968,0.126728,0.141341,-0.019738,-0.013882,0.028697,0.067027,-0.059400,0.015920,...,0.011304,-0.044032,0.082148,-0.005767,0.077980,0.070310,-0.100464,-0.096644,0.011750,-0.063679
1,0.081536,0.015476,0.142125,0.125711,-0.049791,-0.060880,0.072929,0.110954,-0.122056,0.021158,...,-0.013020,-0.043412,0.065545,0.006164,0.039328,0.039257,-0.087567,-0.021173,0.038129,-0.065973
2,0.113411,0.054585,0.150720,0.182355,-0.029868,-0.011712,0.074743,0.032017,-0.075775,0.017285,...,-0.051781,0.009105,0.126549,-0.009556,0.115662,0.035854,-0.156091,-0.051777,0.100583,-0.015398
3,0.093993,0.012024,0.115904,0.112445,0.017658,0.007313,0.077514,0.042433,-0.056403,0.038597,...,0.014005,-0.086076,0.089747,-0.007296,0.056042,0.068427,-0.073300,-0.068122,0.023981,-0.089070
4,0.083298,0.040043,0.184852,0.157534,-0.024539,-0.029961,0.147358,0.087780,-0.125293,0.024725,...,-0.060195,-0.016713,0.141719,-0.043830,0.109405,0.057650,-0.139219,-0.082996,0.048123,-0.001969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62995,0.016483,0.036803,0.169598,0.149909,-0.029449,-0.027888,0.056955,0.054978,-0.054855,0.073228,...,-0.107963,-0.065031,0.215270,0.027194,0.206374,0.055063,-0.175919,-0.095489,0.055149,-0.064701
62996,0.113365,0.028381,0.127115,0.140026,-0.030701,0.003696,0.013907,0.083048,-0.045503,0.023704,...,-0.005925,-0.019372,0.068567,-0.004937,0.074965,0.062238,-0.070393,-0.068164,0.029360,-0.052166
62997,0.067098,0.030910,0.112865,0.148546,-0.003980,-0.015066,0.103125,0.052034,-0.059157,0.030212,...,-0.049408,-0.078799,0.073377,-0.021775,0.108815,0.058079,-0.114651,-0.060154,0.032671,-0.051571
62998,0.119125,0.025730,0.104771,0.132973,-0.043787,-0.023368,-0.001445,0.064221,-0.046767,0.065681,...,-0.008335,-0.033685,0.057660,-0.039281,0.064536,0.070580,-0.043613,-0.083904,0.016732,-0.087801
