In [1]:
import sys
# Put the scripts folder at the front of the module search path; presumably this
# is what lets the local helper modules (text_utils, word2vec_utils) be imported
# by the next cell -- verify against the repository layout.
sys.path.insert(0, '../scripts/')

import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# read the train and validation splits from CSV into dataframes
train_csv_path = '../data/train_data.csv'
valid_csv_path = '../data/valid_data.csv'
df_train, df_valid = pd.read_csv(train_csv_path), pd.read_csv(valid_csv_path)

# (rows, columns) of the train split, then of the validation split
for split_frame in (df_train, df_valid):
    display(split_frame.shape)

# preview the first 5 rows of each split
display(df_train.head(), df_valid.head())

(3192, 2)

(1065, 2)

Unnamed: 0,Sentence,Sentiment
0,UPM-Kymmene is one of the world 's leading pri...,positive
1,Nokia was up 0.12 pct to 16.70 eur after kicki...,positive
2,Mr K.R. Vasantha has been appointed Managing D...,neutral
3,Consolidated net sales increased 16 % to reach...,positive
4,CS Cabot exports 55 % of its production mainly...,neutral


Unnamed: 0,Sentence,Sentiment
0,The uranium found locally is naturally occurri...,neutral
1,STUK today is a full service house expert in r...,neutral
2,It is hand-painted resin with real 14-0 treble...,neutral
3,Finnish management software solutions provider...,negative
4,Finnish silicon wafer technology company Okmet...,positive


In [4]:
# independent feature: the Sentence column, run through the text preprocessing helper
x_train = preprocess_corpus(df_train['Sentence'])
x_valid = preprocess_corpus(df_valid['Sentence'])

# dependent feature: the Sentiment label of every sentence
y_train = df_train['Sentiment']
y_valid = df_valid['Sentiment']

# shapes of features and labels, train split first and validation split second
for features, labels in ((x_train, y_train), (x_valid, y_valid)):
    display(features.shape, labels.shape)

(3192,)

(3192,)

(1065,)

(1065,)

In [5]:
# bag of words transformation
# vectorizer configured with min_df=4 and max_df=0.01 to prune the vocabulary
bow_vectorizer = CountVectorizer(min_df=4, max_df=0.01)

# learn the vocabulary on the train split, then reuse it unchanged for validation
bow_train_counts = bow_vectorizer.fit_transform(x_train)
bow_valid_counts = bow_vectorizer.transform(x_valid)

# one column per vocabulary term, identical for both splits
bow_vocabulary = bow_vectorizer.get_feature_names_out()
x_train_bow = pd.DataFrame(bow_train_counts.toarray(), columns=bow_vocabulary)
x_valid_bow = pd.DataFrame(bow_valid_counts.toarray(), columns=bow_vocabulary)

# shape of each document matrix: (rows, columns)
display(x_train_bow.shape, x_valid_bow.shape)

# first 5 datapoints of the transformed train & validation sets
display(x_train_bow.head(), x_valid_bow.head())

(3192, 1466)

(1065, 1466)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# tf-idf transformation
# TfidfVectorizer configured with the same min_df / max_df pruning as the bag of words cell
tfidf_vectorizer = TfidfVectorizer(min_df=4, max_df=0.01)

# learn vocabulary and idf weights on the train split, then apply them to validation
tfidf_train_weights = tfidf_vectorizer.fit_transform(x_train)
tfidf_valid_weights = tfidf_vectorizer.transform(x_valid)

# one column per vocabulary term, identical for both splits
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
x_train_tfidf = pd.DataFrame(tfidf_train_weights.toarray(), columns=tfidf_vocabulary)
x_valid_tfidf = pd.DataFrame(tfidf_valid_weights.toarray(), columns=tfidf_vocabulary)

# shape of each document matrix: (rows, columns)
display(x_train_tfidf.shape, x_valid_tfidf.shape)

# first 5 datapoints of the transformed train & validation sets
display(x_train_tfidf.head(), x_valid_tfidf.head())

(3192, 1466)

(1065, 1466)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# word2vec transformation
# both splits are encoded through the same model file; the second value
# returned by transform is not used in this notebook
w2v_model_path = '../models/word2vec.model'
x_train_w2v, _ = transform(corpus=x_train, model_load_path=w2v_model_path)
x_valid_w2v, _ = transform(corpus=x_valid, model_load_path=w2v_model_path)

# shape of each document matrix: (rows, columns)
display(x_train_w2v.shape, x_valid_w2v.shape)

# first 5 datapoints of the encoded train & validation sets
display(x_train_w2v.head(), x_valid_w2v.head())

(3192, 100)

(1065, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.242757,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.189665,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.198599,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.24851,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.164316,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.019193,0.033047,0.015969,0.00756,0.008128,-0.05074,0.015594,0.082862,-0.01998,-0.016471,...,0.047983,0.001007,0.009092,0.011367,0.059942,0.016393,0.02923,-0.034602,0.02059,0.00309
1,-0.062693,0.108787,0.048197,0.023045,0.022389,-0.166193,0.049548,0.256699,-0.066523,-0.056093,...,0.152906,0.003304,0.022139,0.036476,0.194929,0.056292,0.08836,-0.111141,0.069278,0.012869
2,-0.033062,0.05858,0.026355,0.011531,0.013699,-0.092959,0.02996,0.145712,-0.037916,-0.030962,...,0.086858,0.001443,0.010666,0.018872,0.10754,0.029164,0.045718,-0.062479,0.042334,0.006935
3,-0.123161,0.210547,0.091419,0.054468,0.052463,-0.344753,0.100767,0.544042,-0.138679,-0.110934,...,0.303716,0.020882,0.061303,0.083601,0.404393,0.124408,0.172903,-0.223795,0.132085,0.021604
4,-0.107937,0.185419,0.080251,0.047835,0.045531,-0.305427,0.08499,0.471371,-0.126692,-0.097493,...,0.263592,0.016822,0.05144,0.066612,0.35321,0.111452,0.151459,-0.194723,0.119754,0.020665


In [8]:
# encode class labels as integers
# initialize a label encoder
le = LabelEncoder()

# learn the label -> integer mapping on the train labels only, then apply that
# same mapping to the validation labels. Refitting on the validation labels
# (fit_transform) could silently produce a different mapping if the set of
# classes in the validation split differs from the train split, and would leave
# le.classes_ describing the validation split instead of the train split.
y_train_le = pd.DataFrame(le.fit_transform(y_train), columns=['encoded_sentiment'])
y_valid_le = pd.DataFrame(le.transform(y_valid), columns=['encoded_sentiment'])

# show which integer each sentiment class was mapped to
display(pd.DataFrame({'sentiments': le.classes_, 'encoded_sentiments': le.transform(le.classes_)}, columns=['sentiments', 'encoded_sentiments']))

y_train_le.head()

Unnamed: 0,sentiments,encoded_sentiments
0,negative,0
1,neutral,1
2,positive,2


Unnamed: 0,encoded_sentiment
0,2
1,2
2,1
3,2
4,1


In [9]:
# create a dictionary of all embeddings
# embedded train sets, keyed by embedding name
train_embeddings = {
    'BoW': x_train_bow,
    'TF-IDF': x_train_tfidf,
    'Word2Vec': x_train_w2v,
}

# embedded validation sets, using the same keys as train_embeddings
valid_embeddings = {
    'BoW': x_valid_bow,
    'TF-IDF': x_valid_tfidf,
    'Word2Vec': x_valid_w2v,
}

# fixed seed so the estimators that use randomness give the same results on
# every Restart & Run All
RANDOM_STATE = 42

# instantiate all models
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=RANDOM_STATE),
    'Kernel SVM': SVC(),
    # NOTE(review): this entry is labelled 'XGBoost' but the estimator is
    # scikit-learn's GradientBoostingClassifier, not the xgboost library.
    # The key is kept unchanged because it is used as a label in the result tables.
    'XGBoost': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# initialize a result map for storing embedding-wise results
# maps embedding name -> results collected for that embedding
resultMap = {}

In [10]:
# train and evaluate every model in `models` on every embedding
# (Naive Bayes, linear SVM, kernel SVM and gradient boosting)


def _timed_call(func, *args):
    """Run ``func(*args)`` and return ``(result, elapsed_seconds)``.

    The duration is measured with ``time.perf_counter``, a monotonic
    high-resolution clock intended for timing short intervals, rather than
    the wall clock ``time.time``.
    """
    started = time.perf_counter()
    outcome = func(*args)
    return outcome, time.perf_counter() - started


# flatten the single-column label frames once; reused for every fit and metric
y_train_flat = y_train_le.values.ravel()
y_valid_flat = y_valid_le.values.ravel()

# for each embedding
for embedding_name, x_train_embedded in train_embeddings.items():
    # validation matrix built with the same embedding as the train matrix
    x_valid_embedded = valid_embeddings[embedding_name]

    print('\n\t\t ' + embedding_name)
    print('########################################')

    # model-wise results for this one embedding; every list is ordered like `models`
    results = {
        'time_to_train': [],
        'time_to_test': [],
        'accuracy': [],
        'f1': []
    }

    # for each model (the same estimator object is refitted for every embedding)
    for model_name, model in models.items():
        # training
        print('\ntraining', model_name + "...")
        _, time_to_train = _timed_call(model.fit, x_train_embedded, y_train_flat)
        print('training completed:', '{:.2f}'.format(time_to_train), 'seconds')

        # testing: make predictions on the validation set
        print('testing...')
        y_pred, time_to_test = _timed_call(model.predict, x_valid_embedded)
        print('testing completed:', '{:.2f}'.format(time_to_test), 'seconds')

        # add test results; average=None keeps one f1 score per class
        results['time_to_train'].append(time_to_train)
        results['time_to_test'].append(time_to_test)
        results['accuracy'].append(accuracy_score(y_valid_flat, y_pred))
        results['f1'].append(f1_score(y_valid_flat, y_pred, average=None))

    # adding model-wise results for each embedding
    resultMap[embedding_name] = results
    print('\n########################################\n')


		 BoW
########################################

training Gaussian Naive Bayes...
training completed: 0.09 seconds
testing...
testing completed: 0.05 seconds

training Linear SVM...
training completed: 0.13 seconds
testing...
testing completed: 0.03 seconds

training Kernel SVM...
training completed: 8.49 seconds
testing...
testing completed: 3.86 seconds

training XGBoost...
training completed: 54.65 seconds
testing...
testing completed: 0.03 seconds

########################################


		 TF-IDF
########################################

training Gaussian Naive Bayes...
training completed: 0.09 seconds
testing...
testing completed: 0.09 seconds

training Linear SVM...
training completed: 0.08 seconds
testing...
testing completed: 0.01 seconds

training Kernel SVM...
training completed: 11.17 seconds
testing...
testing completed: 4.16 seconds

training XGBoost...
training completed: 58.96 seconds
testing...
testing completed: 0.04 seconds

######################################

In [11]:
# display results
# one table per embedding: seconds spent fitting and predicting, one row per model
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # 'Train' / 'Test' columns hold the recorded durations
    timing_table = pd.DataFrame(
        {'Train': results['time_to_train'], 'Test': results['time_to_test']},
        index=models.keys(),
    )
    display(timing_table)


BoW:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.088768,0.052177
Linear SVM,0.125815,0.025627
Kernel SVM,8.486927,3.857519
XGBoost,54.64827,0.025523



TF-IDF:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.093594,0.088202
Linear SVM,0.081999,0.014004
Kernel SVM,11.171856,4.159997
XGBoost,58.962211,0.040027



Word2Vec:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.009971,0.003003
Linear SVM,0.504028,0.001997
Kernel SVM,1.239003,0.476054
XGBoost,22.478042,0.006986


In [12]:
# compare accuracy
# one table per embedding with the validation accuracy of every model
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # rows follow the order of the models dictionary
    accuracy_table = pd.DataFrame(
        results['accuracy'],
        index=models.keys(),
        columns=['Accuracy'],
    )
    display(accuracy_table)


BoW:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.4723
Linear SVM,0.602817
Kernel SVM,0.628169
XGBoost,0.621596



TF-IDF:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.485446
Linear SVM,0.619718
Kernel SVM,0.633803
XGBoost,0.612207



Word2Vec:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.443192
Linear SVM,0.528638
Kernel SVM,0.525822
XGBoost,0.542723


In [13]:
# compare f1 scores
# one table per embedding: a row per sentiment class, a column per model
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # results['f1'] holds one per-class score array per model; transpose so
    # that the classes become the rows
    per_class_f1 = np.transpose(np.asarray(results['f1']))
    f1_table = pd.DataFrame(per_class_f1, columns=models.keys(), index=le.classes_)
    display(f1_table)


BoW:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.393939,0.381877,0.273973,0.32
neutral,0.519182,0.708098,0.749601,0.728751
positive,0.478049,0.545455,0.51446,0.481884



TF-IDF:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.392381,0.398649,0.251121,0.271493
neutral,0.555152,0.727773,0.75367,0.727139
positive,0.474359,0.549125,0.543319,0.466546



Word2Vec:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.277136,0.0,0.0,0.177778
neutral,0.603006,0.685472,0.680481,0.677215
positive,0.250883,0.285185,0.220779,0.405616


### Best Combinations: 

|   Accuracy   |   Model + Embedding  |
| ------------ | -------------------- |
|     63.4%    |  Kernel SVM + TF-IDF |
|     62.8%    |   Kernel SVM + BoW   |