In [1]:
# expose the project's helper modules under ../scripts/ to the import system
import sys
sys.path.insert(0, '../scripts/')

# keep FutureWarning chatter out of the cell outputs
import warnings
warnings.simplefilter('ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# locations of the train and validation splits (relative to this notebook)
train_csv_path = '../data/train_data.csv'
valid_csv_path = '../data/valid_data.csv'

# read both splits into dataframes
df_train = pd.read_csv(train_csv_path)
df_valid = pd.read_csv(valid_csv_path)

# (rows, columns) of the train frame, then of the validation frame
for frame in (df_train, df_valid):
    display(frame.shape)

# preview: first 5 rows of the train frame
df_train.head(5)

(3192, 2)

(1065, 2)

Unnamed: 0,Sentence,Sentiment
0,UPM-Kymmene is one of the world 's leading pri...,positive
1,Nokia was up 0.12 pct to 16.70 eur after kicki...,positive
2,Mr K.R. Vasantha has been appointed Managing D...,neutral
3,Consolidated net sales increased 16 % to reach...,positive
4,CS Cabot exports 55 % of its production mainly...,neutral


In [4]:
# independent feature: the sentence text, passed through the project's
# preprocess_corpus helper (its exact cleaning steps live in text_utils)
x_train = preprocess_corpus(df_train['Sentence'])
x_valid = preprocess_corpus(df_valid['Sentence'])

# dependent feature: the sentiment label of each sentence
y_train = df_train['Sentiment']
y_valid = df_valid['Sentiment']

# shapes in order: train inputs, train labels, validation inputs, validation labels
display(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

(3192,)

(3192,)

(1065,)

(1065,)

In [5]:
# bag of words transformation
# min_df=4 / max_df=0.01 bound the document frequency of the terms that are kept
# (per scikit-learn: an int is an absolute document count, a float a proportion)
cv = CountVectorizer(min_df=4, max_df=0.01)

# learn the vocabulary on the train split only, then reuse it for validation
train_counts = cv.fit_transform(x_train)
valid_counts = cv.transform(x_valid)
vocabulary = cv.get_feature_names_out()

# dense document-term matrices, one column per vocabulary term
x_train_bow = pd.DataFrame(train_counts.toarray(), columns=vocabulary)
x_valid_bow = pd.DataFrame(valid_counts.toarray(), columns=vocabulary)

# (rows, columns) of both document-term matrices
display(x_train_bow.shape, x_valid_bow.shape)

# first 5 rows of the transformed train and validation sets
display(x_train_bow.head(), x_valid_bow.head())

(3192, 1466)

(1065, 1466)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accounted,accounting,acerta,...,worldwide,worth,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# tf-idf transformation (TODO: not implemented yet)

In [7]:
# word2vec transformation
# both splits are embedded with the same saved model
w2v_model_path = '../models/word2vec.model'

# transform() returns a pair; only the first item (the document matrix) is kept
x_train_w2v, _ = transform(corpus=x_train, model_load_path=w2v_model_path)
x_valid_w2v, _ = transform(corpus=x_valid, model_load_path=w2v_model_path)

# (rows, columns) of both document matrices
display(x_train_w2v.shape, x_valid_w2v.shape)

# first 5 rows of the embedded train set
display(x_train_w2v.head())

(3192, 100)

(1065, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.242757,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.189665,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.198599,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.24851,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.164316,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403


In [8]:
# encode class labels
# initialize a label encoder
le = LabelEncoder()

# learn the label -> integer mapping on the train labels only ...
y_train_le = pd.DataFrame(le.fit_transform(y_train), columns=['encoded_sentiment'])
# ... and reuse that same mapping for validation. Re-fitting here could assign
# different integers to the classes (e.g. if a class is missing from the
# validation split) and would leave `le` describing the validation fit.
y_valid_le = pd.DataFrame(le.transform(y_valid), columns=['encoded_sentiment'])

# lookup table: each sentiment class next to its encoded integer
display(pd.DataFrame({'sentiments': le.classes_, 'encoded_sentiments': le.transform(le.classes_)}, columns=['sentiments', 'encoded_sentiments']))

# first 5 encoded train labels
y_train_le.head()

Unnamed: 0,sentiments,encoded_sentiments
0,negative,0
1,neutral,1
2,positive,2


Unnamed: 0,encoded_sentiment
0,2
1,2
2,1
3,2
4,1


In [9]:
# create a dictionary of all embeddings
# embedded train sets, keyed by embedding name
train_embeddings = {
    'BoW': x_train_bow,
#     'tf-idf': x_train_tfidf,  # placeholder: enable once a tf-idf matrix exists
    'Word2Vec': x_train_w2v,
}

# embedded validation sets (must use the same keys as train_embeddings)
valid_embeddings = {
    'BoW': x_valid_bow,
#     'tf-idf': x_valid_tfidf,  # placeholder: enable once a tf-idf matrix exists
    'Word2Vec': x_valid_w2v,
}

# seed for every estimator below that draws random numbers while fitting,
# so that re-running the notebook reproduces the same scores
SEED = 42

# instantiate all models
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=SEED),
    'Kernel SVM': SVC(),
    # NOTE: labelled 'XGBoost', but this is scikit-learn's gradient boosting,
    # not the xgboost library; the label is kept so result tables stay comparable
    'XGBoost': GradientBoostingClassifier(random_state=SEED)
}

# initialize a result map for storing embedding-wise results
# stores results for all embeddings
resultMap = {}

In [10]:
# train and evaluate every model in `models` on every embedding
# results are collected per embedding in `resultMap`

# label vectors are the same for every embedding/model, so flatten them once
y_train_true = y_train_le.values.ravel()
y_valid_true = y_valid_le.values.ravel()

# for each embedding
for embedding_name, x_train_embedded in train_embeddings.items():
    print('\nembedding used:', embedding_name)

    # validation matrix built with the same embedding as the train matrix
    x_valid_embedded = valid_embeddings[embedding_name]

    # model-wise results for this embedding; each list follows the order of `models`
    results = {
        'time_to_train': [],
        'accuracy': [],
        'f1': []
    }

    # for each model
    for model_name, model in models.items():
        # training start
        print('training', model_name + "...")
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # durations; time.time() can jump if the system clock is adjusted
        start_time = time.perf_counter()

        # fit() re-trains the shared estimator instance from scratch for this embedding
        model.fit(x_train_embedded, y_train_true)

        # training end
        elapsed = time.perf_counter() - start_time
        print('training completed:', '{:.2f}'.format(elapsed), 'seconds')

        # make predictions on validation set
        y_pred = model.predict(x_valid_embedded)

        # add results to result map
        results['time_to_train'].append(elapsed)
        results['accuracy'].append(accuracy_score(y_valid_true, y_pred))
        # average=None keeps one F1 score per class instead of a single aggregate
        results['f1'].append(f1_score(y_valid_true, y_pred, average=None))

    # adding model-wise results for each embedding
    resultMap[embedding_name] = results


embedding used: BoW
training Gaussian Naive Bayes...
training completed: 0.08 seconds
training Linear SVM...
training completed: 0.11 seconds
training Kernel SVM...
training completed: 8.44 seconds
training XGBoost...
training completed: 54.59 seconds

embedding used: Word2Vec
training Gaussian Naive Bayes...
training completed: 0.01 seconds
training Linear SVM...
training completed: 0.33 seconds
training Kernel SVM...
training completed: 0.73 seconds
training XGBoost...
training completed: 21.93 seconds


In [11]:
# show the collected results, one group of three tables per embedding
model_names = list(models.keys())

for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # training time per model
    timing_table = pd.DataFrame(results['time_to_train'], index=model_names, columns=['Time (seconds)'])

    # validation accuracy per model
    accuracy_table = pd.DataFrame(results['accuracy'], index=model_names, columns=['Accuracy'])

    # per-class f1 score: one row per sentiment class, one column per model
    f1_matrix = np.asarray(results['f1']).transpose()
    f1_table = pd.DataFrame(f1_matrix, columns=model_names, index=le.classes_)

    display(timing_table, accuracy_table, f1_table)


BoW:


Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.082009
Linear SVM,0.111015
Kernel SVM,8.440003
XGBoost,54.588968


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.4723
Linear SVM,0.602817
Kernel SVM,0.628169
XGBoost,0.619718


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.393939,0.381877,0.273973,0.327434
neutral,0.519182,0.708098,0.749601,0.726199
positive,0.478049,0.545455,0.51446,0.477231



Word2Vec:


Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.006973
Linear SVM,0.331
Kernel SVM,0.730973
XGBoost,21.928999


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.443192
Linear SVM,0.528638
Kernel SVM,0.525822
XGBoost,0.53615


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.277136,0.0,0.0,0.163636
neutral,0.603006,0.685472,0.680481,0.673518
positive,0.250883,0.285185,0.220779,0.393798
