In [1]:
# put the local scripts folder at the front of the module search path so the
# helper modules imported later in this notebook can be found
import sys
sys.path.insert(0, '../scripts/')

# ignore FutureWarning messages so they do not clutter the cell outputs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# read the train and validation splits from disk into dataframes
df_train = pd.read_csv('../data/train_data.csv')
df_valid = pd.read_csv('../data/valid_data.csv')

# (rows, columns) of the train and validation sets
display(df_train.shape, df_valid.shape)

# peek at the first 5 datapoints of each split, train first
display(df_train.head(), df_valid.head())

(3192, 2)

(1065, 2)

Unnamed: 0,Sentence,Sentiment
0,UPM-Kymmene is one of the world 's leading pri...,positive
1,Nokia was up 0.12 pct to 16.70 eur after kicki...,positive
2,Mr K.R. Vasantha has been appointed Managing D...,neutral
3,Consolidated net sales increased 16 % to reach...,positive
4,CS Cabot exports 55 % of its production mainly...,neutral


Unnamed: 0,Sentence,Sentiment
0,The uranium found locally is naturally occurri...,neutral
1,STUK today is a full service house expert in r...,neutral
2,It is hand-painted resin with real 14-0 treble...,neutral
3,Finnish management software solutions provider...,negative
4,Finnish silicon wafer technology company Okmet...,positive


In [4]:
# independent feature: the sentence text, run through the preprocessing helper
x_train = preprocess_corpus(df_train['Sentence'])
x_valid = preprocess_corpus(df_valid['Sentence'])

# dependent feature: the sentiment label of each sentence
y_train = df_train['Sentiment']
y_valid = df_valid['Sentiment']

# shapes of features and labels: train pair first, then validation pair
display(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

(3192,)

(3192,)

(1065,)

(1065,)

In [5]:
# bag of words transformation
# instantiate a CountVectorizer with document-frequency cut-offs
bow_vectorizer = CountVectorizer(min_df=4, max_df=0.01)

# learn the vocabulary on the train set, then encode both sets with it;
# the learned vocabulary terms become the column names of each document matrix
x_train_bow = pd.DataFrame(
    bow_vectorizer.fit_transform(x_train).toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
x_valid_bow = pd.DataFrame(
    bow_vectorizer.transform(x_valid).toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)

# shape of each document matrix: (rows, columns)
display(x_train_bow.shape, x_valid_bow.shape)

# first 5 datapoints of transformed train & validation sets
display(x_train_bow.head(), x_valid_bow.head())

(3192, 1354)

(1065, 1354)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accumulate,acerta,across,...,worth,write,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accumulate,acerta,across,...,worth,write,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# tf-idf transformation
# instantiate a TfidfVectorizer with the same document-frequency cut-offs as the BoW vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=4, max_df=0.01)

# learn vocabulary and idf weights on the train set, then encode both sets with them;
# the learned vocabulary terms become the column names of each document matrix
x_train_tfidf = pd.DataFrame(
    tfidf_vectorizer.fit_transform(x_train).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
x_valid_tfidf = pd.DataFrame(
    tfidf_vectorizer.transform(x_valid).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# shape of each document matrix: (rows, columns)
display(x_train_tfidf.shape, x_valid_tfidf.shape)

# first 5 datapoints of transformed train & validation sets
display(x_train_tfidf.head(), x_valid_tfidf.head())

(3192, 1354)

(1065, 1354)

Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accumulate,acerta,across,...,worth,write,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,able,abp,abroad,access,accessory,accordance,account,accumulate,acerta,across,...,worth,write,www,yahoo,yesterday,yet,yhoo,yit,zinc,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# word2vec transformation
# encode x_train and x_valid as document matrices via the local transform() helper,
# pointing it at the saved word2vec model; the second returned value is not used here
x_train_w2v, _ = transform(
    corpus=x_train,
    model_load_path='../models/word2vec.model',
)
x_valid_w2v, _ = transform(
    corpus=x_valid,
    model_load_path='../models/word2vec.model',
)

# shape of each document matrix: (rows, columns)
display(x_train_w2v.shape, x_valid_w2v.shape)

# first 5 datapoints of encoded train & validation sets
display(x_train_w2v.head(), x_valid_w2v.head())

(3192, 100)

(1065, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.106715,0.183824,0.078675,0.039524,0.045597,-0.288272,0.085725,0.449023,-0.114871,-0.090446,...,0.257599,0.009447,0.043548,0.063163,0.33726,0.094499,0.150658,-0.191586,0.120565,0.013052
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.189665,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157
2,-0.076327,0.133481,0.05647,0.031461,0.029904,-0.206473,0.064642,0.318556,-0.082676,-0.065952,...,0.186415,0.007885,0.027156,0.048165,0.240993,0.068104,0.107373,-0.135107,0.088116,0.010484
3,-0.098803,0.172337,0.06953,0.048289,0.043873,-0.293859,0.079965,0.453978,-0.121126,-0.094832,...,0.24142,0.029056,0.057741,0.06357,0.341761,0.111469,0.133045,-0.181623,0.113092,0.023307
4,-0.060102,0.103584,0.045892,0.022114,0.024023,-0.164212,0.050208,0.250561,-0.065215,-0.052447,...,0.146232,0.005439,0.021338,0.037428,0.187625,0.053754,0.082929,-0.107971,0.068166,0.007335


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.020791,0.031437,0.01552,0.009276,0.006851,-0.05118,0.016983,0.082946,-0.020763,-0.014753,...,0.047976,0.000233,0.008044,0.009972,0.060323,0.017544,0.028904,-0.035012,0.020442,0.002685
1,-0.062693,0.108787,0.048197,0.023045,0.022389,-0.166193,0.049548,0.256699,-0.066523,-0.056093,...,0.152906,0.003304,0.022139,0.036476,0.194929,0.056292,0.08836,-0.111141,0.069278,0.012869
2,-0.034856,0.060391,0.027469,0.012226,0.013515,-0.095855,0.031257,0.150956,-0.038916,-0.032693,...,0.089125,0.001679,0.011707,0.019436,0.111348,0.030132,0.047945,-0.064545,0.044021,0.007618
3,-0.12283,0.208376,0.090955,0.053971,0.052974,-0.34138,0.099246,0.537688,-0.136044,-0.109898,...,0.301007,0.019685,0.06019,0.08202,0.399729,0.122577,0.171311,-0.220381,0.131607,0.020112
4,-0.108874,0.187367,0.081281,0.049034,0.045589,-0.306289,0.086452,0.475711,-0.128637,-0.097546,...,0.26686,0.015848,0.049485,0.068457,0.356345,0.110665,0.1538,-0.196324,0.121088,0.020203


In [8]:
# encode class labels
# initialize a label encoder
le = LabelEncoder()

# learn the label -> integer mapping from the train labels only
y_train_le = pd.DataFrame(le.fit_transform(y_train), columns=['encoded_sentiment'])

# reuse the already fitted mapping for the validation labels; refitting on the
# validation set could silently assign different integers to the same classes
# (e.g. when a class is missing from it) and would leave `le` fitted on
# validation data. transform() raises a ValueError on a label unseen in training.
y_valid_le = pd.DataFrame(le.transform(y_valid), columns=['encoded_sentiment'])

# show which integer each sentiment class was mapped to
display(pd.DataFrame({'sentiments': le.classes_, 'encoded_sentiments': le.transform(le.classes_)}, columns=['sentiments', 'encoded_sentiments']))

y_train_le.head()

Unnamed: 0,sentiments,encoded_sentiments
0,negative,0
1,neutral,1
2,positive,2


Unnamed: 0,encoded_sentiment
0,2
1,2
2,1
3,2
4,1


In [9]:
# create a dictionary of all embeddings, keyed by embedding name
# (the train and validation dictionaries share the same keys)
train_embeddings = {'BoW': x_train_bow, 'TF-IDF': x_train_tfidf, 'Word2Vec': x_train_w2v}
valid_embeddings = {'BoW': x_valid_bow, 'TF-IDF': x_valid_tfidf, 'Word2Vec': x_valid_w2v}

# instantiate all models with their default settings, keyed by display name
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(),
    'Kernel SVM': SVC(),
    'XGBoost': XGBClassifier(),
}

# result map: embedding name -> model-wise results for that embedding
resultMap = dict()

In [10]:
# train and evaluate every model in `models`
# (Gaussian Naive Bayes, Linear SVM, Kernel SVM and XGBoost) on all embeddings

# flatten the single-column label frames once, so that training and scoring
# both receive the labels in the same 1-D form
y_train_flat = y_train_le.values.ravel()
y_valid_flat = y_valid_le.values.ravel()

# score every encoded class explicitly: the per-class f1 vector then always has
# one entry per class, in the order of le.classes_, even when a class is absent
# from both the validation labels and the predictions
class_labels = le.transform(le.classes_)

# for each embedding
for embedding_name, x_train_embedded in train_embeddings.items():
    print('\n\t\t ' + embedding_name)
    print('########################################')

    # validation set encoded with the same embedding as the train set
    x_valid_embedded = valid_embeddings[embedding_name]

    # result map for one embedding; every list holds one entry per model,
    # in the iteration order of `models`
    results = {
        'time_to_train': [],
        'time_to_test': [],
        'accuracy': [],
        'f1': []
    }

    # for each model
    for model_name, model in models.items():
        # training start
        print('\ntraining', model_name + "...")
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring durations (time.time can jump when the system clock changes)
        start_time = time.perf_counter()

        # the same estimator object is refitted for each embedding
        model.fit(x_train_embedded, y_train_flat)

        # training end
        time_to_train = time.perf_counter() - start_time
        print('training completed:', '{:.2f}'.format(time_to_train), 'seconds')

        # testing start
        print('testing...')
        start_time = time.perf_counter()

        # make predictions on validation set
        y_pred = model.predict(x_valid_embedded)

        # testing end
        time_to_test = time.perf_counter() - start_time
        print('testing completed:', '{:.2f}'.format(time_to_test), 'seconds')

        # add test results
        results['time_to_train'].append(time_to_train)
        results['time_to_test'].append(time_to_test)
        results['accuracy'].append(accuracy_score(y_valid_flat, y_pred))
        # average=None keeps one f1 score per class instead of a single average
        results['f1'].append(f1_score(y_valid_flat, y_pred, labels=class_labels, average=None))

    # adding model-wise results for each embedding
    resultMap[embedding_name] = results
    print('\n########################################\n')


		 BoW
########################################

training Gaussian Naive Bayes...
training completed: 0.12 seconds
testing...
testing completed: 0.08 seconds

training Linear SVM...
training completed: 0.16 seconds
testing...
testing completed: 0.02 seconds

training Kernel SVM...
training completed: 13.85 seconds
testing...
testing completed: 4.99 seconds

training XGBoost...
training completed: 12.53 seconds
testing...
testing completed: 0.09 seconds

########################################


		 TF-IDF
########################################

training Gaussian Naive Bayes...
training completed: 0.13 seconds
testing...
testing completed: 0.07 seconds

training Linear SVM...
training completed: 0.06 seconds
testing...
testing completed: 0.01 seconds

training Kernel SVM...
training completed: 13.37 seconds
testing...
testing completed: 4.76 seconds

training XGBoost...
training completed: 11.34 seconds
testing...
testing completed: 0.07 seconds

#####################################

In [11]:
# display results
# training vs. testing time per model, one table per embedding
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # rows follow the order of `models`, which is the order results were appended in
    display(pd.DataFrame(
        dict(Train=results['time_to_train'], Test=results['time_to_test']),
        index=list(models),
    ))


BoW:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.121995,0.083002
Linear SVM,0.162,0.016002
Kernel SVM,13.853994,4.991004
XGBoost,12.526038,0.091001



TF-IDF:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.132,0.070844
Linear SVM,0.055998,0.012082
Kernel SVM,13.37296,4.762998
XGBoost,11.336001,0.070002



Word2Vec:


Unnamed: 0,Train,Test
Gaussian Naive Bayes,0.015999,0.008003
Linear SVM,0.729004,0.001999
Kernel SVM,1.098992,0.459
XGBoost,7.258001,0.010999


In [12]:
# accuracy of each model, one table per embedding
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # one row per model, in the order of `models`
    display(pd.DataFrame({'Accuracy': results['accuracy']}, index=list(models)))


BoW:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.43662
Linear SVM,0.60939
Kernel SVM,0.624413
XGBoost,0.631925



TF-IDF:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.456338
Linear SVM,0.624413
Kernel SVM,0.628169
XGBoost,0.615962



Word2Vec:


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.433803
Linear SVM,0.524883
Kernel SVM,0.523944
XGBoost,0.532394


In [13]:
# per-class f1 score of each model, one table per embedding
for embedding_name, results in resultMap.items():
    print('\n' + embedding_name + ':')

    # results['f1'] holds one per-class score vector per model; transpose it so
    # that classes become the rows and models become the columns
    display(pd.DataFrame(np.transpose(results['f1']), index=le.classes_, columns=list(models)))


BoW:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.363958,0.387097,0.276786,0.40625
neutral,0.473545,0.701225,0.736589,0.7376
positive,0.45297,0.571805,0.52968,0.512821



TF-IDF:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.361702,0.414716,0.278261,0.353846
neutral,0.532178,0.716774,0.74062,0.729927
positive,0.44591,0.572973,0.543027,0.502355



Word2Vec:


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.280778,0.0,0.0,0.235294
neutral,0.60477,0.67867,0.679573,0.663873
positive,0.170385,0.268482,0.213043,0.41349


### Best Combinations: 

|   Accuracy   |   Model + Embedding  |
| ------------ | -------------------- |
|     63.2%    |    XGBoost + BoW     |
|     62.8%    |  Kernel SVM + TF-IDF |