In [1]:
import sys
# Put the shared scripts directory at the front of the module search path.
# NOTE(review): presumably this is where the local helper modules imported in
# the following cell (text_utils, word2vec_utils) live - confirm.
# This must run before those imports are executed.
sys.path.insert(0, '../../scripts/')

import warnings
# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# read the training split from disk into a dataframe
df_train = pd.read_csv('../../data/train_data.csv')

# show the dimensions as (rows, columns), then the number of samples per class
display(df_train.shape, df_train.Sentiment.value_counts())

# preview the first 5 records
df_train.head(n=5)

(3192, 2)

neutral     1566
positive    1110
negative     516
Name: Sentiment, dtype: int64

Unnamed: 0,clean_sentence,Sentiment
0,upm kymmene one world leading printing paper p...,positive
1,nokia pct eur kicking morning negative territory,positive
2,vasantha appointed managing director incap con...,neutral
3,consolidated net sale increased reach eur oper...,positive
4,cabot export production mainly goodyear bridge...,neutral


In [4]:
# ---- split the train frame into text (independent) and label (dependent) ----
X_train = df_train['clean_sentence']
Y_train = df_train['Sentiment']

print(X_train.shape, Y_train.shape)

# ---- text -> word2vec document matrix ----
# only the first value returned by transform() is used; the second is discarded
X_train_w2v, _ = transform(model_load_path='../../models/word2vec.model', corpus=X_train)

# document matrix dimensions (rows, columns), then its first 5 rows
display(X_train_w2v.shape, X_train_w2v.head())

# ---- labels -> integers ----
# the encoder is fitted here, on the training labels
le = LabelEncoder()
Y_train_le = pd.DataFrame({'encoded_sentiment': le.fit_transform(Y_train)})

# lookup table: original class label next to the integer it was encoded to
map_labels = pd.DataFrame({'labels': le.classes_}).assign(
    encoded_labels=le.transform(le.classes_)
)

map_labels

(3192,) (3192,)


(3192, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.10126,0.172969,0.074064,0.037569,0.042028,-0.270767,0.080114,0.420502,-0.107179,-0.086516,...,0.242757,0.009125,0.040415,0.059732,0.317194,0.08835,0.142855,-0.176983,0.111694,0.013735
1,-0.073112,0.133197,0.055148,0.031629,0.032111,-0.219228,0.06223,0.337676,-0.088739,-0.068926,...,0.189665,0.00997,0.037263,0.049762,0.25124,0.079446,0.102962,-0.143865,0.0864,0.010157
2,-0.079655,0.140546,0.05799,0.033934,0.03181,-0.217193,0.069451,0.338383,-0.087553,-0.06952,...,0.198599,0.007707,0.029304,0.051794,0.255313,0.07238,0.114715,-0.143702,0.092664,0.011519
3,-0.100543,0.177276,0.070339,0.049749,0.046601,-0.305025,0.080718,0.473022,-0.124815,-0.099016,...,0.24851,0.031143,0.062889,0.067259,0.353135,0.117301,0.137864,-0.187069,0.116288,0.022561
4,-0.067153,0.115755,0.05176,0.02575,0.026858,-0.184075,0.056593,0.280819,-0.073858,-0.058905,...,0.164316,0.006068,0.023441,0.042702,0.210729,0.060377,0.093277,-0.121066,0.076454,0.008403


Unnamed: 0,labels,encoded_labels
0,negative,0
1,neutral,1
2,positive,2


In [5]:
# read the validation split from disk into a dataframe
df_valid = pd.read_csv('../../data/valid_data.csv')

# show the dimensions as (rows, columns), then the number of samples per class
display(df_valid.shape, df_valid.Sentiment.value_counts())

# preview the first 5 records
df_valid.head(n=5)

(1065, 2)

neutral     522
positive    371
negative    172
Name: Sentiment, dtype: int64

Unnamed: 0,Sentence,Sentiment
0,The uranium found locally is naturally occurri...,neutral
1,STUK today is a full service house expert in r...,neutral
2,It is hand-painted resin with real 14-0 treble...,neutral
3,Finnish management software solutions provider...,negative
4,Finnish silicon wafer technology company Okmet...,positive


In [6]:
# run the raw validation sentences through the text-cleaning helper and
# store the result in a new column next to the original text
df_valid['clean_sentence'] = preprocess_corpus(df_valid.Sentence)

# compare raw and cleaned text side by side for the first 5 rows
df_valid[['Sentence', 'clean_sentence']].head()

Unnamed: 0,Sentence,clean_sentence
0,The uranium found locally is naturally occurri...,uranium found locally naturally occurring make...
1,STUK today is a full service house expert in r...,stuk today full service house expert radiation...
2,It is hand-painted resin with real 14-0 treble...,hand painted resin real treble long deep top b...
3,Finnish management software solutions provider...,finnish management software solution provider ...
4,Finnish silicon wafer technology company Okmet...,finnish silicon wafer technology company okmet...


In [7]:
# encode validation set
# separate independent & dependent features
X_valid = df_valid.clean_sentence
Y_valid = df_valid.Sentiment

print(X_valid.shape, Y_valid.shape)

# encode independent feature: X_valid
# convert into word2vec representation(document matrix)
# (only the first value returned by transform() is used; the second is discarded)
X_valid_w2v, _ = transform(corpus=X_valid, model_load_path='../../models/word2vec.model')

# shape of document matrix: (rows, columns)
display(X_valid_w2v.shape)

# first 5 datapoints
display(X_valid_w2v.head())

# label-encode dependent feature: Y_valid
# Reuse the label encoder that was fitted on the train set and ONLY transform here.
# Re-fitting on the validation labels (fit_transform) would overwrite le.classes_
# and could silently produce a different label -> integer mapping than the one the
# models are trained on whenever the validation label set differs from the train one.
# With transform(), an unseen validation label raises ValueError instead.
Y_valid_le = pd.DataFrame(le.transform(Y_valid), columns=['encoded_sentiment'])

(1065,) (1065,)


(1065, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.019193,0.033047,0.015969,0.00756,0.008128,-0.05074,0.015594,0.082862,-0.01998,-0.016471,...,0.047983,0.001007,0.009092,0.011367,0.059942,0.016393,0.02923,-0.034602,0.02059,0.00309
1,-0.062693,0.108787,0.048197,0.023045,0.022389,-0.166193,0.049548,0.256699,-0.066523,-0.056093,...,0.152906,0.003304,0.022139,0.036476,0.194929,0.056292,0.08836,-0.111141,0.069278,0.012869
2,-0.033062,0.05858,0.026355,0.011531,0.013699,-0.092959,0.02996,0.145712,-0.037916,-0.030962,...,0.086858,0.001443,0.010666,0.018872,0.10754,0.029164,0.045718,-0.062479,0.042334,0.006935
3,-0.123161,0.210547,0.091419,0.054468,0.052463,-0.344753,0.100767,0.544042,-0.138679,-0.110934,...,0.303716,0.020882,0.061303,0.083601,0.404393,0.124408,0.172903,-0.223795,0.132085,0.021604
4,-0.107937,0.185419,0.080251,0.047835,0.045531,-0.305427,0.08499,0.471371,-0.126692,-0.097493,...,0.263592,0.016822,0.05144,0.066612,0.35321,0.111452,0.151459,-0.194723,0.119754,0.020665


In [8]:
# train and evaluate 4 models: Naive Bayes, linear SVM, kernel SVM and gradient boosting

# fixed seed so the stochastic estimators give the same numbers on every run
SEED = 42

# instantiate all models
# NOTE: the 'XGBoost' entry is scikit-learn's GradientBoostingClassifier, not the
# xgboost library; the label is kept unchanged so existing result tables still match.
# GaussianNB has no random component; SVC() is left at its defaults.
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=SEED),
    'Kernel SVM': SVC(),
    'XGBoost': GradientBoostingClassifier(random_state=SEED)
}

# initializing a result map: one entry per model, in the iteration order of `models`
results = {
    'time_to_train': [],
    'accuracy': [],
    'f1': []
}

# flatten the single-column label frames once, outside the loop
y_train_true = Y_train_le.values.ravel()
y_valid_true = Y_valid_le.values.ravel()

# train and test each model
for name, model in models.items():
    # time only the call to fit(); perf_counter is a monotonic clock intended
    # for measuring short durations (unlike the wall-clock time.time())
    start_time = time.perf_counter()

    model.fit(X_train_w2v, y_train_true)

    end_time = time.perf_counter()

    # make predictions on validation set
    Y_pred = model.predict(X_valid_w2v)

    # add results to result map
    results['time_to_train'].append(end_time - start_time)
    results['accuracy'].append(accuracy_score(y_valid_true, Y_pred))
    # average=None -> one F1 value per class instead of a single aggregate
    results['f1'].append(f1_score(y_valid_true, Y_pred, average=None))

# display the results
model_names = list(models)

# time to train
display(pd.DataFrame(results['time_to_train'], index=model_names, columns=['Time (seconds)']))

# accuracy
display(pd.DataFrame(results['accuracy'], index=model_names, columns=['Accuracy']))

# f1 score: rows are classes, columns are models
display(pd.DataFrame(np.asarray(results['f1']).T, columns=model_names, index=le.classes_))

Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.011992
Linear SVM,0.379001
Kernel SVM,0.896983
XGBoost,22.604953


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.443192
Linear SVM,0.528638
Kernel SVM,0.525822
XGBoost,0.54554


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
negative,0.277136,0.0,0.0,0.195556
neutral,0.603006,0.685472,0.680481,0.677699
positive,0.250883,0.285185,0.220779,0.40566
