In [1]:
import sys
sys.path.insert(0, '../scripts/')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# load train and validation sets into dataframe
df_train = pd.read_csv('../data/train_data.csv')
df_valid = pd.read_csv('../data/valid_data.csv')

# shape of train and test sets: (rows, columns)
df_train.shape, df_valid.shape

((16926, 2), (5642, 2))

In [4]:
# extract independent features
# preprocess text column
x_train = preprocess_corpus(df_train.headline)
x_valid = preprocess_corpus(df_valid.headline)

# extract dependent features
y_train = df_train.clickbait
y_valid = df_valid.clickbait

# shape of train and test features: (rows, columns)
display(x_train.shape, y_train.shape)
display(x_valid.shape, y_valid.shape)

(16926,)

(16926,)

(5642,)

(5642,)

In [5]:
# bag of words transformation
# instantiate a CountVectorizer
cv = CountVectorizer(min_df=15)

# train and construct bag of words
x_train_bow = pd.DataFrame(cv.fit_transform(x_train).toarray(), columns=cv.get_feature_names_out())
x_valid_bow = pd.DataFrame(cv.transform(x_valid).toarray(), columns=cv.get_feature_names_out())

# shape of document matrix: (rows, columns)
display(x_train_bow.shape, x_valid_bow.shape)

# first 5 datapoints of transformed train & test sets
display(x_train_bow.head())
display(x_valid_bow.head())

(16926, 1340)

(5642, 1340)

Unnamed: 0,absolute,absolutely,abuse,access,accident,accidentally,account,accused,across,act,...,wtf,yahoo,yankee,year,york,young,youtube,zealand,zimbabwe,zodiac
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,absolute,absolutely,abuse,access,accident,accidentally,account,accused,across,act,...,wtf,yahoo,yankee,year,york,young,youtube,zealand,zimbabwe,zodiac
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# word2vec transformation
# encode independent feature: x_train  & x_valid
# convert into word2vec representation(document matrix)
x_train_w2v, _ = transform(corpus=x_train, model_load_path='../models/word2vec.model')
x_valid_w2v, _ = transform(corpus=x_valid, model_load_path='../models/word2vec.model')

# shape of document matrix: (rows, columns)
display(x_train_w2v.shape, x_valid_w2v.shape)

# first 5 datapoints of encoded train set
display(x_train_w2v.head())

(16926, 100)

(5642, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.042667,0.07656,0.033115,0.010363,0.043448,-0.129825,0.030019,0.174838,-0.045415,-0.040404,...,0.08556,0.023163,0.01783,0.026024,0.154294,0.088923,0.07215,-0.120526,0.013626,0.01788
1,-0.105256,0.183864,0.078464,0.015432,0.102162,-0.312194,0.073484,0.424201,-0.114222,-0.093521,...,0.203236,0.062087,0.037498,0.060872,0.361855,0.211108,0.163453,-0.276645,0.041788,0.027813
2,-0.117433,0.204922,0.087462,0.011423,0.123132,-0.349052,0.077376,0.479356,-0.118469,-0.099815,...,0.214424,0.073693,0.043468,0.061796,0.402968,0.241415,0.190069,-0.317817,0.039098,0.027031
3,-0.236498,0.403493,0.173219,0.043679,0.237102,-0.70555,0.158065,0.950663,-0.267106,-0.215982,...,0.467919,0.14586,0.086725,0.150724,0.812961,0.476877,0.362651,-0.626618,0.07851,0.063869
4,-0.116208,0.206998,0.092716,0.019684,0.118122,-0.365422,0.076832,0.492775,-0.134824,-0.106024,...,0.241468,0.074368,0.041955,0.07044,0.413168,0.246715,0.193683,-0.331463,0.049994,0.031526


In [7]:
# tf-idf transformation

In [8]:
# class labels already encoded
# 1: positive
# 0: negative
y_train.head()

0    1
1    1
2    0
3    1
4    1
Name: clickbait, dtype: int64

In [9]:
# create a disctionary of all embeddings
# embedded train sets
train_embeddings = {
    'BoW': x_train_bow,
#     'tf-idf': x_train_tfidf,
    'Word2Vec': x_train_w2v,
}

# embedded validation sets
valid_embeddings = {
    'BoW': x_valid_bow,
#     'tf-idf': x_valid_tfidf,
    'Word2Vec': x_valid_w2v,
}

# instantiate all models
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(),
    'Kernel SVM': SVC(),
    'XGBoost': GradientBoostingClassifier()
}

# initialize a result map for storing embedding-wise results
# stores results for all embeddings
resultMap = {}

In [10]:
# train and evaluate 3 models: NaiveBayes, SVM and XGBoost
# train on all embeddings

# for each embedding
for embedding_name, x_train_embedded in list(train_embeddings.items()):
    print('\nembedding used:', embedding_name)
    
    # initializing a result map to store model-wise results
    # stores results for one embedding
    results = {
        'time_to_train': [],
        'accuracy': [],
        'f1': []
    }
    
    # for each model
    for model_name, model in list(models.items()):
        # training start
        print('training', model_name + "...")
        start_time = time.time()
        
        model.fit(x_train_embedded, y_train.values.ravel())
        
        # training end
        end_time = time.time()
        print('training completed:', '{:.2f}'.format(end_time - start_time), 'seconds')
        
        # make predictions on validation set
        y_pred = model.predict(valid_embeddings[embedding_name])
    
        # add results to result map
        results['time_to_train'].append(end_time - start_time)
        results['accuracy'].append(accuracy_score(y_valid, y_pred))
        results['f1'].append(f1_score(y_valid, y_pred, average=None))
    
    # adding model-wise results for each embedding
    resultMap[embedding_name] = results


embedding used: BoW
training Gaussian Naive Bayes...
training completed: 0.33 seconds
training Linear SVM...
training completed: 0.21 seconds
training Kernel SVM...
training completed: 156.09 seconds
training XGBoost...
training completed: 108.43 seconds

embedding used: Word2Vec
training Gaussian Naive Bayes...
training completed: 0.03 seconds
training Linear SVM...
training completed: 2.21 seconds
training Kernel SVM...
training completed: 22.62 seconds
training XGBoost...
training completed: 44.76 seconds


In [11]:
# display results
for embedding_name, results in list(resultMap.items()):
    print('\n' + embedding_name + ':')
    
    display(pd.DataFrame(results['time_to_train'], index=models.keys(), columns=['Time (seconds)']))

    # accuracy 
    display(pd.DataFrame(results['accuracy'], index=models.keys(), columns=['Accuracy']))

    # f1 score
    display(pd.DataFrame(np.asarray(results['f1']).T, columns=models.keys(), index=['Clickbait', 'Not Clickbait']))


BoW:


Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.328037
Linear SVM,0.208013
Kernel SVM,156.092798
XGBoost,108.42903


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.917937
Linear SVM,0.925381
Kernel SVM,0.925913
XGBoost,0.814959


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
Clickbait,0.908588,0.920401,0.921369,0.824183
Not Clickbait,0.925551,0.929775,0.92996,0.804714



Word2Vec:


Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.03
Linear SVM,2.205002
Kernel SVM,22.617999
XGBoost,44.762014


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.655973
Linear SVM,0.84385
Kernel SVM,0.780043
XGBoost,0.823467


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
Clickbait,0.687188,0.843211,0.791394,0.816236
Not Clickbait,0.617838,0.844484,0.767385,0.83015
