In [1]:
import sys
sys.path.insert(0, '../scripts/')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import transform

In [3]:
# load train set into dataframe
df_train = pd.read_csv('../data/train_data.csv')

# shape: (rows, columns)
display(df_train.shape)

# class distribution
# 1: Clickbait
# 0: Not clickbait
display(df_train['clickbait'].value_counts())

# first 5 datapoints
df_train.head()

(16926, 2)

1    9149
0    7777
Name: clickbait, dtype: int64

Unnamed: 0,headline,clickbait
0,13 Crucial Money-Saving Charts You Wish You Kn...,1
1,"This Couple Shares Their House With A ""Unicorn...",1
2,Bomb Kills 7 Afghan Civilians at U.S. Base,0
3,19 Reasons Why No One Should Ever Play Video G...,1
4,23 Dance Moves That Changed Our Lives In 2015,1


In [4]:
# encode train set
# separate independent & dependent features
X_train = preprocess_corpus(df_train.headline)
Y_train = df_train.clickbait

print(X_train.shape, Y_train.shape)

# encode independent feature: X_train
# convert into word2vec representation(document matrix)
X_train_w2v, _ = transform(corpus=X_train, model_load_path='../models/word2vec.model')

# shape of document matrix: (rows, columns)
display(X_train_w2v.shape)

# first 5 datapoints
display(X_train_w2v.head())

# class labels already encoded
# 1: positive
# 0: negative
display(Y_train.head())

(16926,) (16926,)


(16926, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.042667,0.07656,0.033115,0.010363,0.043448,-0.129825,0.030019,0.174838,-0.045415,-0.040404,...,0.08556,0.023163,0.01783,0.026024,0.154294,0.088923,0.07215,-0.120526,0.013626,0.01788
1,-0.105256,0.183864,0.078464,0.015432,0.102162,-0.312194,0.073484,0.424201,-0.114222,-0.093521,...,0.203236,0.062087,0.037498,0.060872,0.361855,0.211108,0.163453,-0.276645,0.041788,0.027813
2,-0.117433,0.204922,0.087462,0.011423,0.123132,-0.349052,0.077376,0.479356,-0.118469,-0.099815,...,0.214424,0.073693,0.043468,0.061796,0.402968,0.241415,0.190069,-0.317817,0.039098,0.027031
3,-0.236498,0.403493,0.173219,0.043679,0.237102,-0.70555,0.158065,0.950663,-0.267106,-0.215982,...,0.467919,0.14586,0.086725,0.150724,0.812961,0.476877,0.362651,-0.626618,0.07851,0.063869
4,-0.116208,0.206998,0.092716,0.019684,0.118122,-0.365422,0.076832,0.492775,-0.134824,-0.106024,...,0.241468,0.074368,0.041955,0.07044,0.413168,0.246715,0.193683,-0.331463,0.049994,0.031526


0    1
1    1
2    0
3    1
4    1
Name: clickbait, dtype: int64

In [5]:
# load validation set into dataframe
df_valid = pd.read_csv('../data/valid_data.csv')

# shape: (rows, columns)
display(df_valid.shape)

# class distribution
# 1: clickbait
# 0: not clickbait
display(df_valid['clickbait'].value_counts())

# first 5 datapoints
df_valid.head()

(5642, 2)

1    3047
0    2595
Name: clickbait, dtype: int64

Unnamed: 0,headline,clickbait
0,British rapper Derek B dies at age 44,0
1,Few TV Reports on Audience Flight,0
2,Access Industries to Sue Chase Over Losses,0
3,A Guy Surprised His Girlfriend With Corgis And...,1
4,Ten US missionaries charged with child kidnapp...,0


In [6]:
# encode validation set
# separate independent & dependent features
X_valid = preprocess_corpus(df_valid.headline)
Y_valid = df_valid.clickbait

print(X_valid.shape, Y_valid.shape)

# encode independent feature: X_valid
# convert into word2vec representation(document matrix)
X_valid_w2v, _ = transform(corpus=X_valid, model_load_path='../models/word2vec.model')

# shape of document matrix: (rows, columns)
display(X_valid_w2v.shape)

# first 5 datapoints
display(X_valid_w2v.head())

# class labels already encoded
# 1: positive
# 0: negative
display(Y_valid.head())

(5642,) (5642,)


(5642, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.102778,0.191594,0.076402,0.008968,0.1098,-0.319564,0.070834,0.435434,-0.111942,-0.098389,...,0.209879,0.060758,0.037041,0.061273,0.367849,0.210089,0.169001,-0.288643,0.033318,0.028342
1,-0.116365,0.201767,0.085438,0.017545,0.122983,-0.336877,0.079621,0.458549,-0.119882,-0.097435,...,0.219921,0.07093,0.042872,0.061036,0.38233,0.229007,0.182231,-0.30848,0.048813,0.035985
2,-0.060614,0.113661,0.047501,0.011885,0.063253,-0.186135,0.04207,0.255287,-0.064985,-0.055606,...,0.123205,0.036246,0.02285,0.031611,0.212524,0.12992,0.097374,-0.171051,0.023028,0.020857
3,-0.163631,0.269809,0.118334,0.026566,0.154422,-0.478601,0.103347,0.649884,-0.184561,-0.143018,...,0.315559,0.101199,0.062173,0.097846,0.550577,0.321167,0.255833,-0.428914,0.058593,0.054122
4,-0.057772,0.1049,0.037814,0.011961,0.057116,-0.171075,0.040907,0.233384,-0.059863,-0.047145,...,0.106293,0.038255,0.022568,0.034348,0.192915,0.114502,0.091733,-0.155412,0.017874,0.018129


0    0
1    0
2    0
3    1
4    0
Name: clickbait, dtype: int64

In [7]:
# train and evaluate 3 models: NaiveBayes, SVM and XGBoost
# instantiate all models
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(),
    'Kernel SVM': SVC(),
    'XGBoost': GradientBoostingClassifier()
}

# initializing a result map
results = {
    'time_to_train': [],
    'accuracy': [],
    'f1': []
}

# train and test each models
for name, model in list(models.items()):
    # training start
    print('\ntraining', name + "...")
    start_time = time.time()
    
    model.fit(X_train_w2v, Y_train.values.ravel())

    # training end
    end_time = time.time()
    print('training completed:', '{:.2f}'.format(end_time - start_time), 'seconds')
    
    # make predictions on validation set
    Y_pred = model.predict(X_valid_w2v)
    
    # add results to result map
    results['time_to_train'].append(end_time - start_time)
    results['accuracy'].append(accuracy_score(Y_valid, Y_pred))
    results['f1'].append(f1_score(Y_valid, Y_pred, average=None))
    
# display the results
# time to train
display(pd.DataFrame(results['time_to_train'], index=models.keys(), columns=['Time (seconds)']))

# accuracy 
display(pd.DataFrame(results['accuracy'], index=models.keys(), columns=['Accuracy']))

# f1 score
display(pd.DataFrame(np.asarray(results['f1']).T, columns=models.keys(), index=['Clickbait', 'Not Clickbait']))


training Gaussian Naive Bayes...
training completed: 0.04 seconds

training Linear SVM...
training completed: 2.40 seconds

training Kernel SVM...
training completed: 23.21 seconds

training XGBoost...
training completed: 45.21 seconds


Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,0.038032
Linear SVM,2.395039
Kernel SVM,23.205848
XGBoost,45.213144


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.655973
Linear SVM,0.84385
Kernel SVM,0.780043
XGBoost,0.823467


Unnamed: 0,Gaussian Naive Bayes,Linear SVM,Kernel SVM,XGBoost
Clickbait,0.687188,0.843211,0.791394,0.816236
Not Clickbait,0.617838,0.844484,0.767385,0.83015
