In [1]:
import sys
sys.path.insert(0, '../scripts/')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# core numerical / tabular packages
import numpy as np
import pandas as pd

# text encoders: raw term counts (bag of words) and TF-IDF weighting
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# models: SVM classifiers and grid search over hyperparameters
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV 

# timing and evaluation metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")  # apply seaborn's "whitegrid" style to subsequent figures

# local scripts
# NOTE(review): text_utils is presumably resolved through the '../scripts/'
# entry inserted into sys.path in the first cell - confirm the module lives there
from text_utils import preprocess_corpus

In [3]:
# read the three pre-split CSV files into dataframes
df_train = pd.read_csv('../data/train_data.csv')
df_valid = pd.read_csv('../data/valid_data.csv')
df_test = pd.read_csv('../data/test_data.csv')

# fold the validation rows into the training frame, shuffle every row with a
# fixed seed so the order is reproducible, and rebuild a clean 0..n-1 index
df_train = (
    pd.concat([df_train, df_valid])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# (rows, columns) of the merged training frame and of the test frame
display(df_train.shape, df_test.shape)

(22568, 2)

(5642, 2)

In [4]:
# independent feature: the headline text, passed through the project's
# preprocess_corpus helper (training frame first, then test frame)
X_train = preprocess_corpus(df_train['headline'])
X_test = preprocess_corpus(df_test['headline'])

# dependent feature: the clickbait label column
y_train, y_test = df_train['clickbait'], df_test['clickbait']

# number of preprocessed headlines in each split
display(X_train.shape, X_test.shape)

(22568,)

(5642,)

In [5]:
# preview the first five rows of the merged training frame and of the test frame
display(df_train.head(), df_test.head())

Unnamed: 0,headline,clickbait
0,Your Friend Who's Back From Traveling,1
1,U.S. jury deliberates immigrant smuggler case,0
2,Top Russian Aide Calls for Less Kremlin Control,0
3,"How Well Do You Remember The Kids From ""Hey Ar...",1
4,Nick Kroll Just Said The Most Disgusting Thing...,1


Unnamed: 0,headline,clickbait
0,Australian Prime Minister denies striking a de...,0
1,The Bottom for Housing Is Probably Not Near,0
2,Do You Remember The Hogwarts House Of These Mi...,1
3,Here's Every Tweet Drake Has Ever Favorited,1
4,What Would Zooey Deschanel Name You,1


# BoW Transformation

In [6]:
# bag-of-words encoding
# min_df=5: terms that occur in fewer than five training headlines are dropped
bow_vectorizer = CountVectorizer(min_df=5)

# learn the vocabulary on the training headlines only, then encode both splits
bow_train_counts = bow_vectorizer.fit_transform(X_train)
bow_test_counts = bow_vectorizer.transform(X_test)

# one column per vocabulary term, shared by both frames
bow_terms = bow_vectorizer.get_feature_names_out()
X_train_bow = pd.DataFrame(bow_train_counts.toarray(), columns=bow_terms)
X_test_bow = pd.DataFrame(bow_test_counts.toarray(), columns=bow_terms)

# (documents, vocabulary size) for each split
display(X_train_bow.shape, X_test_bow.shape)

(22568, 4017)

(5642, 4017)

In [7]:
# preview the first five rows of the bag-of-words train and test matrices
display(X_train_bow.head(), X_test_bow.head())

Unnamed: 0,aaron,abandon,abc,abduct,able,aboard,abortion,abroad,absolute,absolutely,...,zelda,zendaya,zero,zimbabwe,zimbabwean,zodiac,zombie,zone,zoo,zoolander
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,aaron,abandon,abc,abduct,able,aboard,abortion,abroad,absolute,absolutely,...,zelda,zendaya,zero,zimbabwe,zimbabwean,zodiac,zombie,zone,zoo,zoolander
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Transformation

In [8]:
# TF-IDF encoding
# min_df=5: terms that occur in fewer than five training headlines are dropped
tfidf_vectorizer = TfidfVectorizer(min_df=5)

# learn vocabulary and IDF weights on the training headlines only, then
# encode both splits with the fitted vectorizer
tfidf_train_weights = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_weights = tfidf_vectorizer.transform(X_test)

# one column per vocabulary term, shared by both frames
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(tfidf_train_weights.toarray(), columns=tfidf_terms)
X_test_tfidf = pd.DataFrame(tfidf_test_weights.toarray(), columns=tfidf_terms)

# (documents, vocabulary size) for each split
display(X_train_tfidf.shape, X_test_tfidf.shape)

(22568, 4017)

(5642, 4017)

In [9]:
# preview the first five rows of the TF-IDF train and test matrices
display(X_train_tfidf.head(), X_test_tfidf.head())

Unnamed: 0,aaron,abandon,abc,abduct,able,aboard,abortion,abroad,absolute,absolutely,...,zelda,zendaya,zero,zimbabwe,zimbabwean,zodiac,zombie,zone,zoo,zoolander
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,aaron,abandon,abc,abduct,able,aboard,abortion,abroad,absolute,absolutely,...,zelda,zendaya,zero,zimbabwe,zimbabwean,zodiac,zombie,zone,zoo,zoolander
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Hyperparameter Tuning: Kernel SVM + TF-IDF

In [10]:
# hyperparameter grid for the kernel SVM: three values of the regularisation
# parameter C crossed with three non-linear kernels (3 x 3 = 9 candidates)
params_kernel_svm = dict(
    C=[0.1, 1, 10],
    kernel=['poly', 'rbf', 'sigmoid'],
)

# Kernel SVM + TF-IDF

In [None]:
# Grid search over `params_kernel_svm` for a kernel SVM on the TF-IDF features,
# scored with 3-fold cross-validation.
# GridSearchCV is already imported in the notebook's imports cell, so it is not
# re-imported here; keeping imports in one place makes dependencies visible.
# NOTE: 9 candidates x 3 folds = 27 kernel-SVM fits on the dense TF-IDF frame,
# so expect this cell to run for a long time.
grid_search_cv = GridSearchCV(SVC(), params_kernel_svm, verbose=4, cv=3)

# labels are passed as a plain NumPy array; the features stay a DataFrame
grid_search_cv.fit(X_train_tfidf, y_train.values)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
