In [1]:
import sys

# make the local helper scripts imported below discoverable
# the path is relative to the notebook's working directory
# the membership guard keeps sys.path free of duplicate entries when this cell is re-run
SCRIPTS_DIR = '../../scripts/'
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd
import numpy as np

# word2vec
from gensim.models import Word2Vec

# local scripts
from text_utils import preprocess_corpus
from word2vec_utils import fit_transform, transform

In [3]:
# load the training set from disk
train_csv = '../../data/train_data.csv'
df = pd.read_csv(train_csv)

# (rows, columns), followed by a preview of the first five rows
display(df.shape)
df.head(n=5)

(16926, 2)

Unnamed: 0,headline,clickbait
0,13 Crucial Money-Saving Charts You Wish You Kn...,1
1,"This Couple Shares Their House With A ""Unicorn...",1
2,Bomb Kills 7 Afghan Civilians at U.S. Base,0
3,19 Reasons Why No One Should Ever Play Video G...,1
4,23 Dance Moves That Changed Our Lives In 2015,1


In [4]:
# clean the raw headlines with the local preprocess_corpus helper
# (special characters and stopwords removed, then lemmatized)
clean_headlines = preprocess_corpus(df['headline'])

# preview the cleaned text
clean_headlines.head()

0    crucial money saving chart wish knew sooner
1     couple share house unicorn beyond adorable
2                 bomb kill afghan civilian base
3                reason one ever play video game
4                        dance move changed life
Name: headline, dtype: object

In [5]:
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
# params = {
#     'window': 5,
#     'min_count': 2,
#     'epochs': 5
# }

# trains a word2vec model
# builds vocabulary
# returns document_matrix and model_path
# document_matrix, model_path = fit_transform(
#     corpus=clean_headlines,
#     model_save_path='../../models/word2vec.model',
#     params=params
# )

# loads a pre-trained model
# returns document_matrix and model_path
# the corpus is the preprocessed text (clean_headlines), matching what the
# training call above is fed; passing the raw df.headline would embed text the
# model was not trained on
# NOTE(review): if transform() already preprocesses internally, confirm that
# cleaning twice is harmless
# the second return value is named instead of being assigned to `_`, which
# IPython uses for the last cell output
document_matrix, model_path = transform(
    corpus=clean_headlines,
    model_load_path='../../models/word2vec.model',
)

document_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.022821,0.043124,0.019096,0.006133,0.024353,-0.071657,0.016336,0.09747,-0.025611,-0.022787,...,0.047528,0.013053,0.009424,0.014252,0.086497,0.048809,0.039677,-0.066688,0.008327,0.01075
1,-0.046545,0.081055,0.034726,0.006641,0.045166,-0.139008,0.03194,0.187843,-0.050273,-0.040907,...,0.090853,0.027441,0.016607,0.027821,0.161418,0.094218,0.073036,-0.122185,0.018143,0.011699
2,-0.049629,0.087215,0.036691,0.004442,0.052971,-0.147398,0.032353,0.203459,-0.050758,-0.041484,...,0.089185,0.029067,0.016608,0.025381,0.169752,0.101653,0.079437,-0.133485,0.016989,0.013699
3,-0.08368,0.139283,0.059884,0.014904,0.084011,-0.2482,0.055297,0.335095,-0.095153,-0.077084,...,0.165197,0.051394,0.031166,0.053842,0.28663,0.167797,0.128922,-0.221228,0.028087,0.024871
4,-0.01006,0.018265,0.007958,0.003375,0.009607,-0.032204,0.005962,0.042215,-0.012261,-0.007369,...,0.021011,0.005816,0.003951,0.004539,0.036957,0.021555,0.0177,-0.029457,0.005785,0.003636


In [6]:
# build the modelling frame: embedding features plus the class label
# .assign() returns a new frame, so document_matrix is never modified by this
# cell (with older pandas, pd.DataFrame(existing_frame) followed by a column
# assignment could write through to the original frame)
df_w2v = pd.DataFrame(document_matrix).assign(clickbait=df['clickbait'])

# labels are attached by index alignment; a mismatch would silently yield NaN
# or wrongly paired labels, so fail loudly instead
assert df_w2v.index.equals(df.index), 'document matrix rows do not line up with df'

display(df_w2v.shape)
df_w2v.head()

(16926, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,clickbait
0,-0.022821,0.043124,0.019096,0.006133,0.024353,-0.071657,0.016336,0.09747,-0.025611,-0.022787,...,0.013053,0.009424,0.014252,0.086497,0.048809,0.039677,-0.066688,0.008327,0.01075,1
1,-0.046545,0.081055,0.034726,0.006641,0.045166,-0.139008,0.03194,0.187843,-0.050273,-0.040907,...,0.027441,0.016607,0.027821,0.161418,0.094218,0.073036,-0.122185,0.018143,0.011699,1
2,-0.049629,0.087215,0.036691,0.004442,0.052971,-0.147398,0.032353,0.203459,-0.050758,-0.041484,...,0.029067,0.016608,0.025381,0.169752,0.101653,0.079437,-0.133485,0.016989,0.013699,0
3,-0.08368,0.139283,0.059884,0.014904,0.084011,-0.2482,0.055297,0.335095,-0.095153,-0.077084,...,0.051394,0.031166,0.053842,0.28663,0.167797,0.128922,-0.221228,0.028087,0.024871,1
4,-0.01006,0.018265,0.007958,0.003375,0.009607,-0.032204,0.005962,0.042215,-0.012261,-0.007369,...,0.005816,0.003951,0.004539,0.036957,0.021555,0.0177,-0.029457,0.005785,0.003636,1


In [7]:
# restore the saved word2vec model from disk
word2vec_path = '../../models/word2vec.model'
model = Word2Vec.load(word2vec_path)

In [8]:
# nearest neighbours in the embedding space (most_similar's default: top 10)
# shown for 'celebrity' first, then for 'president'
for query_word in ('celebrity', 'president'):
    display(model.wv.most_similar(query_word))

[('one', 0.9994243383407593),
 ('movie', 0.9993767738342285),
 ('song', 0.9993736743927002),
 ('disney', 0.9992916584014893),
 ('show', 0.9992498755455017),
 ('guess', 0.9991766810417175),
 ('christmas', 0.9991670846939087),
 ('dog', 0.9991593360900879),
 ('actually', 0.999046802520752),
 ('age', 0.9990253448486328)]

[('win', 0.9997058510780334),
 ('state', 0.9997019171714783),
 ('say', 0.9996983408927917),
 ('leader', 0.9996972680091858),
 ('australian', 0.9996802806854248),
 ('british', 0.9996708631515503),
 ('new', 0.9996646046638489),
 ('china', 0.999664306640625),
 ('two', 0.9996587634086609),
 ('death', 0.9996582269668579)]

In [9]:
# cosine similarity between the vectors of a single word pair
model.wv.similarity('leader', 'president')

0.9996973