In [1]:
import sys

# Make the project's helper scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries at the front of sys.path.
SCRIPTS_DIR = '../../scripts/'
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

In [2]:
# import required packages
import pandas as pd
import numpy as np

# word2vec
from gensim.models import Word2Vec

# local scripts
from word2vec_utils import fit_transform, transform

In [3]:
# load the training data from CSV into a dataframe
train_csv = '../../data/train_data.csv'
df = pd.read_csv(train_csv)

# report the dimensions, then preview the first ten rows
display(df.shape)
df.head(10)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying
5,breanichole rapeisntokay made gay joke rape jo...,gender
6,hindustan time report gay rape joke taken obje...,gender
7,willy real man know able cook one attractive q...,gender
8,notallmen evolving notallmen evolved blameonen...,gender
9,anirban stupid mamata regime hundred bjp worke...,religion


In [4]:
# A word2vec model has already been trained and saved, so this cell does not
# train a new one; it loads the saved model through `transform`.
# Uncomment the block below to train and save a new model instead.
# params = {
#     'window': 5,
#     'min_count': 2,
#     'epochs': 5
# }

# trains a word2vec model
# builds vocabulary
# returns document_matrix and model_path
# document_matrix, model_path = fit_transform(
#     corpus=df.clean_tweet, 
#     model_save_path='../../models/word2vec.model', 
#     params=params
# )

# loads a pre-trained model
# returns document_matrix and model_path
# The second value is bound to `model_path` (as in the training block above)
# rather than `_`: assigning to `_` in IPython overrides the automatic
# "last output" variable for the rest of the session.
document_matrix, model_path = transform(
    corpus=df.clean_tweet,
    model_load_path='../../models/word2vec.model',
)

document_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.506708,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,0.257129,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.496192,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.622101,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.446736,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936


In [5]:
# Build the labelled feature table from an explicit copy of the document
# matrix. Without copy=True, pd.DataFrame(<DataFrame>) does not copy the
# input (on older pandas it can even share the same internal manager), so
# adding the label column below could leak into document_matrix itself.
df_w2v = pd.DataFrame(document_matrix, copy=True)

# include the class labels (pandas aligns them on the row index)
df_w2v['cyberbullying_type'] = df['cyberbullying_type']

display(df_w2v.shape)
df_w2v.head()

(28614, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,cyberbullying_type
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025,age
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668,not_cyberbullying
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698,age
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416,ethnicity
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936,other_cyberbullying


In [6]:
# restore the previously saved word2vec model from disk
word2vec_path = '../../models/word2vec.model'
model = Word2Vec.load(word2vec_path)

In [7]:
# for each probe word (bully, then dumb), show its top 10 most similar words
for probe_word in ('bully', 'dumb'):
    display(model.wv.most_similar(probe_word))

[('middle', 0.965322732925415),
 ('elementary', 0.9573146104812622),
 ('high', 0.9516770839691162),
 ('graduation', 0.9479526877403259),
 ('relentlessly', 0.9460520148277283),
 ('confrontation', 0.9452431201934814),
 ('grade', 0.9413620829582214),
 ('teased', 0.9374991059303284),
 ('teacher', 0.9353218674659729),
 ('tormentor', 0.9342045187950134)]

[('trayvon', 0.9649240970611572),
 ('shut', 0.9571725130081177),
 ('goshawty', 0.9547345042228699),
 ('sayin', 0.9497713446617126),
 ('ignorant', 0.9482175707817078),
 ('stupid', 0.9463467001914978),
 ('spic', 0.9458626508712769),
 ('beaner', 0.9441934823989868),
 ('nigga', 0.9439464807510376),
 ('subban', 0.9413833022117615)]

In [8]:
# cosine similarity between the vectors of two words
bully_teased_score = model.wv.similarity(w1='bully', w2='teased')
display(bully_teased_score)

0.93749917