In [1]:
# import required packages
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# load the training tweets from CSV into a dataframe
df = pd.read_csv("../../data/train_data.csv")

# report the dimensions, then preview the first ten rows
display(df.shape)
df.head(n=10)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying
5,breanichole rapeisntokay made gay joke rape jo...,gender
6,hindustan time report gay rape joke taken obje...,gender
7,willy real man know able cook one attractive q...,gender
8,notallmen evolving notallmen evolved blameonen...,gender
9,anirban stupid mamata regime hundred bjp worke...,religion


In [3]:
# count the missing tweets before cleaning
display(df['clean_tweet'].isna().sum())

# substitute an empty string for every missing tweet
df['clean_tweet'] = df['clean_tweet'].fillna('')

# confirm that no missing values remain
display(df['clean_tweet'].isna().sum())

90

0

In [4]:
# word2vec expects each document as a list of tokens,
# so run gensim's simple_preprocess over every tweet
tweets = df['clean_tweet'].apply(simple_preprocess)

tweets.head(5)

0    [hate, ppl, high, school, used, bully, hot, om...
1                      [kat, andre, asshole, omg, mkr]
2    [new, access, trading, cause, need, high, leve...
3    [fuck, david, duke, racist, think, america, be...
4    [may, say, lot, hate, apologetic, army, hope, ...
Name: clean_tweet, dtype: object

In [5]:
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
# NOTE: everything below is intentionally commented out; the string above is
# the only expression in this cell, so it is what the cell displays.

# step 1: create an untrained Word2Vec model (window=5, min_count=2)
# model = Word2Vec(window=5, min_count=2)

# step 2: build the vocabulary from the entire tokenized corpus (`tweets`)
# model.build_vocab(tweets, progress_per=1000)

# step 3: train the word2vec model for 5 epochs over the corpus
# model.train(tweets, total_examples=model.corpus_count, epochs=5)

# step 4: save the model to disk
# kept commented to avoid overwriting the already trained model file
# model.save("../../models/word2vec.model")

'\nNot training a new word2vec model.\nA model has already been trained and is loaded for further usage.\nUncomment the code block to train and save a new model.\n'

In [6]:
# load the previously trained Word2Vec model from disk
# (the same path the commented-out training cell would save to)
# NOTE(review): gensim's save/load is pickle-based as far as I know, so only
# load model files that come from a trusted source -- confirm.
model = Word2Vec.load('../../models/word2vec.model')

In [7]:
# show the nearest neighbours (most_similar's default top 10)
# for each query word, in order: bully, then dumb
for query_word in ('bully', 'dumb'):
    display(model.wv.most_similar(query_word))

[('middle', 0.965322732925415),
 ('elementary', 0.9573146104812622),
 ('high', 0.9516770839691162),
 ('graduation', 0.9479526877403259),
 ('relentlessly', 0.9460520148277283),
 ('confrontation', 0.9452431201934814),
 ('grade', 0.9413620829582214),
 ('teased', 0.9374991059303284),
 ('teacher', 0.9353218674659729),
 ('tormentor', 0.9342045187950134)]

[('trayvon', 0.9649240970611572),
 ('shut', 0.9571725130081177),
 ('goshawty', 0.9547345042228699),
 ('sayin', 0.9497713446617126),
 ('ignorant', 0.9482175707817078),
 ('stupid', 0.9463467001914978),
 ('spic', 0.9458626508712769),
 ('beaner', 0.9441934823989868),
 ('nigga', 0.9439464807510376),
 ('subban', 0.9413833022117615)]

In [8]:
# cosine similarity between the vectors of two specific words
display(model.wv.similarity('bully', 'teased'))

0.93749917

In [9]:
# extract word vectors as dataframe from the model
# model.wv.vectors already holds the whole embedding matrix, with row i
# belonging to model.wv.index_to_key[i], so there is no need to look up
# every key one at a time (or to cast the keys with str()).
word_vectors = pd.DataFrame(
    model.wv.vectors,
    index=model.wv.index_to_key,
    copy=True,  # keep the dataframe independent of the model's own matrix
)

display(word_vectors.shape)
# fixed seed so the preview shows the same rows on every run
word_vectors.sample(10, random_state=42)

(14964, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
emilyosment,-0.004793,0.012991,0.014334,0.010615,-0.009132,-0.038383,0.005464,0.043048,-0.006638,-0.018775,...,0.019853,0.012172,-0.004809,0.016164,0.037076,0.011594,6e-06,-0.020643,0.000406,-0.023728
bleed,-0.027011,0.054985,0.02725,0.03182,0.003178,-0.122038,0.025748,0.127838,-0.046067,-0.026551,...,0.031881,0.034965,0.010488,0.040194,0.081613,0.024307,0.005719,-0.049306,0.01155,-0.038516
surely,-0.027868,0.025453,-0.005798,-0.036349,-0.000906,-0.178474,0.005754,0.209821,-0.031016,-0.061604,...,0.09301,0.097668,-0.043097,0.043864,0.111085,0.061619,-0.017218,-0.113981,0.094759,-0.068876
rapper,-0.045004,0.105878,0.101974,0.020865,-0.009697,-0.25126,0.12367,0.295109,-0.10337,-0.153898,...,0.15708,0.020963,0.080528,0.006814,0.199923,0.070892,-0.0001,-0.113822,-0.003454,-0.00809
indignant,0.004932,0.021559,-0.006736,-0.009105,0.00637,-0.038852,0.007039,0.060422,-0.017879,-0.02402,...,0.033146,0.020752,-0.005399,0.011678,0.034707,0.004646,-0.003044,-0.027406,0.017193,-0.016526
aftr,-0.019897,0.019411,0.001644,-0.005279,-0.003192,-0.03888,-0.010833,0.043608,-0.011909,-0.007416,...,0.01902,0.009331,-0.004575,-0.003863,0.026316,-0.000525,0.001802,-0.027862,0.005961,-0.003481
min,-0.060264,0.105877,0.062432,0.02573,-0.011669,-0.277301,0.050899,0.297659,-0.100136,-0.10589,...,0.11082,0.085152,-0.000301,0.06116,0.224191,0.064564,0.03254,-0.144687,0.044474,-0.05947
helicopter,-0.009197,0.003742,-0.00051,0.014795,0.008572,-0.017819,0.00215,0.014532,-0.003691,-0.01766,...,0.007414,0.009574,0.011733,-0.003562,0.010313,0.011585,-0.004077,-0.00615,0.008359,-0.007878
dedicated,0.000297,0.042173,-0.000714,0.042185,0.017291,-0.089707,0.031084,0.115978,-0.056395,-0.026477,...,0.022404,0.038928,-0.012271,0.048693,0.079122,0.037819,-0.007703,-0.033739,0.027571,-0.049491
actively,-0.008885,0.050879,0.032862,0.02832,0.009631,-0.170413,0.04108,0.215705,-0.079819,-0.075703,...,0.102801,0.063497,0.006612,0.043127,0.132991,0.059715,-0.016004,-0.092589,0.050778,-0.066054


In [10]:
# generate document vectors from word vectors:
# each tweet becomes the sum of its in-vocabulary word vectors divided by
# the tweet's token count
doc_vector = []
# a set gives O(1) membership tests; model.wv.index_to_key itself is a list,
# so testing against it directly scans the vocabulary for every token
words = set(model.wv.index_to_key)
# take the dimensionality from the model instead of hard-coding 100
vector_size = model.wv.vector_size

for tweet in tweets:
    tweet_vector = np.zeros(vector_size)
    for word in tweet:
        # out-of-vocabulary tokens contribute nothing to the sum
        if word in words:
            tweet_vector += model.wv[word]
    # NOTE: the divisor is the full token count (out-of-vocabulary tokens
    # included), unchanged from the original so the features stay the same.
    # An empty tweet keeps the all-zero vector, avoiding a division by zero.
    if tweet:
        tweet_vector = tweet_vector / len(tweet)
    doc_vector.append(tweet_vector)

len(doc_vector), len(doc_vector[0])

(28614, 100)

In [11]:
# turn the list of document vectors into a dataframe (one row per tweet)
# and attach the class labels as an extra column in the same step
df_w2v = pd.DataFrame(doc_vector).assign(
    cyberbullying_type=df['cyberbullying_type']
)

display(df_w2v.shape)
df_w2v.head()

(28614, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,cyberbullying_type
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025,age
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668,not_cyberbullying
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698,age
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416,ethnicity
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936,other_cyberbullying
