In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# Read the pre-cleaned training tweets and their class labels.
df = pd.read_csv(
    '../../data/train_data.csv',
)

# Report the frame's dimensions, then preview its first ten rows.
display(df.shape)
df.head(n=10)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying
5,breanichole rapeisntokay made gay joke rape jo...,gender
6,hindustan time report gay rape joke taken obje...,gender
7,willy real man know able cook one attractive q...,gender
8,notallmen evolving notallmen evolved blameonen...,gender
9,anirban stupid mamata regime hundred bjp worke...,religion


In [3]:
# How many tweets are missing before imputation?
display(df['clean_tweet'].isnull().sum())

# Missing tweets become empty strings so that every row can be tokenized.
df['clean_tweet'] = df['clean_tweet'].fillna('')

# Sanity check: the missing count should now be zero.
display(df['clean_tweet'].isnull().sum())

90

0

In [4]:
# Word2Vec consumes lists of tokens, so split every tweet with gensim's
# simple_preprocess. An empty tweet yields an empty token list.
tweets = df['clean_tweet'].map(simple_preprocess)

# Preview the first few token lists.
tweets.head()

0    [hate, ppl, high, school, used, bully, hot, om...
1                      [kat, andre, asshole, omg, mkr]
2    [new, access, trading, cause, need, high, leve...
3    [fuck, david, duke, racist, think, america, be...
4    [may, say, lot, hate, apologetic, army, hope, ...
Name: clean_tweet, dtype: object

In [5]:
# Hyper-parameters for the (not yet trained) Word2Vec model:
#   window=5    -> context window size
#   min_count=1 -> keep every token, including those seen only once
w2v_params = {'window': 5, 'min_count': 1}
model = Word2Vec(**w2v_params)

# Scan the whole tokenized corpus once to build the vocabulary,
# logging progress every 250 tweets.
model.build_vocab(tweets, progress_per=250)

In [6]:
# Fit the embeddings on the tokenized tweets for five passes over the corpus.
# total_examples reuses the sentence count recorded by build_vocab.
# The value returned by train() is shown as the cell output.
model.train(
    tweets,
    total_examples=model.corpus_count,
    epochs=5,
)

(1641456, 1817090)

In [7]:
# Persist the trained model to disk.
# The save call is intentionally left commented out so that re-running the
# notebook does not overwrite the model file that was saved previously.
# model.save("../../models/word2vec.model")

In [8]:
# Restore the previously saved model from disk. This rebinds `model`, so every
# cell below uses the saved embeddings rather than the ones trained above.
# NOTE: gensim model files are pickle-based; only load files you trust.
saved_model_path = '../../models/word2vec.model'
model = Word2Vec.load(saved_model_path)

In [9]:
# Qualitative check of the embeddings: for each probe word, show its nearest
# neighbours with their cosine similarity (most_similar's default count).
for query_word in ('bully', 'dumb'):
    display(model.wv.most_similar(query_word))

[('middle', 0.965322732925415),
 ('elementary', 0.9573146104812622),
 ('high', 0.9516770839691162),
 ('graduation', 0.9479526877403259),
 ('relentlessly', 0.9460520148277283),
 ('confrontation', 0.9452431201934814),
 ('grade', 0.9413620829582214),
 ('teased', 0.9374991059303284),
 ('teacher', 0.9353218674659729),
 ('tormentor', 0.9342045187950134)]

[('trayvon', 0.9649240970611572),
 ('shut', 0.9571725130081177),
 ('goshawty', 0.9547345042228699),
 ('sayin', 0.9497713446617126),
 ('ignorant', 0.9482175707817078),
 ('stupid', 0.9463467001914978),
 ('spic', 0.9458626508712769),
 ('beaner', 0.9441934823989868),
 ('nigga', 0.9439464807510376),
 ('subban', 0.9413833022117615)]

In [10]:
# Cosine similarity between the embeddings of two specific words.
bully_teased_similarity = model.wv.similarity(w1='bully', w2='teased')
display(bully_teased_similarity)

0.93749917

In [11]:
# Extract the word vectors from the model as a dataframe: one row per
# vocabulary word (used as the index), one column per embedding dimension.
# model.wv.vectors already holds the full embedding matrix, with rows in the
# same order as model.wv.index_to_key, so it is used directly instead of
# looking up every word one at a time. copy=True keeps the dataframe
# independent of the model's internal array.
word_vectors = pd.DataFrame(
    model.wv.vectors,
    index=model.wv.index_to_key,
    copy=True,
)

# Show the (vocabulary size, embedding dimension) shape and a random sample.
display(word_vectors.shape)
word_vectors.sample(10)

(14964, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
blatantly,-0.024535,0.037337,0.030147,-0.012739,-0.005555,-0.151342,0.024435,0.183432,-0.046623,-0.06242,...,0.077865,0.045794,0.002741,0.019044,0.093769,0.035687,-0.004631,-0.094735,0.026344,-0.037141
blacklisted,-0.022212,0.02298,0.030418,0.018838,0.005434,-0.061802,0.02836,0.063145,-0.041244,-0.029681,...,0.028725,0.017556,0.018304,0.018131,0.047438,0.019834,0.008928,-0.038672,-0.004162,-0.004904
minsan,-0.012402,0.015721,0.020421,0.007224,-1.1e-05,-0.038565,0.011739,0.044589,-0.013769,-0.012124,...,0.02331,0.011009,-0.010429,0.01622,0.027872,0.01802,-0.002111,-0.01976,0.006899,-0.02761
askhermore,-0.012201,0.000784,-0.003351,0.004143,-0.008806,-0.029281,-0.007255,0.039673,-0.008498,-0.019304,...,0.006134,0.018412,-0.001498,0.014883,0.027319,0.011288,-0.006388,-0.02301,0.004931,-0.015685
zine,-0.012743,0.028418,0.028008,0.017149,-0.00116,-0.085261,0.009382,0.091496,-0.035668,-0.025891,...,0.034601,0.012513,-0.009746,0.02771,0.064331,0.012616,-0.001191,-0.047457,0.018607,-0.035068
cosas,-0.013385,0.041991,0.042939,0.038043,-0.012277,-0.120518,0.025414,0.126569,-0.024706,-0.034135,...,0.045082,0.034534,0.012269,0.042562,0.067797,0.02806,0.014954,-0.050864,0.005808,-0.06054
pic,-0.423829,0.272058,0.675613,0.391416,0.021192,-1.439025,0.53303,1.44459,-0.575413,-0.455991,...,0.501921,-0.084448,0.419995,0.223096,1.105426,0.144225,0.664305,-0.66099,-0.38005,0.057064
stripper,-0.022017,0.025811,0.036794,0.006218,-0.001456,-0.111196,0.033168,0.112659,-0.032618,-0.048474,...,0.044368,0.015906,0.004929,0.001552,0.085282,0.02007,0.01677,-0.048152,-0.007704,-0.003941
suffered,-0.028863,0.041931,0.023248,0.000491,-0.004974,-0.097988,0.018211,0.094282,-0.055439,-0.034227,...,0.046813,0.028116,-0.009955,0.014933,0.078835,0.010524,0.020027,-0.058948,0.006075,-0.007941
squldz,-0.002897,0.024353,0.039078,0.020201,-0.00386,-0.051455,0.009222,0.054174,-0.009124,-0.022386,...,0.011709,0.001551,0.015386,0.010664,0.023728,0.014042,0.014695,-0.019509,-0.003891,-0.011061


In [12]:
# Generate document vectors from word vectors.
# Each tweet is represented by the sum of the vectors of its in-vocabulary
# tokens divided by the tweet's total token count. A tweet with no tokens
# stays an all-zero vector.
# NOTE(review): the divisor counts every token, including any that are missing
# from the model's vocabulary; that behaviour is preserved here on purpose.
doc_vector = []

# Set of known words: gives constant-time membership tests inside the loop
# (testing against the index_to_key list would scan it for every token).
words = set(model.wv.index_to_key)

# Take the embedding size from the model instead of hard-coding it.
vector_size = model.wv.vector_size

for tweet in tweets:
    tweet_vector = np.zeros(vector_size)
    for word in tweet:
        if word in words:
            tweet_vector += model.wv[word]
    # Guard against division by zero for empty tweets.
    if len(tweet) > 0:
        tweet_vector = tweet_vector / len(tweet)
    doc_vector.append(tweet_vector)

# (number of documents, embedding dimension)
len(doc_vector), len(doc_vector[0])

(28614, 100)

In [13]:
# Turn the list of document vectors into a dataframe (one row per tweet) and
# append the class label column taken from the original frame; the labels are
# aligned to the rows by index.
df_w2v = pd.DataFrame(doc_vector).assign(
    cyberbullying_type=df['cyberbullying_type'],
)

# Report the resulting shape and preview the first rows.
display(df_w2v.shape)
df_w2v.head()

(28614, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,cyberbullying_type
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025,age
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668,not_cyberbullying
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698,age
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416,ethnicity
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936,other_cyberbullying
