In [1]:
# import required packages
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# load the training tweets from disk into a dataframe
train_csv = '../../data/train_data.csv'
df = pd.read_csv(train_csv)

# show the (rows, columns) dimensions, then preview the first ten rows
display(df.shape)
df.head(n=10)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying
5,breanichole rapeisntokay made gay joke rape jo...,gender
6,hindustan time report gay rape joke taken obje...,gender
7,willy real man know able cook one attractive q...,gender
8,notallmen evolving notallmen evolved blameonen...,gender
9,anirban stupid mamata regime hundred bjp worke...,religion


In [3]:
# count the missing tweets before imputing
display(df['clean_tweet'].isna().sum())

# substitute an empty string for every missing tweet
df['clean_tweet'] = df['clean_tweet'].fillna(value='')

# count again to confirm that no missing tweets remain
display(df['clean_tweet'].isna().sum())

90

0

In [4]:
# word2vec consumes each document as a list of tokens,
# so run every tweet through gensim's simple_preprocess tokenizer
tweets = df['clean_tweet'].apply(simple_preprocess)

tweets.head(5)

0    [hate, ppl, high, school, used, bully, hot, om...
1                      [kat, andre, asshole, omg, mkr]
2    [new, access, trading, cause, need, high, leve...
3    [fuck, david, duke, racist, think, america, be...
4    [may, say, lot, hate, apologetic, army, hope, ...
Name: clean_tweet, dtype: object

In [5]:
# A word2vec model has already been trained and saved; it is loaded from disk
# for further usage, so by default this cell does nothing.
# Set TRAIN_NEW_MODEL = True to train a fresh model on `tweets` and save it.
# WARNING: saving overwrites the existing file at ../../models/word2vec.model.
TRAIN_NEW_MODEL = False

if TRAIN_NEW_MODEL:
    # spawn a Word2Vec model
    model = Word2Vec(window=5, min_count=2)

    # build vocabulary from entire corpus
    model.build_vocab(tweets, progress_per=1000)

    # train the word2vec
    model.train(tweets, total_examples=model.corpus_count, epochs=5)

    # save the model (replaces the previously trained model on disk)
    model.save("../../models/word2vec.model")

In [6]:
# restore the previously trained word2vec model from disk
saved_model_path = '../../models/word2vec.model'
model = Word2Vec.load(saved_model_path)

In [7]:
# show the top 10 most similar words for each query word,
# first for 'bully' and then for 'dumb'
for query_word in ('bully', 'dumb'):
    display(model.wv.most_similar(query_word))

[('middle', 0.965322732925415),
 ('elementary', 0.9573146104812622),
 ('high', 0.9516770839691162),
 ('graduation', 0.9479526877403259),
 ('relentlessly', 0.9460520148277283),
 ('confrontation', 0.9452431201934814),
 ('grade', 0.9413620829582214),
 ('teased', 0.9374991059303284),
 ('teacher', 0.9353218674659729),
 ('tormentor', 0.9342045187950134)]

[('trayvon', 0.9649240970611572),
 ('shut', 0.9571725130081177),
 ('goshawty', 0.9547345042228699),
 ('sayin', 0.9497713446617126),
 ('ignorant', 0.9482175707817078),
 ('stupid', 0.9463467001914978),
 ('spic', 0.9458626508712769),
 ('beaner', 0.9441934823989868),
 ('nigga', 0.9439464807510376),
 ('subban', 0.9413833022117615)]

In [8]:
# cosine similarity between the vectors of a pair of words
bully_teased_similarity = model.wv.similarity(w1='bully', w2='teased')
display(bully_teased_similarity)

0.93749917

In [9]:
# build a dataframe with one row per vocabulary word and one column per
# embedding dimension, indexed by the word itself
vocabulary = list(model.wv.key_to_index)
vector_rows = [model.wv.get_vector(str(token)) for token in vocabulary]
word_vectors = pd.DataFrame(vector_rows, index=vocabulary)

# report the dimensions, then preview ten randomly chosen words
display(word_vectors.shape)
word_vectors.sample(n=10)

(14964, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
swim,-0.016192,0.05097,0.038516,0.031989,0.006755,-0.164538,0.023973,0.169723,-0.050496,-0.062756,...,0.049447,0.039125,-0.003252,0.063031,0.125179,0.041714,0.014036,-0.067356,0.026716,-0.063782
cranboonitz,-0.010225,0.020895,0.006692,0.011847,0.010744,-0.082252,0.005391,0.082809,-0.036677,-0.028743,...,0.034996,0.030146,-0.004744,0.025941,0.064958,0.028077,-0.001096,-0.035168,0.03612,-0.042367
incoherent,-0.007473,0.00876,0.002317,0.009725,-0.00915,-0.035808,0.003866,0.032312,-0.014842,-0.013401,...,0.006639,0.013164,-0.012924,0.014233,0.011315,0.005745,-0.007297,-0.012274,0.012156,-0.031773
succeeding,-0.009584,0.019142,0.012872,0.018994,-0.001779,-0.085978,0.003166,0.083965,-0.021903,-0.017847,...,0.023065,0.011557,0.002887,0.026443,0.049353,0.012399,0.004975,-0.036715,0.016611,-0.036695
generalisation,-0.017711,0.016516,-0.012281,-0.013805,-0.009014,-0.038469,-0.011118,0.037006,-0.001551,-0.005401,...,0.019917,0.017186,-0.005662,-0.004786,0.017323,0.009752,0.01032,-0.031333,0.013886,-0.00787
kent,-0.017487,0.019351,0.021287,0.017192,0.002861,-0.068852,0.018296,0.073776,-0.015681,-0.033791,...,0.02981,0.017921,-0.00446,0.030831,0.055547,0.022893,-0.003331,-0.043318,0.017207,-0.016921
recover,-0.02367,0.038805,0.020255,0.02443,-0.004227,-0.100927,0.012175,0.113186,-0.02591,-0.040676,...,0.044634,0.039185,-0.001975,0.021925,0.062281,0.02901,0.004452,-0.048802,0.015675,-0.026631
crazed,-0.005717,0.011034,0.009325,0.001767,0.005527,-0.064414,0.008706,0.056345,-0.01325,-0.023054,...,0.019664,0.014016,-0.01019,0.018581,0.030296,0.013156,0.014322,-0.037855,0.006867,-0.011776
coexistence,-0.013657,0.020463,0.007536,-0.000164,-0.010045,-0.08834,-0.003483,0.094027,-0.013458,-0.02287,...,0.023668,0.033182,-0.009076,0.023085,0.059465,0.028899,-0.005369,-0.038665,0.030819,-0.032433
secondlady,-5.8e-05,-0.003906,-0.000442,0.009225,0.000936,-0.033619,0.005092,0.038287,-0.012577,-0.006833,...,0.033234,0.012307,-0.016917,0.002611,0.004178,0.021932,-0.021543,-0.022388,0.014168,-0.016251


In [10]:
# generate document matrix from word vectors
# Each tweet becomes one fixed-length vector: the sum of the vectors of its
# in-vocabulary tokens divided by the tweet's total token count.
# NOTE: the divisor counts out-of-vocabulary tokens too, so tweets with many
# unknown words are scaled towards zero; this matches the existing features.
document_matrix = []

# set membership is O(1); testing against the index_to_key list would scan
# the whole vocabulary for every token
words = set(model.wv.index_to_key)

# take the embedding width from the model instead of hard-coding it
vector_size = model.wv.vector_size

for tweet in tweets:
    tweet_vector = np.zeros(vector_size)
    for word in tweet:
        if word in words:
            tweet_vector += model.wv[word]
    # an empty tweet keeps the all-zero vector (avoids division by zero)
    if tweet:
        tweet_vector = tweet_vector / len(tweet)
    document_matrix.append(tweet_vector)

# (number of tweets, vector length)
len(document_matrix), len(document_matrix[0])

(28614, 100)

In [11]:
# turn the list of tweet vectors into a dataframe (one row per tweet) and
# append the class labels as the last column
df_w2v = pd.DataFrame(document_matrix).assign(
    cyberbullying_type=df['cyberbullying_type']
)

# report the dimensions, then preview the first five rows
display(df_w2v.shape)
df_w2v.head(5)

(28614, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,cyberbullying_type
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025,age
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668,not_cyberbullying
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698,age
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416,ethnicity
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936,other_cyberbullying
