In [1]:
# import required packages
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# load the training tweets from CSV into a dataframe
train_csv = '../../data/train_data.csv'
df = pd.read_csv(train_csv)

# report (rows, columns), then preview the first ten records
display(df.shape)
df.head(10)

(28614, 2)

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying
5,breanichole rapeisntokay made gay joke rape jo...,gender
6,hindustan time report gay rape joke taken obje...,gender
7,willy real man know able cook one attractive q...,gender
8,notallmen evolving notallmen evolved blameonen...,gender
9,anirban stupid mamata regime hundred bjp worke...,religion


In [3]:
# count the missing tweets before filling them
display(df['clean_tweet'].isna().sum())

# substitute an empty string for every missing tweet
df['clean_tweet'] = df['clean_tweet'].fillna('')

# confirm that no missing tweets remain
display(df['clean_tweet'].isna().sum())

90

0

In [4]:
# word2vec consumes lists of tokens, so tokenize every tweet
tweets = df['clean_tweet'].map(simple_preprocess)

# preview the first tokenized tweets
tweets.head()

0    [hate, ppl, high, school, used, bully, hot, om...
1                      [kat, andre, asshole, omg, mkr]
2    [new, access, trading, cause, need, high, leve...
3    [fuck, david, duke, racist, think, america, be...
4    [may, say, lot, hate, apologetic, army, hope, ...
Name: clean_tweet, dtype: object

In [5]:
"""
Not training a new word2vec model.
A model has already been trained and is loaded for further usage.
Uncomment the code block to train and save a new model.
"""
# NOTE: the string above is the only live expression in this cell, so the
# notebook echoes it back as the cell output; every line below is disabled.

# spawn a Word2Vec model (context window of 5, min_count of 2)
# model = Word2Vec(window=5, min_count=2)

# build vocabulary from entire corpus (the tokenized `tweets`)
# model.build_vocab(tweets, progress_per=1000)

# train the word2vec on the same tokenized tweets for 5 epochs
# model.train(tweets, total_examples=model.corpus_count, epochs=5)

# save the model
# commented to avoid overwriting the trained model
# (this is the same path the next cell loads the model from)
# model.save("../../models/word2vec.model")

'\nNot training a new word2vec model.\nA model has already been trained and is loaded for further usage.\nUncomment the code block to train and save a new model.\n'

In [6]:
# restore the previously trained word2vec model from disk
model_path = '../../models/word2vec.model'
model = Word2Vec.load(model_path)

In [7]:
# show the ten nearest neighbours of each query word,
# first for 'bully' and then for 'dumb'
for query_word in ('bully', 'dumb'):
    display(model.wv.most_similar(query_word))

[('middle', 0.965322732925415),
 ('elementary', 0.9573146104812622),
 ('high', 0.9516770839691162),
 ('graduation', 0.9479526877403259),
 ('relentlessly', 0.9460520148277283),
 ('confrontation', 0.9452431201934814),
 ('grade', 0.9413620829582214),
 ('teased', 0.9374991059303284),
 ('teacher', 0.9353218674659729),
 ('tormentor', 0.9342045187950134)]

[('trayvon', 0.9649240970611572),
 ('shut', 0.9571725130081177),
 ('goshawty', 0.9547345042228699),
 ('sayin', 0.9497713446617126),
 ('ignorant', 0.9482175707817078),
 ('stupid', 0.9463467001914978),
 ('spic', 0.9458626508712769),
 ('beaner', 0.9441934823989868),
 ('nigga', 0.9439464807510376),
 ('subban', 0.9413833022117615)]

In [8]:
# cosine similarity between the vectors of a single word pair
pair_similarity = model.wv.similarity('bully', 'teased')
display(pair_similarity)

0.93749917

In [9]:
# collect every vocabulary word once, then look up its vector;
# the words become the row labels of the resulting dataframe
vocabulary = list(model.wv.key_to_index)
vector_rows = [model.wv.get_vector(str(token)) for token in vocabulary]
word_vectors = pd.DataFrame(vector_rows, index=vocabulary)

# report (words, dimensions), then show ten randomly drawn rows
display(word_vectors.shape)
word_vectors.sample(10)

(14964, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
fiving,-0.00418,0.011004,0.008864,0.030317,0.010306,-0.040311,0.005436,0.032164,-0.017889,-0.017366,...,0.014288,0.011613,0.010849,0.02316,0.03284,0.003168,-0.000905,-0.024694,0.0128,-0.018908
inclusive,-0.008973,0.017072,0.026882,0.002646,0.002169,-0.135618,0.014574,0.132379,-0.040094,-0.03993,...,0.066306,0.0475,-0.002961,0.029912,0.095546,0.033747,0.013989,-0.073101,0.017821,-0.043935
thegeek,-0.024262,0.030241,0.031196,0.020661,0.001705,-0.075843,0.014947,0.095824,-0.015683,-0.029006,...,0.023302,0.008798,0.012101,0.01175,0.04922,0.018428,0.006829,-0.037559,0.016661,-0.014965
instruction,-0.018524,0.026947,0.024693,0.018911,-0.005835,-0.064576,0.019627,0.075655,-0.018632,-0.026387,...,0.024665,0.017041,-0.003775,0.018352,0.050005,0.015278,-0.009907,-0.025241,0.02254,-0.019569
colin,-0.099874,0.301789,0.227966,0.2326,0.103665,-0.711918,0.0928,0.856416,-0.213289,-0.163051,...,0.217306,0.063866,0.019978,0.255847,0.524943,0.084402,0.113449,-0.325788,0.028994,-0.272696
stalked,-0.01709,0.024763,0.011745,0.00752,0.005819,-0.078474,0.008721,0.076405,-0.028536,-0.033842,...,0.03638,0.018124,0.004675,0.022091,0.05958,0.029134,0.011851,-0.030349,0.020929,-0.027264
mena,0.004329,-0.002214,0.008523,-0.000954,-0.000661,-0.013937,-0.010625,0.007285,0.007508,-0.002357,...,0.00292,0.009965,-0.000119,-0.001533,-0.007548,0.009852,-0.010528,-0.000418,0.015278,-0.009482
symbo,-0.022434,0.026545,0.020067,0.02342,-0.003905,-0.082668,0.007425,0.091665,-0.032794,-0.037276,...,0.039163,0.012248,0.010094,0.016206,0.068377,0.026077,0.000694,-0.047634,0.002508,-0.033175
brittney,-0.001988,0.00332,0.018347,0.014833,-0.013084,-0.029538,0.015529,0.025772,0.005558,-0.017013,...,0.003783,0.013767,0.004899,0.016327,0.014118,0.00504,0.005804,-0.011509,-0.001157,-0.014647
diy,-0.012129,0.009449,0.004589,-0.005549,0.00282,-0.029901,0.004236,0.022592,-0.003955,-0.004976,...,0.011821,0.004138,-0.00288,0.001116,0.022141,0.002177,-0.01004,-0.012927,0.009266,0.002957


In [10]:
# generate one document vector per tweet from its word vectors
document_vectors = []

# in-vocabulary words as a set: membership tests are constant time here,
# whereas testing against the model.wv.index_to_key list scans the whole
# vocabulary for every token of every tweet
words = set(model.wv.index_to_key)

# read the embedding width from the model instead of hard-coding it
vector_size = model.wv.vector_size

for tweet in tweets:
    # accumulator for the sum of the tweet's word vectors;
    # it stays all-zero when the tweet has no tokens
    tweet_vector = np.zeros(vector_size)
    for word in tweet:
        # words the model does not know have no vector and are skipped
        if word in words:
            tweet_vector += model.wv[word]
    # divide by the full token count; empty tweets keep the zero vector
    # and so avoid a division by zero
    # NOTE(review): skipped out-of-vocabulary words still count in the
    # divisor, which shrinks vectors of tweets containing them - confirm
    # that this is intended
    if tweet:
        tweet_vector = tweet_vector / len(tweet)
    document_vectors.append(tweet_vector)

# (number of documents, dimensions per document)
len(document_vectors), len(document_vectors[0])

(28614, 100)

In [11]:
# stack the document vectors into a dataframe and attach
# the class label of every tweet as the last column
document_matrix = pd.DataFrame(document_vectors).assign(
    cyberbullying_type=df['cyberbullying_type']
)

# report the shape, then preview the first rows
display(document_matrix.shape)
document_matrix.head()

(28614, 101)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,cyberbullying_type
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025,age
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668,not_cyberbullying
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698,age
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416,ethnicity
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936,other_cyberbullying
