In [1]:
import sys
# Put the shared scripts directory at the front of the module search path so
# that project-local helper modules can be imported by later cells
# (presumably this is where word2vec_utils lives - confirm against the repo layout).
sys.path.insert(0, '../../scripts/')

In [2]:
# import required packages
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# local scripts
from word2vec_utils import transform

In [3]:
# read the training split from disk
df_train = pd.read_csv('../../data/train_data.csv')

# (rows, columns) of the frame, followed by the number of tweets per class
display(
    df_train.shape,
    df_train.cyberbullying_type.value_counts(),
)

# preview the first 5 datapoints
df_train.head(5)

(28614, 2)

gender                 4795
not_cyberbullying      4792
religion               4791
age                    4786
ethnicity              4766
other_cyberbullying    4684
Name: cyberbullying_type, dtype: int64

Unnamed: 0,clean_tweet,cyberbullying_type
0,hate ppl high school used bully hot omg love m...,age
1,kat andre asshole omg mkr,not_cyberbullying
2,new access trading cause need high level opini...,age
3,fuck david duke racist think america belong du...,ethnicity
4,may say lot hate apologetic army hope choke ev...,other_cyberbullying


In [4]:
# split the training frame into the text feature and the class label
X_train, Y_train = df_train['clean_tweet'], df_train['cyberbullying_type']

print(X_train.shape, Y_train.shape)

# independent feature: convert the tweets into their word2vec
# representation (document matrix); the second value returned by
# transform is not used in this cell
X_train_w2v, _ = transform(
    corpus=X_train,
    model_load_path='../../models/word2vec.model',
)

# document matrix: (rows, columns)
display(X_train_w2v.shape)

# document matrix: first 5 datapoints
display(X_train_w2v.head())

# dependent feature: fit a label encoder on the training classes and keep
# the resulting integer codes in a one-column dataframe
le = LabelEncoder()
Y_train_le = pd.DataFrame(
    {'encoded_cyberbullying_type': le.fit_transform(Y_train)}
)

Y_train_le.head()

(28614,) (28614,)


(28614, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.237988,0.411318,0.153711,0.38292,0.262672,-1.023562,0.585043,1.101426,-0.876988,-0.330105,...,0.506708,0.294901,0.098529,0.47323,1.009498,0.419733,-0.015129,-0.419448,0.251839,-0.21025
1,-0.144806,0.617245,0.748015,0.834076,0.283397,-1.274383,0.15688,1.549933,-0.290592,-0.27923,...,0.257129,-0.316109,0.147053,0.433169,0.932522,-0.024955,0.404652,-0.522447,-0.42257,-0.535668
2,-0.146266,0.386033,0.010834,0.229401,0.296175,-0.986068,0.512315,1.086754,-0.742112,-0.268087,...,0.496192,0.360697,0.038448,0.503674,0.955136,0.387838,-0.007955,-0.412584,0.386322,-0.276698
3,-0.010017,0.101545,0.185687,-0.089176,-0.120168,-1.224311,0.320136,1.062363,-0.440994,-0.509981,...,0.622101,0.199142,0.278352,0.094113,0.765693,0.206304,0.292828,-0.48187,-0.140193,-0.005416
4,-0.050589,0.187251,0.080036,0.021537,0.001453,-0.789317,0.276012,0.801677,-0.326183,-0.302832,...,0.446736,0.250055,0.083603,0.199339,0.573685,0.278995,0.017548,-0.340371,0.115137,-0.216936


Unnamed: 0,encoded_cyberbullying_type
0,0
1,3
2,0
3,1
4,4


In [5]:
# lookup table pairing every original class label with its integer code
map_labels = pd.DataFrame(
    dict(
        labels=le.classes_,
        encoded_labels=le.transform(le.classes_),
    )
)
display(map_labels)

Unnamed: 0,labels,encoded_labels
0,age,0
1,ethnicity,1
2,gender,2
3,not_cyberbullying,3
4,other_cyberbullying,4
5,religion,5


In [6]:
# read the validation split from disk
df_valid = pd.read_csv('../../data/valid_data.csv')

# (rows, columns) of the frame, followed by the number of tweets per class
display(
    df_valid.shape,
    df_valid.cyberbullying_type.value_counts(),
)

# preview the first 5 datapoints
df_valid.head(5)

(9539, 2)

gender                 1647
religion               1641
age                    1603
ethnicity              1592
not_cyberbullying      1529
other_cyberbullying    1527
Name: cyberbullying_type, dtype: int64

Unnamed: 0,tweet_text,cyberbullying_type
0,"Fucking Slut ""@CallMeKatiee__ DUMB BITCH. ""@__...",ethnicity
1,@TheRealJacquet it's not a fucking excuse it's...,ethnicity
2,@iamyaokhari Men HATE getting the last word. T...,gender
3,@sibbysoyabean I have several strands of pearl...,not_cyberbullying
4,@AshForSyria @TheMoeDee @RazanSpeaks Or are yo...,not_cyberbullying


In [7]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# english stopword list provided by nltk
stop_words = stopwords.words('english')

# word lemmatizer used by the cleaning step
lemma = WordNetLemmatizer()

# total number of stopwords, then the first 10 of them (one per output line)
print(len(stop_words), stop_words[:10], sep='\n')

## cleaning and preprocessing applied to a single tweet
def clean_text(text):
    """Return a cleaned, space-separated version of ``text``.

    Steps, in order:
      1. cast to ``str`` and lowercase
      2. replace links (anything starting with ``http``) with a space
      3. replace every character that is not an ASCII letter with a space
      4. tokenize
      5. drop tokens found in ``stop_words``
      6. lemmatize the remaining tokens
      7. drop lemmas shorter than 3 characters
    """
    lowered = str(text).lower()
    without_links = re.sub(r'http\S+', ' ', lowered)
    letters_only = re.sub('[^a-zA-Z]', ' ', without_links)

    # stopwords are filtered before lemmatization; the length filter
    # runs on the lemmatized form
    lemmas = (
        lemma.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )
    return ' '.join(word for word in lemmas if len(word) > 2)

# run the cleaning pipeline over every raw tweet of the validation set
df_valid['clean_tweet'] = df_valid['tweet_text'].map(clean_text)

# raw text next to its cleaned version, first 5 rows
df_valid[['tweet_text', 'clean_tweet']].head()

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


Unnamed: 0,tweet_text,clean_tweet
0,"Fucking Slut ""@CallMeKatiee__ DUMB BITCH. ""@__...",fucking slut callmekatiee dumb bitch chelssss ...
1,@TheRealJacquet it's not a fucking excuse it's...,therealjacquet fucking excuse fact look line b...
2,@iamyaokhari Men HATE getting the last word. T...,iamyaokhari men hate getting last word follow ...
3,@sibbysoyabean I have several strands of pearl...,sibbysoyabean several strand pearl love
4,@AshForSyria @TheMoeDee @RazanSpeaks Or are yo...,ashforsyria themoedee razanspeaks whining whin...


In [8]:
# separate and encode independent and dependent variables of validation set
X_valid = df_valid.clean_tweet
Y_valid = df_valid.cyberbullying_type

print(X_valid.shape, Y_valid.shape)

# encode independent feature: X_valid
# convert into word2vec representation(document matrix) using the same
# saved word2vec model file that was used for the train set
X_valid_w2v, _ = transform(corpus=X_valid, model_load_path='../../models/word2vec.model')

# shape of document matrix: (rows, columns)
display(X_valid_w2v.shape)

# first 5 datapoints
display(X_valid_w2v.head())

# label-encode dependent feature: Y_valid
# Reuse the label encoder `le` that was fitted on the TRAIN labels instead of
# fitting a fresh one on the validation labels. Refitting would overwrite the
# train-fitted encoder and, whenever the validation label set differs from
# the training one, silently assign different integer codes to the same
# class. With transform(), a label unseen during training raises a
# ValueError instead of being mis-encoded.
Y_valid_le = pd.DataFrame(le.transform(Y_valid), columns=['encoded_cyberbullying_type'])

Y_valid_le.head()

(9539,) (9539,)


(9539, 100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.320946,0.274817,0.591292,0.021929,-0.194136,-1.226122,0.374527,1.177079,-0.412008,-0.623276,...,0.776594,0.086303,0.248456,-0.088654,0.864108,0.197663,0.288792,-0.605877,-0.319431,0.311824
1,-0.24368,0.274648,0.492268,0.275656,-0.056358,-1.365305,0.483564,1.215446,-0.624377,-0.526777,...,0.658706,0.117993,0.33661,0.193765,0.942633,0.371835,0.286223,-0.522543,-0.259851,0.033777
2,-0.097557,0.191508,0.01953,-0.191714,-0.024607,-0.813744,0.279741,0.818382,-0.371429,-0.387706,...,0.551212,0.345961,0.050749,0.118106,0.625708,0.248799,0.017506,-0.408042,0.18723,-0.047362
3,-0.00258,0.068588,-0.027981,0.02996,0.063804,-0.280458,0.100151,0.36712,-0.142564,-0.051717,...,0.189381,0.139178,-0.029333,0.141998,0.228474,0.116125,-0.064738,-0.162721,0.141677,-0.098699
4,-0.031537,0.086399,0.019493,-0.000666,0.00726,-0.335409,0.077078,0.363412,-0.140412,-0.114818,...,0.162653,0.132741,0.008963,0.105125,0.243877,0.115774,-0.000438,-0.165261,0.116367,-0.105382


Unnamed: 0,encoded_cyberbullying_type
0,1
1,1
2,2
3,3
4,3


In [9]:
# train and evaluate 3 models: NaiveBayes, SVM and XGBoost
