In [1]:
import nltk
# Fetch the 'punkt' NLTK resource into the local nltk_data directory.
# In the recorded run the package was already up to date and the call returned True.
# NOTE(review): presumably required by the tokenisation inside text_utils.preprocess_corpus — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NganLuong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
import sys

# Put the shared scripts folder at the front of the module search path so the
# local helper modules imported further down can be resolved. The path is
# relative, so it is interpreted against the notebook's working directory.
sys.path[:0] = ['../../scripts/']

In [2]:
# import required packages
import numpy as np
import pandas as pd

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# metrics
import time
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# local scripts
from text_utils import preprocess_corpus

In [3]:
# Read the training and validation splits (comma-separated, pandas defaults).
df_train = pd.read_csv('../../data/train_data.csv')
df_valid = pd.read_csv('../../data/valid_data.csv')

# Only the validation tweets are cleaned here; the training frame's
# `clean_tweet` column is read directly below, so it must already be
# present in train_data.csv.
df_valid['clean_tweet'] = preprocess_corpus(df_valid['tweet_text'])

# Features are the cleaned tweet text, targets are the cyberbullying category.
Xtr = df_train["clean_tweet"]
Ytr = df_train["cyberbullying_type"]
Xva = df_valid["clean_tweet"]
Yva = df_valid["cyberbullying_type"]

In [4]:
# Peek at five randomly chosen cleaned training tweets.
Xtr.sample(n=5)

23764    although smollet lie awful rare incident relat...
14047    camlafontaine put woman room lawwwwd know coul...
1474     size girl bullied ugly girl high school belly ...
12371    ever call ugly got female stocking life findin...
14481                           ashleylynch join oapi site
Name: clean_tweet, dtype: object

In [5]:
# Peek at five randomly chosen training labels.
Ytr.sample(n=5)

1253             ethnicity
25867                  age
14677    not_cyberbullying
3787              religion
24428             religion
Name: cyberbullying_type, dtype: object

In [6]:
# Peek at five randomly chosen cleaned validation tweets.
Xva.sample(n=5)

3263    remember hot girl bullied high school cry ecst...
5298    kirindave player love make dig wow old wow old...
2162    charsibhangi gawd still people talking pay dat...
3836           see said barack hussein never muslim idiot
9433    sarahschwartz woman died domestic violence las...
Name: clean_tweet, dtype: object

In [7]:
# Peek at five randomly chosen validation labels.
Yva.sample(n=5)

2778          age
6311          age
3139    ethnicity
763     ethnicity
881     ethnicity
Name: cyberbullying_type, dtype: object

In [8]:
# Vectorize tweet texts with TF-IDF. The vocabulary is learned from the
# training tweets only and then reused, unchanged, for the validation tweets.
tv = TfidfVectorizer()

# Replace missing tweets with '' before the unicode cast: casting NaN directly
# with astype('U') yields the literal string 'nan', which the vectorizer would
# then learn as a vocabulary term.
vectors = tv.fit_transform(Xtr.fillna('').values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = tv.get_feature_names_out()
else:
    feature_names = tv.get_feature_names()

# NOTE: .toarray() densifies the TF-IDF matrix, which is memory-hungry for a
# vocabulary of this size; the downstream cells consume these dense frames.
Xtr_tv = pd.DataFrame(vectors.toarray(), columns=feature_names)

vectors = tv.transform(Xva.fillna('').values.astype('U'))
Xva_tv = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [9]:
# Turn the string class labels into integer codes. The encoder is fitted on
# the training labels and then reused for the validation labels, so both
# splits share a single label -> code mapping.
le = LabelEncoder()

Ytr_le = pd.DataFrame({'encoded_cyberbullying_type': le.fit_transform(Ytr)})
Yva_le = pd.DataFrame({'encoded_cyberbullying_type': le.transform(Yva)})

In [10]:
# Sanity check: print feature/label shapes for the training split, then the
# validation split. Row counts should agree within a split and the feature
# count should agree across splits.
for features, labels in ((Xtr_tv, Ytr_le), (Xva_tv, Yva_le)):
    print(features.shape, labels.shape)

(28614, 36281) (28614, 1)
(9539, 36281) (9539, 1)


In [11]:
# Peek at five random rows of the dense TF-IDF training matrix.
Xtr_tv.sample(n=5)

Unnamed: 0,aaa,aaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaah,aaaaaaaaaah,aaaaaaaaaajajajajajajajahahahajahaja,aaaaah,aaaaargh,aaaag,aaaah,...,zusterschap,zvakaoma,zvlahos,zyampii,zyeth,zyme,zynga,zython,zzoegrimm,zzz
7647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Peek at five random integer-encoded training labels.
Ytr_le.sample(n=5)

Unnamed: 0,encoded_cyberbullying_type
12152,5
22160,5
8568,0
17549,3
6771,5


In [13]:
# Fixed seed so the estimators that use randomness while fitting give
# repeatable results across kernel restarts.
SEED = 42

# Candidate classifiers, keyed by the display name used in the result tables.
# Insertion order matters: the evaluation cells pick models by position.
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': LinearSVC(random_state=SEED),
    'Kernel SVM': SVC(),
    # This is scikit-learn's gradient boosting, not the separate XGBoost
    # library, so it is labelled accordingly.
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED)
}

# Per-model metrics; each evaluation cell appends one entry to every list,
# so entries are in the order the models were evaluated.
results = {
    'time_to_train': [],
    'accuracy': [],
    'f1': []
}

In [15]:
# Fit the first candidate in `models` and score it on the validation split.
# NOTE: this cell appends to `results`; re-running it without first
# re-initialising `results` adds a duplicate entry and misaligns the tables.
name, model = list(models.items())[0]
print("Running model", name)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() is the wall clock and can jump if the system time
# is adjusted. Only the fit call is timed.
start_time = time.perf_counter()

# ravel() flattens the single-column label frame into the 1-D vector that is
# passed to fit.
model.fit(Xtr_tv, Ytr_le.values.ravel())

end_time = time.perf_counter()

Ypred = model.predict(Xva_tv)

results['time_to_train'].append(end_time - start_time)
results['accuracy'].append(accuracy_score(Yva_le, Ypred))
# average=None keeps one F1 score per class instead of a single aggregate.
results['f1'].append(f1_score(Yva_le, Ypred, average=None))

Running model Gaussian Naive Bayes


In [16]:
# Fit the second candidate in `models` and score it on the validation split.
# NOTE: this cell appends to `results`; re-running it without first
# re-initialising `results` adds a duplicate entry and misaligns the tables.
name, model = list(models.items())[1]
print("Running model", name)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() is the wall clock and can jump if the system time
# is adjusted. Only the fit call is timed.
start_time = time.perf_counter()

# ravel() flattens the single-column label frame into the 1-D vector that is
# passed to fit.
model.fit(Xtr_tv, Ytr_le.values.ravel())

end_time = time.perf_counter()

Ypred = model.predict(Xva_tv)

results['time_to_train'].append(end_time - start_time)
results['accuracy'].append(accuracy_score(Yva_le, Ypred))
# average=None keeps one F1 score per class instead of a single aggregate.
results['f1'].append(f1_score(Yva_le, Ypred, average=None))

Running model Linear SVM


In [None]:
# Fit the third candidate in `models` and score it on the validation split.
# NOTE: this cell appends to `results`; re-running it without first
# re-initialising `results` adds a duplicate entry and misaligns the tables.
name, model = list(models.items())[2]
print("Running model", name)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() is the wall clock and can jump if the system time
# is adjusted. Only the fit call is timed.
start_time = time.perf_counter()

# ravel() flattens the single-column label frame into the 1-D vector that is
# passed to fit.
model.fit(Xtr_tv, Ytr_le.values.ravel())

end_time = time.perf_counter()

Ypred = model.predict(Xva_tv)

results['time_to_train'].append(end_time - start_time)
results['accuracy'].append(accuracy_score(Yva_le, Ypred))
# average=None keeps one F1 score per class instead of a single aggregate.
results['f1'].append(f1_score(Yva_le, Ypred, average=None))

Running model Kernel SVM


In [19]:
# Summarise the metrics of every model evaluated so far.
# The row/column labels are taken from as many leading entries of `models` as
# there are recorded results, instead of a hard-coded count: with a fixed
# slice the DataFrame constructors raise a shape mismatch as soon as another
# model has been evaluated. This assumes models were run in dict order.
evaluated = list(models.keys())[:len(results['accuracy'])]

display(pd.DataFrame(results['time_to_train'], index=evaluated, columns=['Time (seconds)']))

display(pd.DataFrame(results['accuracy'], index=evaluated, columns=['Accuracy']))

# Each f1 entry holds one score per class; transpose so rows are classes
# (named via the fitted label encoder) and columns are models.
display(pd.DataFrame(np.asarray(results['f1']).T, columns=evaluated, index=le.classes_))

Unnamed: 0,Time (seconds)
Gaussian Naive Bayes,47.486894
Linear SVM,8.633816


Unnamed: 0,Accuracy
Gaussian Naive Bayes,0.472901
Linear SVM,0.822099


Unnamed: 0,Gaussian Naive Bayes,Linear SVM
age,0.46077,0.95586
ethnicity,0.529801,0.974774
gender,0.408263,0.874381
not_cyberbullying,0.279797,0.549191
other_cyberbullying,0.47657,0.590761
religion,0.639565,0.949819
