In [1]:
import os 
import time 
import pandas as pd 
import numpy as np
import re

In [2]:
old_data = pd.read_csv('cyberbullying_tweets.csv', encoding='ANSI')
old_data

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",NotHarassment
1,Why is #aussietv so white? #MKR #theblock #ImA...,NotHarassment
2,@XochitlSuckkks a classy whore? Or more red ve...,NotHarassment
3,"@Jason_Gio meh. :P thanks for the heads up, b...",NotHarassment
4,@RudhoeEnglish This is an ISIS account pretend...,NotHarassment
...,...,...
39864,"Black ppl aren't expected to do anything, depe...",Low
39865,Turner did not withhold his disappointment. Tu...,Low
39866,I swear to God. This dumb nigger bitch. I have...,Low
39867,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,Low


In [3]:
def get_text_list(filename, delim='\n'):
    texts = ''
    with open(filename, 'rb') as f:
        for line in f.readlines():
            l = line.strip() 
            if l != b'':
                texts += l.decode()
            else:
                texts += '\n'

    return [re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text.replace('\n', ' ').strip()) for text in texts.split(delim) if text.replace('\n', ' ').strip() != '' and len(text.replace('\n', ' ').strip().split(' ')) > 1]

In [4]:
low_list = get_text_list('low.txt', '\n\n')
medium_list = get_text_list('medium.txt')
high_list = get_text_list('high.txt')

df_l = pd.DataFrame({'tweet_text':low_list, 'cyberbullying_type':['Low' for x in range(len(low_list))]})
df_h = pd.DataFrame({'tweet_text':high_list, 'cyberbullying_type':['High' for x in range(len(high_list))]})
df_m = pd.DataFrame({'tweet_text':medium_list, 'cyberbullying_type':['Medium' for x in range(len(medium_list))]})
final_df = pd.concat([old_data, df_l, df_m, df_h]).reset_index(drop=True)
final_df

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",NotHarassment
1,Why is #aussietv so white? #MKR #theblock #ImA...,NotHarassment
2,@XochitlSuckkks a classy whore? Or more red ve...,NotHarassment
3,"@Jason_Gio meh. :P thanks for the heads up, b...",NotHarassment
4,@RudhoeEnglish This is an ISIS account pretend...,NotHarassment
...,...,...
40243,you turn me rock hard baby so hot lets fuck,High
40244,In case my sexy big ass stopped you from seein...,High
40245,Dm me hot sexy,High
40246,DM for leaked videos,High


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
X_all = vect.fit_transform(final_df['tweet_text'][:1000])

map_dict = {'High':3.0,'Medium':2.0,'Low':1.0,'NotHarassment':0.0}
labels = {3.0:'High',2.0:'Medium',1.0:'Low',0.0:'NotHarassment'}

def map(x):
    return map_dict[x]

y_all = final_df['cyberbullying_type'].apply(map).values[:1000]

In [6]:
from sklearn.model_selection import train_test_split

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.20, random_state=42)

In [61]:
from sklearn.svm import SVC

model_svc = SVC(kernel='linear')
model_svc.fit(X, y)

In [62]:
y_pred = model_svc.predict(X_test)

from sklearn.metrics import accuracy_score

print('Accuracy:', accuracy_score(y_pred, y_test)*100)

Accuracy: 93.35403726708074


In [63]:
from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier()
model_rfc.fit(X, y)

In [72]:
y_pred = model_rfc.predict(X_test)

print('Accuracy:', accuracy_score(y_pred, y_test)*100)

Accuracy: 91.46583850931677


In [65]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X, y)

In [73]:
y_pred = mlp.predict(X_test)

print('Accuracy:', accuracy_score(y_pred, y_test)*100)

Accuracy: 91.46583850931677


In [67]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(3)
knn.fit(X, y)

In [68]:
y_pred = knn.predict(X_test)

print('Accuracy:', accuracy_score(y_pred, y_test)*100)

Accuracy: 38.19875776397515


In [85]:
from sklearn.tree import DecisionTreeClassifier

# Compute error rate, alpha and w
def compute_error(y, y_pred, w_i):
    return (sum(w_i * (np.not_equal(y, y_pred)).astype(int)))/sum(w_i)

def compute_alpha(error):
    return np.log((1 - error) / error)

def update_weights(w_i, alpha, y, y_pred):
    return w_i * np.exp(alpha * (np.not_equal(y, y_pred)).astype(int))



class AdaBoost:
    
    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X, y, M = 100):
        
        # Clear before calling
        self.alphas = [] 
        self.training_errors = []
        self.M = M

        # Iterate over M weak classifiers
        for m in range(0, M):
            
            # Set weights for current boosting iteration
            if m == 0:
                w_i = np.ones(len(y)) * 1 / len(y)  # At m = 0, weights are all the same and equal to 1 / N
            else:
                # (d) Update w_i
                w_i = update_weights(w_i, alpha_m, y, y_pred)
            
            # (a) Fit weak classifier and predict labels
            G_m = DecisionTreeClassifier(max_depth = 1)     # Stump: Two terminal-node classification tree
            G_m.fit(X, y, sample_weight = w_i)
            y_pred = G_m.predict(X)
            
            self.G_M.append(G_m) # Save to list of weak classifiers

            # (b) Compute error
            error_m = compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)

            # (c) Compute alpha
            alpha_m = compute_alpha(error_m)
            self.alphas.append(alpha_m)

        assert len(self.G_M) == len(self.alphas)
     

    def predict(self, X):
        # Initialise dataframe with weak predictions for each observation
        weak_preds = pd.DataFrame(index = range(X.shape[0]), columns = range(self.M)) 

        # Predict class label for each weak classifier, weighted by alpha_m
        for m in range(self.M):
            y_pred_m = self.G_M[m].predict(X) * self.alphas[m]
            weak_preds.iloc[:,m] = y_pred_m

        # Calculate final predictions
        y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

        return y_pred

In [87]:
ab = AdaBoost()
ab.fit(X, y, M = 400)

In [88]:
# Predict on test set
y_pred = ab.predict(X_test)
print('Accuracy:', accuracy_score(y_pred, y_test)*100)

  weak_preds.iloc[:,m] = y_pred_m


Accuracy: 19.354037267080745


In [90]:
from sklearn.ensemble import AdaBoostClassifier

ab_sk = AdaBoostClassifier()
ab_sk.fit(X, y)

In [91]:
y_pred = ab_sk.predict(X_test)
print('Accuracy:', accuracy_score(y_pred, y_test)*100)

Accuracy: 90.03726708074534


In [8]:
import pickle

In [69]:

pickle.dump(mlp, open('mlp.sk', 'wb'))

In [70]:
pickle.dump(model_rfc, open('rfc.sk', 'wb'))

In [71]:
pickle.dump(model_svc, open('svc.sk', 'wb'))

In [9]:
pickle.dump(vect, open('vect.sk', 'wb'))

In [10]:
pickle.dump(labels, open('labels.lst', 'wb'))

In [6]:
final_df.to_csv('final.csv')

In [7]:
from svm import LinearSVM
model_svc_custom = LinearSVM(C=1)

In [8]:
print(X.shape, y.shape)

(800, 4475) (800,)


In [10]:
import pickle
pickle.dump(X, open('x.data', 'wb'))
pickle.dump(X_test, open('xt.data', 'wb'))
pickle.dump(y, open('y.data', 'wb'))
pickle.dump(y_test, open('yt.data', 'wb'))