In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import html
from bs4 import BeautifulSoup
import regex as re
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier



def split_sets(dataset):
    """Randomly partition the rows of ``dataset`` into train and test splits.

    Each row is assigned independently: it goes to the train split when the
    value drawn from ``split_data`` is 0 (9 of the 10 entries) and to the
    test split otherwise. Draws come from the module-level ``random``
    generator, one per row in row order, so seeding ``random`` beforehand
    makes the split reproducible.

    Args:
        dataset: Sequence or array of rows (anything ``np.asarray`` accepts
            and ``len`` works on).

    Returns:
        A ``(train_set, test_set)`` tuple of NumPy arrays holding copies of
        the selected rows in their original order. A split that receives no
        rows keeps the input's trailing dimensions and dtype (for example
        shape ``(0, n_columns)``), so column indexing such as ``[:, 2]``
        remains valid on it.
    """
    split_data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]     # 0: train, 1: test

    # One draw per row; True marks a row that belongs to the train split.
    is_train = np.array(
        [random.choice(split_data) == 0 for _ in range(len(dataset))],
        dtype=bool,
    )

    # Boolean-mask indexing (rather than rebuilding arrays from Python lists)
    # preserves the column dimension and dtype even when a split is empty.
    data = np.asarray(dataset)
    return data[is_train], data[~is_train]





def filtering(input):
    """Normalize a raw tweet into lowercase text with hashtags and mentions.

    Steps, in order:
      1. Remove every whitespace-delimited token starting at ``http``/``https``.
      2. Decode HTML entities and strip any markup.
      3. Put a space in front of each ``#`` and ``@`` so hashtags and
         mentions become separate tokens.
      4. Keep only alphanumeric characters, spaces, ``#`` and ``@``, and
         lowercase the result.

    Args:
        input: The raw tweet text.

    Returns:
        The cleaned, lowercased string.
    """
    # URLs go first, while they are still intact: spacing out '#'/'@'
    # beforehand would split fragments ("...#frag") off the URL and leave
    # them behind as fake hashtags or mentions.
    holder = re.sub(r'(http|https)\S+', '', input)

    # Decode entities before touching '#': a numeric entity such as "&#39;"
    # would otherwise become "& #39;", which can no longer be decoded and
    # would end up in the output as the token "#39".
    holder = BeautifulSoup(html.unescape(holder), 'html.parser').text

    # Separate hashtags and mentions from any text they are glued to.
    holder = re.sub(r'#', ' #', holder)
    holder = re.sub(r'@', ' @', holder)

    # Drop punctuation and symbols, keeping the hashtag/mention markers.
    kept = ''.join(ch for ch in holder if ch.isalnum() or ch in ' #@')
    return kept.lower()






def process_data(csv_path="/content/drive/MyDrive/hate_speech.csv"):
    """Train three text classifiers on the hate-speech CSV and print F1 scores.

    The CSV is read with the column names ``Index``, ``Is_Hate`` and
    ``Tweet``. Every tweet is cleaned with ``filtering``, the rows are
    randomly divided by ``split_sets``, and a bag-of-words -> TF-IDF pipeline
    is fitted with, in turn, ``SGDClassifier``, ``LinearSVC`` and
    ``DecisionTreeClassifier``. The F1 score of each model on the test split
    is printed.

    Args:
        csv_path: Location of the CSV file. Defaults to the path the script
            originally hard-coded.

    Raises:
        ValueError: If the random split leaves the train or the test split
            without any rows (only likely for very small files).
    """
    hate_data = pd.read_csv(csv_path, names=["Index", "Is_Hate", "Tweet"])
    hate_data = np.array(hate_data)

    # Clean every tweet (column 2) before splitting.
    hate_data[:, 2] = [filtering(sentence) for sentence in hate_data[:, 2]]

    train_set, test_set = split_sets(hate_data)
    if len(train_set) == 0 or len(test_set) == 0:
        raise ValueError(
            "random split produced an empty train or test set; "
            "the dataset is too small to evaluate"
        )

    train_x = train_set[:, 2]
    test_x = test_set[:, 2]

    # Labels (column 1) are cast to int for the classifiers and the metric.
    train_y = train_set[:, 1].astype('int')
    test_y = test_set[:, 1].astype('int')

    # Same feature extraction for every model; only the final estimator
    # differs. The order matches the order in which results are reported.
    classifiers = [
        ('SGD Classifier', SGDClassifier()),
        ('Linear SVC', LinearSVC()),
        ('Decision Tree Classifier', DecisionTreeClassifier()),
    ]

    for label, classifier in classifiers:
        pipe = Pipeline([
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('classify', classifier),
        ])
        predict_y = pipe.fit(train_x, train_y).predict(test_x)
        print('the f1 score using ' + label + ' is: ' + str(f1_score(test_y, predict_y)))




# Run the full load/clean/train/evaluate experiment only when this file is
# executed as a script, not when it is imported as a module.
if __name__ == '__main__':
    process_data()





the f1 score using SGD Classifier is: 0.9127459366980326
the f1 score using Linear SVC is: 0.9316539335122858
the f1 score using Decision Tree Classifier is: 0.9039640375970576
