<a href="https://colab.research.google.com/github/kopalgarg/hate_speech_classification/blob/main/notebooks/HateSpeechClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
%%capture

import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
tf.test.gpu_device_name()
import re
import torch

# -- Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# -- Tweet Preprocessing
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
!pip install emot
import emot
!pip install pyspellchecker
from spellchecker import SpellChecker
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from emot.emo_unicode import UNICODE_EMOJI

# -- Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# -- Performance Metrics
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, precision_score, recall_score,  accuracy_score, precision_recall_curve

# -- Explainability
!pip install shap
import shap
!pip install lime
import lime

In [9]:
# Connect G-Drive
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [35]:
# Set Data Paths
ROOT = "gdrive/MyDrive/CSC2612/"
general = "gdrive/MyDrive/CSC2612/data/general"
antiAsian = "gdrive/MyDrive/CSC2612/data/antiAsian"
prof = "gdrive/MyDrive/CSC2612/data/prof"

In [37]:
# Read in General Tweetes
generalTweets = pd.read_csv(os.path.join(general,'train_E6oV3lV.csv'))
generalTweets = generalTweets[['tweet', 'label']]
generalTweets.head()

Unnamed: 0,tweet,label
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [12]:
# Label Key with Examples
print('0: neutral')
print(generalTweets[generalTweets['label']==0].iloc[1])

print('1: hate speech')
print(generalTweets[generalTweets['label']==1].iloc[1])

0: neutral
tweet    @user @user thanks for #lyft credit i can't us...
label                                                    0
Name: 1, dtype: object
1: hate speech
tweet    no comment!  in #australia   #opkillingbay #se...
label                                                    1
Name: 14, dtype: object


In [32]:
# Read in Prof's Tweets
profTweets = pd.read_csv(os.path.join(prof,'B_volunteer_labelled_data_20210913.csv'))
profTweets.head(10)

Unnamed: 0,tweet id,clean text of tweet,Does this tweet contain covid-related stigmatizing language against the people of Asian-descent?,"If you identify the tweet talking about one or more of the topics below, please check them below (multiple answer possible)","If you have a comment, or want to add another related topic that is not listed in task 2, you can mention it here. Also, if the tweet is a duplicate, put ""duplicate"" here.",Labeller ID
0,https://twitter.com/edent/status/1244240695029...,I miss my #atlutd match day family... #5Stripe...,not stigmatizing,Other,,17
1,https://twitter.com/edent/status/1244379980634...,This happened to ME and MY FAMILY. We were set...,not stigmatizing,Other,,17
2,https://twitter.com/edent/status/1246427051084...,#Coronavirus..Ban wildlife trade..China,not stigmatizing,Other,,17
3,https://twitter.com/edent/status/1239157574441...,I bet most of the cases happened because peopl...,not stigmatizing,Other,,17
4,https://twitter.com/edent/status/1238240240235...,At the start of this decade we all said that w...,not stigmatizing,Other,,17
5,https://twitter.com/edent/status/1242885979389...,"Last year, the Polish FM stated that #Poland w...",not stigmatizing,News,,17
6,https://twitter.com/edent/status/1240412108463...,Why are allowing people to sell necessities a...,not stigmatizing,Other,,17
7,https://twitter.com/edent/status/1245624359344...,Recall... \n We hope you delete this tweet. URL,not stigmatizing,Other,,17
8,https://twitter.com/edent/status/1249123462171...,All the foolish talk of reopening things is ju...,not stigmatizing,Other,,17
9,https://twitter.com/edent/status/1238542266928...,people can be such scared sheep😱. IMO panic un...,not stigmatizing,Other,,17


In [38]:
# Feature Engineering

# -- remove URL
def remove_url(tweet):
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub(r'', tweet)

# -- remove HTML
def remove_html(tweet):
  return BeautifulSoup(tweet, 'lxml').text

# -- lowercase
def lower_case(tweet):
  return tweet.str.lower()

# -- covert emojis and emoticons to words
def convert_emoji(tweet):
  for emot in UNICODE_EMOJI:
    tweet = tweet.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
  return tweet

# -- remove special characters and non-ASCII characters
def remove_special_char(tweet):
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub(r"\x89Û_", "", tweet)
    tweet = re.sub(r"\x89ÛÒ", "", tweet)
    tweet = re.sub(r"\x89ÛÓ", "", tweet)
    tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
    tweet = re.sub(r"\x89ÛÏ", "", tweet)
    tweet = re.sub(r"\x89Û÷", "", tweet)
    tweet = re.sub(r"\x89Ûª", "", tweet)
    tweet = re.sub(r"\x89Û\x9d", "", tweet)
    tweet = re.sub(r"å_", "", tweet)
    tweet = re.sub(r"\x89Û¢", "", tweet)
    tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
    tweet = re.sub(r"åÊ", "", tweet)
    tweet = re.sub(r"åÈ", "", tweet)
    tweet = re.sub(r"â²", "", tweet)
    tweet = re.sub(r"Ì©", "e", tweet)
    tweet = re.sub(r"å¨", "", tweet)
    tweet = re.sub(r"â¹", "", tweet)
    tweet = re.sub(r"â½", "", tweet)
    tweet = re.sub(r"â¾", "", tweet)
    tweet = re.sub(r"ã¼berweist", "", tweet)
    tweet = re.sub(r"ã¼cretsiz", "", tweet)
    tweet = re.sub(r"zã¼rich", "", tweet)
    tweet = re.sub(r"ã¼retime", "", tweet)
    tweet = re.sub(r"åÇ", "", tweet)
    tweet = re.sub(r"åÀ", "", tweet)
    tweet = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'mentioned', tweet)
    tweet = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'referance', tweet)
    tweet = re.sub(r'£|\$', 'money', tweet)
    tweet = re.sub(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ', tweet)
    tweet = re.sub(r'\d+(\.\d+)?', ' ', tweet) 
    tweet = re.sub(r'[^\w\d\s]', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = re.sub(r'^\s+|\s+?$', '', tweet.lower())
    tweet = re.sub(r"&gt;", ">", tweet)
    tweet = re.sub(r"&lt;", "<", tweet)
    tweet = re.sub(r"&amp;", "&", tweet) 
    tweet = re.sub(r"_", "  ", tweet)
    tweet = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", tweet)
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    for p in punctuations:
        tweet = tweet.replace(p, f' {p} ')
    return str(tweet)

# -- spellcheck
def spellcheck(tweet):
  spell = SpellChecker(distance = 1, language='en')
  words = set(nltk.corpus.words.words())
  corrected_tweet= []
  misspelled_words = spell.unknown(tweet.split())
  for word in tweet.split():
    if word in misspelled_words:
      corrected_tweet.append(spell.correction(word))
    else:
      corrected_tweet.append(word)
  return " ".join(corrected_tweet)

# -- ensure English
def ensure_english(tweet):
  words = set(nltk.corpus.words.words())
  return " ".join(w for w in nltk.wordpunct_tokenize(tweet)\
                  if w.lower() in words or not w.isalpha())

# -- remove punctuation
def remove_punctuation(tweet):
  regular_punct = list(string.punctuation)
  for punctuation in regular_punct:
    if punctuation in tweet:
      tweet = tweet.replace(punctuation, ' ')
  return tweet.strip()
  
# -- remove stopwords
def remove_stopwords(tweet):
  en_stop =set(stopwords.words('english'))
  tweet = tweet.split()
  tweet = " ".join([word for word in tweet if not word in en_stop])
  return tweet

# -- tokenize
def tokenize(tweet):
  return word_tokenize(tweet)

# -- lematize
def lematize(tweet):
  lem = WordNetLemmatizer()
  return [lem.lemmatize(w) for w in tweet]

# -- finally, combine words
def combine_words(tweet):
  return ' '.join(tweet)

In [39]:
def clean_tweet(tweet):

  tweet = remove_url(tweet)
  tweet = remove_html(tweet)
  tweet = convert_emoji(tweet)
  tweet = remove_special_char(tweet)
  tweet = spellcheck(tweet)
  #tweet = ensure_english(tweet)
  tweet = remove_punctuation(tweet)
  tweet = remove_stopwords(tweet)
  tweet = tokenize(tweet)
  tweet = lematize(tweet)
  tweet = combine_words(tweet)

  return tweet


In [40]:
X_train, X_test, y_train, y_test = train_test_split(generalTweets['tweet'], generalTweets['label'], test_size=0.99, random_state=42)

X_train = X_train.apply(clean_tweet)
#X_test = X_test.apply(clean_tweet)

In [41]:
# Vectorize

# -- Bag of Words (unigrams)
cv_unigrams = CountVectorizer(ngram_range = (1,1))
X_train_bow = cv_unigrams.fit_transform(X_train)


# -- Bag of Words (bigrams)
cv_bigrams = CountVectorizer(ngram_range = (2,2))
X_train_bbow = cv_bigrams.fit_transform(X_train)


# -- TF-IDF
vec_tfidf = TfidfVectorizer(min_df = 2, max_df = 0.8, use_idf = True, ngram_range=(1, 1))
vec_tfidf.fit(X_train)
X_train_tfidf = vec_tfidf.fit_transform(X_train)


In [None]:
# Train Baseline Models
# NB
def naive_bayes_model(feature_vector_x, feature_vector_y):
  alpha = [1e-10, 1e-5, 0.1, 1.0, 2.0, 5.0, 10.0]
  best_alpha = -1
  max_score = 0
  for a in alpha:
    mnb = MultinomialNB(alpha = a)
    scores = sklearn.model_selection.cross_val_score(mnb, feature_vector_x, feature_vector_y, cv = 5)
    if np.mean(scores)> max_score:
      best_alpha = a
      max_score = np.mean(scores)
    
    print('alpha =', a)
    print(np.mean(scores))
    print('\n')
  
  print('best alpha:', best_alpha)
  mnb = MultinomialNB(alpha = best_alpha)
  mnb.fit(feature_vector_x, feature_vector_y)
  print('train score:', mnb.score(feature_vector_x, feature_vector_y))
  return mnb

# -- NB with BOW unigram
mnb_bow = naive_bayes_model(X_train_bow, y_train)
# -- NB with BOW bigram
mnb_bbow = naive_bayes_model(X_train_bbow, y_train)
# -- NB with TF-IDF
mnb_tfidf = naive_bayes_model(X_train_tfidf, y_train)

In [66]:
# DT
def dt_model(feature_vector_x, feature_vector_y):
  dtclassifier = DecisionTreeClassifier(criterion='entropy', max_depth=None)
  scores = cross_val_score(dtclassifier, feature_vector_x, feature_vector_y, cv = 10)
  dtclassifier.fit(feature_vector_x, feature_vector_y)
  print('train score:', accuracy_score(dtclassifier.predict(feature_vector_x), feature_vector_y))
  return dtclassifier

# -- DT with BOW unigram
dt_bow = dt_model(X_train_bow, y_train)

# -- DT with BOW bigram
dt_bbow = dt_model(X_train_bbow, y_train)

# -- DT with TF-IDF
dt_tfidf = dt_model(X_train_tfidf, y_train)

train score: 1.0
train score: 1.0
train score: 0.9937304075235109


In [67]:
# RF
def rf_model(feature_vector_x, feature_vector_y):
  rfclassifier = DecisionTreeClassifier(criterion='entropy', max_depth=None)
  scores = cross_val_score(rfclassifier, feature_vector_x, feature_vector_y, cv = 10)
  rfclassifier.fit(feature_vector_x, feature_vector_y)
  print('train score:', accuracy_score(rfclassifier.predict(feature_vector_x), feature_vector_y))
  return rfclassifier

# -- RF with BOW unigram
rf_bow = dt_model(X_train_bow, y_train)

# -- RF with BOW bigram
rf_bbow = dt_model(X_train_bbow, y_train)

# -- RF with TF-IDF
rf_tfidf = dt_model(X_train_tfidf, y_train)

train score: 1.0
train score: 1.0
train score: 0.9937304075235109


In [None]:
# LR
def lr_model(feature_vector_x, feature_vector_y):
  C_values = [0.001,0.01, 0.1,1,10,100]
  best_c = -1
  max_score = 0
  for c in C_values:
    lr = LogisticRegression(C = c, random_state=0, solver = 'lbfgs', multi_class='multinomial')
    lr.fit(feature_vector_x, feature_vector_y)
    scores = sklearn.model_selection.cross_val_score(lr, feature_vector_x, feature_vector_y, cv = 5)
    if np.mean(scores)> max_score:
      best_c = c
      max_score = np.mean(scores)
    
    print('c =', c)
    print(np.mean(scores))
    print('\n')

  lr = LogisticRegression(solver = 'lbfgs', multi_class='multinomial', C=c)
  lr.fit(feature_vector_x, feature_vector_y)
  print('train score:', accuracy_score(lr.predict(feature_vector_x), feature_vector_y))
  return lr

# -- LR with BOW unigram
lr_bow = lr_model(X_train_bow, y_train)

# -- LR with BOW bigram
lr_bbow = lr_model(X_train_bbow, y_train)

# -- LR with TF-IDF
lr_tfidf = lr_model(X_train_tfidf, y_train)


In [45]:
# SVM
def svm_model(feature_vector_x, feature_vector_y):
  params = {'C':[0.01, 0.1, 1, 10, 100],
       'kernel':['rbf', 'poly', 'linear', 'sigmoid']}
  classifier_linear = GridSearchCV(svm.SVC(), params, cv=10)
  classifier_linear.fit(feature_vector_x, feature_vector_y)
  print('train score:', accuracy_score(classifier_linear.predict(feature_vector_x), feature_vector_y))

# -- SVM with BOW unigram
svm_bow = svm_model(X_train_bow, y_train)
# -- SVM with BOW bigram
svm_bbow = svm_model(X_train_bbow, y_train)
# -- SVM with TF-IDF
svm_tfidf = svm_model(X_train_tfidf, y_train)

train score: 1.0
train score: 0.9373040752351097
train score: 0.9937304075235109


In [65]:
# XGBoost
def xgboost_model(feature_vector_x, feature_vector_y):
  xgb = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)
  param_grid = {
     'xgb__n_estimators': [1, 5, 10, 50, 100, 150, 300]}
  grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 10, 
                             n_jobs = 1, verbose = 0, return_train_score=True)

  grid_search.fit(feature_vector_x, feature_vector_y)
  print(grid_search.best_params_)

  print('train score:', accuracy_score(grid_search.predict(feature_vector_x), feature_vector_y))

  return xgb

# -- XGBoost with BOW unigram
xgb_bow = xgboost_model(X_train_bow, y_train)
# -- XGBoost with BOW bigram
xgb_bbow = xgboost_model(X_train_bbow, y_train)
# -- XGBoost with TF-IDF
xgb_tfidf = xgboost_model(X_train_tfidf, y_train)

{'xgb__n_estimators': 1}
train score: 0.9373040752351097
{'xgb__n_estimators': 1}
train score: 0.9373040752351097
{'xgb__n_estimators': 1}
train score: 0.9373040752351097


In [None]:
# 1D CNN


In [None]:
# RNN


In [None]:
# BERT

In [None]:
# Explainability 

# -- SHAP


In [None]:
# -- LIME