In [1]:
import pandas as pd
from glob import glob  # for getting filepaths
import os
import re  # Import the re module

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer 
from sklearn.model_selection import train_test_split

In [2]:
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

# Read in the data

In [3]:
!kaggle datasets download -d mrmorj/hate-speech-and-offensive-language-dataset

Dataset URL: https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset
License(s): CC0-1.0
Downloading hate-speech-and-offensive-language-dataset.zip to c:\Users\Bryan Chan\Documents\Projects\HateSpeech_detection




  0%|          | 0.00/1.01M [00:00<?, ?B/s]
 99%|█████████▉| 1.00M/1.01M [00:00<00:00, 1.72MB/s]
100%|██████████| 1.01M/1.01M [00:00<00:00, 1.70MB/s]


In [4]:
import zipfile

with zipfile.ZipFile('hate-speech-and-offensive-language-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('hate_speech_dataset')

In [None]:
dataset_path = 'hate_speech_dataset/labeled_data.csv'
df = pd.read_csv(dataset_path)
df.head(19)

In [8]:
# Preview the dataset
df.head(10)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


# Preprocess Data

In [10]:
# Create two new dataset from 'tweet' and 'class' column
tweet_X = pd.DataFrame(df['tweet'])
tweet_y = pd.DataFrame(df['class'])

## Simple Text Preprocessing

Lowercase the tweet, URLs, HTML entities, hashtags and mentions, punctuation, extra whitespace

In [108]:
import string

def preprocess_tweet_typeB(tweet):
    # Lowercase the tweet
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    
    # Remove HTML entities
    tweet = re.sub(r'&\w+;', '', tweet)
    
    # Remove hashtags and mentions
    tweet = re.sub(r'#\w+|@\w+', '', tweet)
    
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    tweet = tweet.strip()
    tweet = re.sub(r'\s+', ' ', tweet)
    
    return tweet

# Apply the preprocessing function to the X_train dataset
tweet_X_cleanedB = tweet_X['tweet'].apply(preprocess_tweet_typeB)

# Display the first few cleaned tweet
tweet_X_cleanedB.head()

0     rt as a woman you shouldnt complain about clea...
1     rt boy dats coldtyga dwn bad for cuffin dat ho...
2     rt dawg rt you ever fuck a bitch and she start...
3                             rt she look like a tranny
4     rt the shit you hear about me might be true or...
5     the shit just blows meclaim you so faithful an...
6     i can not just sit up and hate on another bitc...
7     cause im tired of you big bitches coming for u...
8            you might not get ya bitch back thats that
9                 hobbies include fighting mariam bitch
10    keeks is a bitch she curves everyone lol i wal...
11                       murda gang bitch its gang land
12           so hoes that smoke are losers yea go on ig
13            bad bitches is the only thing that i like
14                                  bitch get up off me
Name: tweet, dtype: object

## Advanced Text Preprocessing

Additional Tokenization, remove Stopwords, and Lemmatization steps


In [11]:
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Download stopwords if not already downloaded
nltk.download("stopwords")
nltk.download("punkt")
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))

def preprocess_tweet_typeA(tweet):
    # Lowercase the tweet
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    
    # Remove HTML entities
    tweet = re.sub(r'&\w+;', '', tweet)
    
    # Remove hashtags and mentions
    tweet = re.sub(r'#\w+|@\w+', '', tweet)
    
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    
    # Remove extra whitespace
    tweet = tweet.strip()
    tweet = re.sub(r'\s+', ' ', tweet)
    
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]
    
    return " ".join(lemmatized_tokens)

# Apply the preprocessing function to the tweet_X dataset
tweet_X_cleanedA = tweet_X['tweet'].apply(preprocess_tweet_typeA)

# Display the first few cleaned tweets
tweet_X_cleanedA.head(15)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000020BB2DD9400>>
Traceback (most recent call last):
  File "C:\Users\Bryan Chan\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bryan Chan\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1533, in enumerate
    def enumerate():
    
KeyboardInterrupt: 


NameError: name 'stop_words' is not defined

## Entity Detection

In [135]:
# take a 20% sample of the 'tweet_X_cleanedB' data
tweet_sample = tweet_X_cleanedB.sample(frac=0.2, random_state=42)

nlp = spacy.load('en_core_web_sm')

#let's get the named entities:
doc = [nlp(sentence) for sentence in tweet_sample]
for i in doc:
    for ent in i.ents:
        if ent.label_ == 'PERSON':
            print(ent.text, ent.label_ )

kno PERSON
rt js PERSON
mufucka PERSON
rt jsu PERSON
omar johnson PERSON
somethingod throwin PERSON
ewww yuck PERSON
shylock PERSON
rt rt PERSON
rt hun PERSON
blanke beslissing PERSON
rt oreos PERSON
charlie PERSON
george need PERSON
rt dese PERSON
nigga PERSON
eatin nd beatin PERSON
nah gurl PERSON
ya PERSON
ya pussy gon PERSON
coulda PERSON
ymas sws PERSON
rt bitches tweeting PERSON
rt jackie PERSON
lmaoooo beiber PERSON
kik PERSON
rt bitches jus love PERSON
rt trey songz PERSON
rt bitches fwu PERSON
rt rt word rt having zero PERSON
rt jihadi PERSON
rt yung berg PERSON
rt hol PERSON
rt uncle PERSON
lebron PERSON
rt hell yea PERSON
lmfaoooooooooooooooooo PERSON
rt ayy PERSON
rt rt PERSON
atl bvb sws PERSON
fob PERSON
charlie PERSON
gail PERSON
hey jim PERSON
wana PERSON
kevin hart PERSON
mary jane PERSON
lee castro PERSON
anthony PERSON
michael bay PERSON
rt obama PERSON
ali PERSON
rt obama PERSON
johnson PERSON
lmao rt lmao rt PERSON
rt joan rivers PERSON
ron brownvin PERSON
rt tom f

# Split Dataset

In [None]:
# Advanced Preprocessing
XA_train, XA_test, yA_train, yA_test = train_test_split(tweet_X_cleanedA, tweet_y, 
                                                    test_size=0.2,
                                                    stratify=tweet_y,
                                                    random_state=42)
print(XA_train.head(5))
print(XA_test.head(5))

In [None]:
# Simple Preprocessing
XB_train, XB_test, yB_train, yB_test = train_test_split(tweet_X_cleanedB, tweet_y, 
                                                    test_size=0.2,
                                                    stratify=tweet_y,
                                                    random_state=42)
print(XB_train.head(5))
print(XB_test.head(5))

# Train a Supervised Classifier

In [118]:
# Import Models and Metrics
from sklearn.naive_bayes import MultinomialNB        # Multinomial Naive Bayes model classifier model
from sklearn.linear_model import LogisticRegression  # Logistic Regression model classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest model classifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### MultinomialNB Model

In [123]:
# simple preprocessing
vectorizer = CountVectorizer() # Convert text data to numerical features
XB_train_vect = vectorizer.fit_transform(XB_train)
XB_test_vect = vectorizer.transform(XB_test)

model = MultinomialNB()
model.fit(XB_train_vect, yB_train) # Fit model to the Training Data

y_pred = model.predict(XB_test_vect) 

print(f"Bccuracy : {accuracy_score(yB_test, y_pred)}")
print(classification_report(yB_test, y_pred))

Bccuracy : 0.8575751462578173
              precision    recall  f1-score   support

           0       0.33      0.02      0.04       286
           1       0.86      0.98      0.92      3838
           2       0.85      0.57      0.68       833

    accuracy                           0.86      4957
   macro avg       0.68      0.52      0.55      4957
weighted avg       0.83      0.86      0.83      4957



  y = column_or_1d(y, warn=True)


In [127]:
# Advanced Preprocessing
vectorizer = CountVectorizer() # Convert text data to numerical features
XA_train_vect = vectorizer.fit_transform(XA_train)
XA_test_vect = vectorizer.transform(XA_test)

model = MultinomialNB() # Initialize the MultinomialNB model without class_weight
model.fit(XA_train_vect, yA_train) # Fit model to the Training Data

y_pred = model.predict(XA_test_vect) 

print(f"Accuracy : {accuracy_score(yA_test, y_pred)}")
print(classification_report(yA_test, y_pred))

Accuracy : 0.8616098446641114
              precision    recall  f1-score   support

           0       0.36      0.03      0.06       286
           1       0.87      0.98      0.92      3838
           2       0.86      0.59      0.70       833

    accuracy                           0.86      4957
   macro avg       0.69      0.54      0.56      4957
weighted avg       0.83      0.86      0.83      4957



  y = column_or_1d(y, warn=True)


### Logistic Regression

In [128]:
# Simple Preprocessing
vectorizer = CountVectorizer() # Convert text data to numerical features
XB_train_vect = vectorizer.fit_transform(XB_train)
XB_test_vect = vectorizer.transform(XB_test)

model = LogisticRegression(multi_class='ovr', solver='liblinear', class_weight='balanced')
model.fit(XB_train_vect, yB_train) # Fit model to the Training Data

y_pred = model.predict(XB_test_vect) 

print(f"Accuracy : {accuracy_score(yB_test, y_pred)}")
print(classification_report(yB_test, y_pred))

  y = column_or_1d(y, warn=True)


Accuracy : 0.8876336493847085
              precision    recall  f1-score   support

           0       0.42      0.47      0.44       286
           1       0.95      0.91      0.93      3838
           2       0.80      0.93      0.86       833

    accuracy                           0.89      4957
   macro avg       0.72      0.77      0.74      4957
weighted avg       0.90      0.89      0.89      4957



In [125]:
# Advanced Preprocessing
vectorizer = CountVectorizer() # Convert text data to numerical features
XA_train_vect = vectorizer.fit_transform(XA_train)
XA_test_vect = vectorizer.transform(XA_test)

model = LogisticRegression(multi_class='ovr', solver='liblinear', class_weight='balanced')
model.fit(XA_train_vect, yA_train) # Fit model to the Training Data

y_pred = model.predict(XA_test_vect) 

print(f"Accuracy : {accuracy_score(yA_test, y_pred)}")
print(classification_report(yA_test, y_pred))

Accuracy : 0.8862215049425055
              precision    recall  f1-score   support

           0       0.44      0.44      0.44       286
           1       0.95      0.91      0.93      3838
           2       0.78      0.92      0.84       833

    accuracy                           0.89      4957
   macro avg       0.72      0.76      0.74      4957
weighted avg       0.89      0.89      0.89      4957



  y = column_or_1d(y, warn=True)


### Random Forest

In [132]:
# Random Forest
vectorizer = CountVectorizer() # Convert text data to numerical features
XB_train_vect = vectorizer.fit_transform(XB_train)
XB_test_vect = vectorizer.transform(XB_test)

model = RandomForestClassifier(n_estimators=100, 
                               random_state=42, 
                               class_weight='balanced')
model.fit(XB_train_vect, yB_train) # Fit model to the Training Data

y_pred = model.predict(XB_test_vect) 

print(f"Accuracy : {accuracy_score(yB_test, y_pred)}")
print(classification_report(yB_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


Accuracy : 0.8628202541859996
              precision    recall  f1-score   support

           0       0.36      0.14      0.20       286
           1       0.87      0.97      0.92      3838
           2       0.88      0.61      0.72       833

    accuracy                           0.86      4957
   macro avg       0.70      0.57      0.61      4957
weighted avg       0.84      0.86      0.84      4957

