In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import TweetTokenizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import xgboost

In [2]:
# NLTK Downloads
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/e.d.i.t.h/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/e.d.i.t.h/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
file_path = './HateXplain-Dataset.xlsx'
df = pd.read_excel(file_path)

print(df.columns)
df.head()

Index(['Unnamed: 0', 'origin_id', 'target', 'text', 'label'], dtype='object')


Unnamed: 0.1,Unnamed: 0,origin_id,target,text,label
0,0,1179055004553900032,,i dont think im getting my baby them white 9 h...,nhs
1,1,1179063826874032128,,we cannot continue calling ourselves feminists...,nhs
2,2,1178793830532956161,ethnicity,nawt yall niggers ignoring me,nhs
3,3,1179088797964763136,ethnicity,<user> i am bit confused coz chinese ppl can n...,hs
4,4,1179085312976445440,gender,this bitch in whataburger eating a burger with...,hs


In [4]:
# Filter for rows where target is 'ethnicity'
ethnicity_df = df[df['target'] == 'ethnicity']

# Display the shape and first few rows of the filtered dataset
print(f"Ethnicity dataset shape: {ethnicity_df.shape}")
ethnicity_df.head()


Ethnicity dataset shape: (5801, 5)


Unnamed: 0.1,Unnamed: 0,origin_id,target,text,label
2,2,1178793830532956161,ethnicity,nawt yall niggers ignoring me,nhs
3,3,1179088797964763136,ethnicity,<user> i am bit confused coz chinese ppl can n...,hs
5,5,13851720,ethnicity,laura loomer raped me while screaming at me in...,hs
6,6,24198545,ethnicity,and this is why i end up with nigger trainee d...,hs
7,7,14567516,ethnicity,nogs jews and dykes how enriching,hs


In [5]:
# Filter for rows where target is NaN
nan_target_df = df[df['target'].isnull()]

# Display the shape and first few rows of the filtered dataset
print(f"NaN target dataset shape: {nan_target_df.shape}")
nan_target_df.head()

NaN target dataset shape: (4850, 5)


Unnamed: 0.1,Unnamed: 0,origin_id,target,text,label
0,0,1179055004553900032,,i dont think im getting my baby them white 9 h...,nhs
1,1,1179063826874032128,,we cannot continue calling ourselves feminists...,nhs
11,11,1159278048069464065,,<percent> cotton thigh highs to avoid contact ...,nhs
12,12,1170440965888335874,,oomf a <number> yearl old closet gay and obses...,nhs
14,14,1122734457230774272,,fellas be careful she texting other niggas,nhs


In [6]:
# Filter for rows where target is NaN
nan_target_df = df[df['target'].isnull()]

# Display the shape and first few rows of the filtered dataset
print(f"NaN target dataset shape: {nan_target_df.shape}")
nan_target_df.head()

NaN target dataset shape: (4850, 5)


Unnamed: 0.1,Unnamed: 0,origin_id,target,text,label
0,0,1179055004553900032,,i dont think im getting my baby them white 9 h...,nhs
1,1,1179063826874032128,,we cannot continue calling ourselves feminists...,nhs
11,11,1159278048069464065,,<percent> cotton thigh highs to avoid contact ...,nhs
12,12,1170440965888335874,,oomf a <number> yearl old closet gay and obses...,nhs
14,14,1122734457230774272,,fellas be careful she texting other niggas,nhs


In [7]:
# Combine the two datasets (ethnicity and NaN target)
final_data = pd.concat([ethnicity_df, nan_target_df])

# Reset the index for the new DataFrame
final_data = final_data.reset_index(drop=True)

# Verify the 'target' column remains intact
print("Unique targets in the final dataset:")
print(final_data['target'].unique())

# Verify the 'label' column remains intact
print("Unique labels in the final dataset:")
print(final_data['label'].unique())

# Display the shape of the final dataset
print(f"Final dataset shape: {final_data.shape}")
final_data.head()

Unique targets in the final dataset:
['ethnicity' nan]
Unique labels in the final dataset:
['nhs' 'hs']
Final dataset shape: (10651, 5)


Unnamed: 0.1,Unnamed: 0,origin_id,target,text,label
0,2,1178793830532956161,ethnicity,nawt yall niggers ignoring me,nhs
1,3,1179088797964763136,ethnicity,<user> i am bit confused coz chinese ppl can n...,hs
2,5,13851720,ethnicity,laura loomer raped me while screaming at me in...,hs
3,6,24198545,ethnicity,and this is why i end up with nigger trainee d...,hs
4,7,14567516,ethnicity,nogs jews and dykes how enriching,hs


In [8]:
#drop unwanted columns
final_data = final_data.drop(['Unnamed: 0', 'origin_id'], axis=1)
final_data.head()

Unnamed: 0,target,text,label
0,ethnicity,nawt yall niggers ignoring me,nhs
1,ethnicity,<user> i am bit confused coz chinese ppl can n...,hs
2,ethnicity,laura loomer raped me while screaming at me in...,hs
3,ethnicity,and this is why i end up with nigger trainee d...,hs
4,ethnicity,nogs jews and dykes how enriching,hs


In [9]:
#Remove unnecessary characters, such as mentions (@user), URLs, and special characters,

def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    text = text.strip()  # Remove leading/trailing spaces
    return text

# Apply cleaning to the 'text' column
final_data['text'] = final_data['text'].apply(clean_text)

# Display a few cleaned rows
final_data.head()

Unnamed: 0,target,text,label
0,ethnicity,nawt yall niggers ignoring me,nhs
1,ethnicity,user i am bit confused coz chinese ppl can not...,hs
2,ethnicity,laura loomer raped me while screaming at me in...,hs
3,ethnicity,and this is why i end up with nigger trainee d...,hs
4,ethnicity,nogs jews and dykes how enriching,hs


In [10]:
# integer label encoding

# Map 'hs' (hate speech, in this context xenophobic) to 1 and 'nhs' (non-hate speech, here non-xenophobic) to 0
final_data['label'] = final_data['label'].map({'hs': 1, 'nhs': 0})

# Verify the transformation
print(final_data['label'].value_counts())

label
0    5387
1    5264
Name: count, dtype: int64


In [11]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_data['text'], final_data['label'], test_size=0.2, random_state=42)


In [12]:
# # Import necessary libraries
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from transformers import BertTokenizer, BertModel
# import torch


# # Load BERT tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # Function to generate BERT embeddings
# def generate_bert_embeddings(text_list):
#     # Tokenize the text
#     tokens = tokenizer(text_list.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=512)

#     # Pass tokens through the BERT model
#     with torch.no_grad():
#         outputs = model(**tokens)

#     # Use the [CLS] token embedding (first token) as the sentence embedding
#     embeddings = outputs.last_hidden_state[:, 0, :].detach()
#     return embeddings

# # Generate embeddings for training and testing sets
# train_embeddings = generate_bert_embeddings(X_train)
# test_embeddings = generate_bert_embeddings(X_test)

# # Convert embeddings to numpy arrays for downstream tasks
# X_train_embeddings = train_embeddings.numpy()
# X_test_embeddings = test_embeddings.numpy()

# # Labels
# y_train = y_train.values
# y_test = y_test.values

# # Print shapes for verification
# print("Training Embeddings Shape:", X_train_embeddings.shape)
# print("Testing Embeddings Shape:", X_test_embeddings.shape)


In [13]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient, fast, and suitable for classification

# Generate sentence embeddings for training and testing sets
X_train_embeddings = sbert_model.encode(X_train.tolist())
X_test_embeddings = sbert_model.encode(X_test.tolist())

# Convert labels to numpy arrays
y_train = y_train.values
y_test = y_test.values

# Print shapes for verification
print("Training Embeddings Shape:", X_train_embeddings.shape)
print("Testing Embeddings Shape:", X_test_embeddings.shape)

Training Embeddings Shape: (8520, 384)
Testing Embeddings Shape: (2131, 384)


In [14]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, classification_report

# # Train an SVM classifier
# svm_clf = SVC(kernel='rbf', random_state=42)
# svm_clf.fit(X_train_embeddings, y_train)

# # Make predictions
# y_pred_svm = svm_clf.predict(X_test_embeddings)

# # Evaluate the model
# print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


In [15]:
# from xgboost import XGBClassifier

# # Train an XGBoost classifier
# xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# xgb_clf.fit(X_train_embeddings, y_train)

# # Make predictions
# y_pred_xgb = xgb_clf.predict(X_test_embeddings)

# # Evaluate the model
# print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
# print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))


In [16]:
# from sklearn.linear_model import LogisticRegression

# # Train a Logistic Regression classifier
# log_reg = LogisticRegression(max_iter=1000, random_state=42)
# log_reg.fit(X_train_embeddings, y_train)

# # Make predictions
# y_pred_logreg = log_reg.predict(X_test_embeddings)

# # Evaluate the model
# print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
# print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))


In [17]:
# Train and evaluate classifiers
# 1. SVM
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(X_train_embeddings, y_train)
y_pred_svm = svm_clf.predict(X_test_embeddings)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.8559361801970906
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.86      1051
           1       0.88      0.83      0.85      1080

    accuracy                           0.86      2131
   macro avg       0.86      0.86      0.86      2131
weighted avg       0.86      0.86      0.86      2131



In [18]:
# 2. XGBoost
from xgboost import XGBClassifier


xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_embeddings, y_train)
y_pred_xgb = xgb_clf.predict(X_test_embeddings)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.8193336461755044
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      1051
           1       0.83      0.80      0.82      1080

    accuracy                           0.82      2131
   macro avg       0.82      0.82      0.82      2131
weighted avg       0.82      0.82      0.82      2131



In [19]:
# 3. Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_embeddings, y_train)
y_pred_logreg = log_reg.predict(X_test_embeddings)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.8296574378226185
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83      1051
           1       0.85      0.81      0.83      1080

    accuracy                           0.83      2131
   macro avg       0.83      0.83      0.83      2131
weighted avg       0.83      0.83      0.83      2131

