<a href="https://colab.research.google.com/github/hcostanog-AI/Racial-Bias-in-Abusive-Language-Detection/blob/main/Replication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Import necessary libraries
import pandas as pd                  # For data manipulation and loading CSVs
import numpy as np                   # For numerical operations (was not used)
from sklearn.feature_extraction.text import TfidfVectorizer  # For transforming text into numerical features
from sklearn.linear_model import LogisticRegression          # Classifier used for training
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold  # For model validation and tuning
from sklearn.metrics import f1_score, classification_report  # For evaluation metrics
from sklearn.pipeline import Pipeline                        # To streamline feature extraction + model in one object
import re                                                     # For regex-based text cleaning
from nltk.stem import SnowballStemmer                        # For word stemming (reduces words to base form)
from nltk.tokenize import TweetTokenizer                     # Tokenizer designed for tweets
import nltk
nltk.download('punkt')                                       # Download NLTK tokenizer model

# Shared text-processing helpers, built once and reused by preprocess_tweet
tokenizer = TweetTokenizer()                     # NLTK tokenizer aimed at tweet text
stemmer = SnowballStemmer(language="english")    # Snowball stemmer configured for English

# Preprocessing function
def preprocess_tweet(text):
    """Normalise a raw tweet into a space-joined string of stemmed tokens.

    Steps, in order: URLs are replaced with ``<URL>``, @-mentions with
    ``<MENTION>``, runs of whitespace collapse to a single space, the text
    is lower-cased and tokenised with the module-level ``tokenizer``, and
    every token is stemmed with the module-level ``stemmer``.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # The "www" branch needs a word boundary and a literal dot: the looser
    # form www\S+ also fired inside ordinary words, turning an interjection
    # such as "awwww" into "a<URL>".
    text = re.sub(r"http\S+|\bwww\.\S+", "<URL>", text)
    text = re.sub(r"@[A-Za-z0-9_]+", "<MENTION>", text)     # Replace mentions with a placeholder
    text = re.sub(r"\s+", " ", text).strip()                # Normalize whitespace
    # Lower-casing happens after substitution, so the placeholders reach the
    # tokenizer in lower case as well.
    tokens = tokenizer.tokenize(text.lower())
    return " ".join(stemmer.stem(token) for token in tokens)

# Load dataset
# NOTE: this is an absolute path on the author's machine; point it at your own
# copy of labeled_data.csv when running elsewhere.
file_path = "/Users/humbecosta/Desktop/UOB/Project AI GC/1st attempt/labeled_data.csv"
try:
    # The file is read as ';'-delimited; malformed rows are skipped rather than
    # aborting the load.
    # NOTE(review): the recorded cell output lists many "Unnamed: N" columns,
    # which may mean ';' characters inside tweets are being treated as field
    # separators -- confirm the file's real delimiter.
    df = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")  # Report formatting issues
    # df was never assigned, so nothing below can run; re-raise instead of
    # letting the next statement fail with an unrelated NameError.
    raise
except FileNotFoundError:
    print(f"File not found: {file_path}")  # Report the missing file
    raise
else:
    print("File loaded successfully!")
    print(df.head())  # Preview first few rows

print(df)  # Print the DataFrame (useful for debugging, but can be large)

# Check what columns are available
print("Column names:", df.columns)

# Clean and preprocess tweets
# The feature-extraction step reads df['processed_tweet'], so a missing
# 'tweet' column is fatal. Stop here with a clear message instead of printing
# a warning and failing later with a less informative KeyError.
if 'tweet' not in df.columns:
    raise ValueError("Column 'tweet' not found in the dataframe.")

df = df.drop_duplicates(subset='tweet')  # Drop rows whose tweet text repeats an earlier row
print("Duplicates dropped successfully!")

# Cast to str first so non-string cells do not break the regex calls inside
# preprocess_tweet.
df['processed_tweet'] = df['tweet'].astype(str).apply(preprocess_tweet)
print("Tweets preprocessed successfully!")


# === Inputs, labels and the train/validation split ===
X = df['processed_tweet']     # Cleaned tweet text
y = df['class']               # Labels (0 = Hate, 1 = Offensive, 2 = Neither)

# Hold out 20% for validation, stratified on the label and seeded so the
# split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === TF-IDF feature extractor ===
# Word 1- to 3-grams, capped at 10,000 features to reduce dimensionality.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

# === Model: TF-IDF features feeding a class-weighted logistic regression ===
pipeline = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('clf', LogisticRegression(class_weight='balanced', solver='liblinear')),
])

# === Grid search over the regularization strength C ===
param_grid = {'clf__C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',             # Select C by weighted F1
    cv=StratifiedKFold(n_splits=5),    # 5 stratified folds
    n_jobs=-1,                         # Use all available cores
)
grid.fit(X_train, y_train)

# Report the winning configuration
print("Best F1 score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

# Predict on the held-out validation split and report per-class metrics
y_pred = grid.predict(X_val)
print(classification_report(y_val, y_pred, digits=4))

# Same report again, with readable class names in place of the numeric ids
label_mapping = {0: "Hate", 1: "Offensive", 2: "Neither"}
y_val_mapped = list(map(label_mapping.__getitem__, y_val))    # True labels as names
y_pred_mapped = list(map(label_mapping.__getitem__, y_pred))  # Predicted labels as names
print(classification_report(y_val_mapped, y_pred_mapped, digits=4))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/humbecosta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  df = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')  # Load CSV using ';' delimiter


File loaded successfully!
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      4            6                   0        0      2   
4           4      3            0                   2        1      1   

                                               tweet  Unnamed: 7  Unnamed: 8  \
0  !!! RT @mayasolovely: As a woman you shouldn't...         NaN         NaN   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...         NaN         NaN   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...         NaN         NaN   
3                    hut! We like it in our butt!"""         NaN         NaN   
4  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...         NaN         NaN   

  Unnamed: 9  ... Unnamed: 56 Unnamed: 57 Unnamed: 58 