# Implementing spaCy's Text Categorizer for Classifying Hate Speech

In [66]:
import numpy as np
import random
from time import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from dataset.dataset import Dataset
from constants import *

import spacy
import spacy_transformers
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

In [64]:
RANDOM_SEED = 0

In [30]:
# Load the data
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=split_sizes_cleaned)
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


In [31]:
# Combine X_train and Y_train into a list of dictionaries
data = [{"text": tweet, "label": label} for tweet, label in zip(X_train, Y_train)]

In [59]:
# Instantiate an empty spacy model
nlp = spacy.blank("en-tl")

# Add a text categorizer to the pipeline
if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

# Add labels to the text classifier
textcat.add_label("HATE_SPEECH")
textcat.add_label("NOT_HATE_SPEECH")

1

In [60]:
# Convert the data to spaCy's Example format
examples = []
for entry in data:
    doc = nlp.make_doc(entry["text"])
    cats = {"HATE_SPEECH": entry["label"] == 1, 
            "NOT_HATE_SPEECH": entry["label"] == 0}
    examples.append(Example.from_dict(doc, {"cats": cats}))

# Convert examples to DocBin for efficient serialization
train_docbin = DocBin(docs=[example.reference for example in examples])
train_docbin.to_disk("dataset/train.spacy")

## Training

In [65]:
# Training the model
def train_model(examples, iterations):
    start = time()
    optimizer = nlp.initialize()
    for i in range(iterations):
        random.shuffle(examples)
        losses = {}
        # Batch the examples and iterate over them
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {i}, Loss: {losses}")
    end = time()
    print(f'Training Time: {end-start}')

# Train the model for a given number of iterations
train_model(examples, iterations=10)

# Save the trained model
nlp.to_disk("models/textcat_model")

Iteration 0, Loss: {'textcat': 288.25427383184433}
Iteration 1, Loss: {'textcat': 245.54165340494365}
Iteration 2, Loss: {'textcat': 216.3791906798724}
Iteration 3, Loss: {'textcat': 191.04857891355277}
Iteration 4, Loss: {'textcat': 171.08024123770156}
Iteration 5, Loss: {'textcat': 159.13212397720417}
Iteration 6, Loss: {'textcat': 149.4866166453976}
Iteration 7, Loss: {'textcat': 138.31153382081493}
Iteration 8, Loss: {'textcat': 129.21508954725687}
Iteration 9, Loss: {'textcat': 121.1759618195199}
Training Time: 88.0280921459198


## Inference

In [68]:
# Function to get predictions from the model
def get_predictions(nlp, texts):
    predictions = []
    for text in texts:
        doc = nlp(text)
        # Get the label with the highest score
        if doc.cats["HATE_SPEECH"] > doc.cats["NOT_HATE_SPEECH"]:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

start = time()
# Get predictions for X_test
y_pred = get_predictions(nlp, X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
end = time()

# Print evaluation results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f'Inference Time: {end-start}')


Accuracy: 0.7364185110663984
Precision: 0.7385691231845078
Recall: 0.7095607235142118
F1-Score: 0.7237743806009489
Inference Time: 2.350472927093506
