In [71]:
import re
# Numpy 
import numpy as np
# Pickle
import pickle
# Pandas
import pandas as pd
# Hugging Face
import huggingface_hub
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# PyTorch
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# SkLearn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# nltk.download()
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [72]:
# Load datasets 
# Hate Xplain
hate_xplain = pd.read_csv("data/hate_xplain.csv")

# Implicit Hate 
implicit_hate = pd.read_csv('data/implicit-hate-corpus/implicit_hate_v1_stg2_posts.tsv', delimiter='\t')
label_map = {
    'white_grievance': 0, 'incitement': 1, 'inferiority': 2,
    'irony': 3, 'stereotypical': 4, 'threatening': 5, 'other': 6
}

implicit_hate['class_label'] = implicit_hate['implicit_class'].map(label_map)
implicit_hate.drop("extra_implicit_class", axis=1, inplace=True)

# Toxic-Spans
annotations = pd.read_csv('data/toxic-spans/annotations.csv')
comments = pd.read_csv('data/toxic-spans/comments.csv')

toxic_spans = pd.merge(annotations, comments, on='comment_id')

In [73]:
hate_xplain

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [99]:
def tokenize_data(texts, labels, tokenizer, max_length):
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    texts = [str(text) for text in texts] 

    if isinstance(labels, pd.Series):
        labels = labels.tolist()
    labels = torch.tensor(labels, dtype=torch.long)
    
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    return dataset


In [100]:
bert = 'distilbert-base-uncased'
bert_model = AutoModelForSequenceClassification.from_pretrained(bert, num_labels=3, 
                id2label={0: "hate speech", 1: "normal", 2: "offensive"}, label2id={"hate speech": 0, "normal": 1, "offensive": 2}, max_length=128)

def freeze_bert_layers(layers: list):
    for name, param in bert_model.named_parameters():
        # Unfreeze only the last two layers
        for layer in layers:
            if layer in name:
                param.requires_grad = True
            else:
                param.requires_grad = False
    return bert_model

bert_model_4_5 = freeze_bert_layers(["transformer.layer.4", "transformer.layer.5"])
# if "transformer.layer.4" in name or "transformer.layer.5" in name:
#     param.requires_grad = True
# else:
#     param.requires_grad = False

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [105]:
hx_train_text, hx_test_text, hx_train_labels, hx_test_labels = train_test_split(hate_xplain['tweet'], hate_xplain['class'], test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained(bert, clean_up_tokenization_spaces=True)
hx_train = tokenize_data(hx_train_text, hx_train_labels, tokenizer, 512)
hx_test = tokenize_data(hx_test_text, hx_test_labels, tokenizer, 512)

hx_train_loader = DataLoader(hx_train, batch_size=32, shuffle=True)
hx_test_loader = DataLoader(hx_test, batch_size=32, shuffle=True)

In [107]:
print(hx_train_loader.__len__())
print(hx_test_loader.__len__())

620
155


In [108]:
def train(model, data_loader, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for input_ids, attention_mask, labels in data_loader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} Loss: {loss.item()}")

def evaluate(model, data_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            actuals.extend(labels.tolist())
    return predictions, actuals

In [111]:
optimizer = optim.AdamW(bert_model_4_5.parameters(), lr=5e-5)
train(bert_model_4_5, hx_train_loader, optimizer, 5)
pickle.dump(bert_model_4_5, open("BERT/bert_4_5.", 'wb'))


Epoch 1 Loss: 0.3177885115146637
Epoch 2 Loss: 0.06367813795804977
Epoch 3 Loss: 0.0864814817905426
Epoch 4 Loss: 0.12104514986276627
Epoch 5 Loss: 0.2678643763065338


In [114]:
eval_preds, eval_labels = evaluate(bert_model_4_5, hx_test_loader)
print(classification_report(eval_labels, eval_preds))

              precision    recall  f1-score   support

           0       0.53      0.26      0.35       284
           1       0.93      0.96      0.94      3827
           2       0.86      0.88      0.87       846

    accuracy                           0.90      4957
   macro avg       0.77      0.70      0.72      4957
weighted avg       0.89      0.90      0.90      4957



In [118]:
bert_model_2_3 = freeze_bert_layers(["transformer.layer.2", "transformer.layer.3"])
optimizer = optim.AdamW(bert_model_2_3.parameters(), lr=5e-5)

In [119]:
hate_xplain = hate_xplain.sample(n=10000, random_state=42)
hate_xplain

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
20276,20721,3,1,2,0,1,RT @verkannt__: None of my bitches judge me th...
5857,6023,3,0,0,3,2,@dish \nWas just wondering if yo espect to add...
7467,7679,3,1,2,0,1,A real nigga ain't complete without a real bitch
2445,2496,3,0,2,1,1,@AndrewJBurns1 I would but I got dat new monop...
962,984,3,0,3,0,1,&#128165;&#128162; on the pussy http://t.co/mW...
...,...,...,...,...,...,...,...
18721,19136,6,0,6,0,1,RT @carterreynolds: Austin Mahone is making bi...
20280,20725,3,0,3,0,1,RT @versaceezy: if we date u don't even gotta ...
22461,22936,3,1,2,0,1,Wake up American sheeple nd stop allowing thes...
15533,15898,3,1,2,0,1,RT @Hotsausage_: Is it a crime if a nigga just...


In [120]:
hx_train_text, hx_test_text, hx_train_labels, hx_test_labels = train_test_split(hate_xplain['tweet'], hate_xplain['class'], test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained(bert, clean_up_tokenization_spaces=True)
hx_train = tokenize_data(hx_train_text, hx_train_labels, tokenizer, 512)
hx_test = tokenize_data(hx_test_text, hx_test_labels, tokenizer, 512)

hx_train_loader = DataLoader(hx_train, batch_size=64, shuffle=True)
hx_test_loader = DataLoader(hx_test, batch_size=64, shuffle=True)

In [121]:
train(bert_model_2_3, hx_train_loader, optimizer, 5)
pickle.dump(bert_model_2_3, open("BERT/bert_2_3.pkl", 'wb'))

Epoch 1 Loss: 0.23374336957931519
Epoch 2 Loss: 0.1793309450149536
Epoch 3 Loss: 0.0936882495880127
Epoch 4 Loss: 0.0728011503815651
Epoch 5 Loss: 0.06655523926019669


In [122]:
eval_preds, eval_labels = evaluate(bert_model_2_3, hx_test_loader)
print(classification_report(eval_labels, eval_preds))

              precision    recall  f1-score   support

           0       0.71      0.48      0.58       122
           1       0.95      0.98      0.97      1519
           2       0.96      0.94      0.95       359

    accuracy                           0.94      2000
   macro avg       0.87      0.80      0.83      2000
weighted avg       0.94      0.94      0.94      2000

