In [1]:
# Pandas
import pandas as pd
# Hugging Face
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# PyTorch
import torch 
import torch.optim as optim
from torch.utils.data import DataLoader
# SkLearn
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from nnsight import NNsight
import numpy as np 

In [2]:
## Datasets 
# Hate Xplain
hate_xplain = pd.read_csv(r'data\hate_xplain.csv')

# Implicit Hate 
implicit_hate = pd.read_csv(r'data\implicit_hate_v1_stg2_posts.tsv', delimiter='\t')
label_map = {
    'white_grievance': 0, 'incitement': 1, 'inferiority': 2,
    'irony': 3, 'stereotypical': 4, 'threatening': 5, 'other': 6
}
implicit_hate['class_label'] = implicit_hate['implicit_class'].map(label_map)
implicit_hate.drop("extra_implicit_class", axis=1, inplace=True)

# hate_xplain = hate_xplain.sample(n=1000, random_state=42)
hate_xplain

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [3]:
def tokenize_data(texts, labels, tokenizer, max_length):
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    texts = [str(text) for text in texts] 

    if isinstance(labels, pd.Series):
        labels = labels.tolist()
    labels = torch.tensor(labels, dtype=torch.long)
    
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    return dataset

def train(model, data_loader, optimizer, epochs, device=None):
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()

    for epoch in range(epochs):
        for input_ids, attention_mask, labels in data_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} Loss: {loss.item()}")

def evaluate(model, data_loader, device=None):
    model.eval()
    model.to(device)

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels

def freeze_bert_layers(model, layers: list):
    for name, param in model.named_parameters():
        # only unfreeze the layers in the list
        for layer in layers:
            if layer in name:
                param.requires_grad = True
            else:
                param.requires_grad = False
    return model

In [4]:
bert = 'bert-base-uncased'
new_bert = AutoModelForSequenceClassification.from_pretrained(bert, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
new_bert = freeze_bert_layers(new_bert, ['encoder.layer.11'])

In [6]:
hx_train_text, hx_test_text, hx_train_labels, hx_test_labels = train_test_split(hate_xplain['tweet'], hate_xplain['class'], test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained(bert, clean_up_tokenization_spaces=True)
hx_train = tokenize_data(hx_train_text, hx_train_labels, tokenizer, 512)
hx_test = tokenize_data(hx_test_text, hx_test_labels, tokenizer, 512)

hx_train_loader = DataLoader(hx_train, batch_size=16, shuffle=True)
hx_test_loader = DataLoader(hx_test, batch_size=16, shuffle=True)

hx_train_loader.__len__(), hx_test_loader.__len__()

(1240, 310)

In [7]:
optimizer = optim.AdamW(new_bert.parameters(), lr=5e-5)
train(new_bert, hx_train_loader, optimizer, 5, 'cuda')
new_bert.save_pretrained("BERT/bert_full")

Epoch 1 Loss: 0.05439606308937073
Epoch 2 Loss: 0.05981718748807907
Epoch 3 Loss: 0.08305776119232178
Epoch 4 Loss: 0.12415436655282974
Epoch 5 Loss: 0.07273364812135696


In [7]:
new_bert = AutoModelForSequenceClassification.from_pretrained("BERT/bert_full", num_labels=3)
optimizer = optim.AdamW(new_bert.parameters(), lr=5e-5)

In [8]:
# Baseline HateXplain Accuracy
hx_preds, hx_labels = evaluate(new_bert, hx_test_loader, 'cuda')
hx_report = accuracy_score(hx_labels, hx_preds)
print(f'Baseline HateXplain Accuracy: {hx_report}')

Baseline HateXplain Accuracy: 0.9231389953600968


In [9]:
def evaluate_and_zero_head(model, data_loader, head:int, device=None): 
    """
    Evaluate the model and zero out the given head. 
    Return the predictions and labels.
    Args:
        model: The NNsight model to evaluate
        data_loader: the test set data loader 
        head: The head # to zero out
        device: The device to evaluate on (gpu or cpu)
    Returns:
        preds: The model's predictions
        labels: The model's labels
    """
    model.eval()
    model.to(device)
    # Batch -> (input_ids, attention_mask, labels)
    for batch in data_loader:
        outputs, labels = [], []
        labels.append(batch[2])
        # Begin intervention 
        with model.trace(batch[0]) as tracer: 
            # Here, the given head # is zeroed out 
            model.bert.encoder.layer[head].output = 0.0
            output = model.output[0].save()
            outputs.append(output)
        # End intervention
        labels = labels[0]
        outputs = outputs[0]
        preds = outputs.argmax(dim=1)
    return preds, labels

In [11]:
nn_model = NNsight(new_bert)



In [12]:
accuracies = []
for i in range(12): 
    print(f"Zeroing out attention head {i}.")
    preds, labels = evaluate_and_zero_head(nn_model, hx_test_loader, i, 'cuda')

    preds, labels = preds.cpu().numpy(), labels.cpu().numpy()
    accuracy = accuracy_score(labels, preds)
    accuracies.append(accuracy)
    print(f"  - Accuracy: {accuracy}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Zeroing out attention head 0.
  - Accuracy: 0.6923076923076923
Zeroing out attention head 1.
  - Accuracy: 0.5384615384615384
Zeroing out attention head 2.
  - Accuracy: 0.5384615384615384
Zeroing out attention head 3.
  - Accuracy: 0.5384615384615384
Zeroing out attention head 4.
  - Accuracy: 0.6923076923076923
Zeroing out attention head 5.
  - Accuracy: 0.6153846153846154
Zeroing out attention head 6.
  - Accuracy: 0.9230769230769231
Zeroing out attention head 7.
  - Accuracy: 0.6153846153846154
Zeroing out attention head 8.
  - Accuracy: 0.6923076923076923
Zeroing out attention head 9.
  - Accuracy: 0.6923076923076923
Zeroing out attention head 10.
  - Accuracy: 0.7692307692307693
Zeroing out attention head 11.
  - Accuracy: 0.6153846153846154


In [10]:
ih_train_text, ih_test_text, ih_train_labels, ih_test_labels = train_test_split(implicit_hate['post'], implicit_hate['class_label'], test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained(bert, clean_up_tokenization_spaces=True)
ih_train = tokenize_data(ih_train_text, ih_train_labels, tokenizer, 512)
ih_test = tokenize_data(ih_test_text, ih_test_labels, tokenizer, 512)

ih_train_loader = DataLoader(ih_train, batch_size=16, shuffle=True)
ih_test_loader = DataLoader(ih_test, batch_size=16, shuffle=True)

ih_train_loader.__len__(), ih_test_loader.__len__()

(318, 80)

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(bert, num_labels=7)
model = freeze_bert_layers(model, ['encoder.layer.11'])
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
train(model, ih_train_loader, optimizer, 5, 'cuda')
model.save_pretrained("BERT/bert_ih_baseline")
preds, labels = evaluate(model, ih_test_loader, 'cuda')
ih_report = accuracy_score(labels, preds)
print(f'Baseline Implicit Hate Accuracy: {ih_report}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Loss: 1.5557514429092407
Epoch 2 Loss: 1.1465587615966797
Epoch 3 Loss: 0.9820433855056763
Epoch 4 Loss: 1.1998872756958008
Epoch 5 Loss: 1.2950735092163086
Baseline Implicit Hate Accuracy: 0.6236220472440945


In [11]:
model = AutoModelForSequenceClassification.from_pretrained('BERT/bert_full', num_labels=7, ignore_mismatched_sizes=True)
model = freeze_bert_layers(model, ['encoder.layer.11'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BERT/bert_full and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Transfer learning 
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
train(model, ih_train_loader, optimizer, 5, 'cuda')
model.save_pretrained("BERT/transfer_model")

Epoch 1 Loss: 1.5082802772521973
Epoch 2 Loss: 1.8577481508255005
Epoch 3 Loss: 0.7282094955444336
Epoch 4 Loss: 1.629270315170288
Epoch 5 Loss: 1.1496150493621826


In [16]:
preds, labels = evaluate(model, ih_test_loader, 'cuda')
report = accuracy_score(labels, preds)
print(f"Transfer learning accuracy: {report}")

Transfer learning accuracy: 0.6070866141732284


In [17]:
# is one head more important - dropping head -> dropping head in overall 