In [None]:
# Library for Data Preparation
import pandas as pd
import re

# Libraries for the classification model
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    set_seed,
    GPT2Config,
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
    AdamW,
    get_cosine_schedule_with_warmup,
    TrainingArguments,
    Trainer,
)

from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load data

In [None]:
# Download dataset
! git clone https://github.com/rewire-online/edos.git

In [None]:
# Paths to the CSV files inside the cloned 'edos' repository
csv_labelled_aggregated = 'edos/data/edos_labelled_aggregated.csv'
csv_labelled_individual_annotations = 'edos/data/edos_labelled_individual_annotations.csv'
csv_gab_unlabelled = 'edos/data/gab_1M_unlabelled.csv'
csv_reddit_unlabelled = 'edos/data/reddit_1M_unlabelled.csv'

In [None]:
# Load the aggregated labelled CSV and preview its first rows
data_labelled_1 = pd.read_csv(filepath_or_buffer=csv_labelled_aggregated)
data_labelled_1.head(n=5)

In [None]:
# Load the per-annotator labelled CSV, discard its 'annotator' column,
# and preview the first rows
data_labelled_2 = (
    pd.read_csv(csv_labelled_individual_annotations)
    .drop(columns=['annotator'])
)
data_labelled_2.head()

In [None]:
# Stack data_labelled_1 and data_labelled_2 into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# row labels of the two inputs (each numbered from 0 by read_csv) collide.
# NOTE(review): if the individual-annotation file repeats the texts of the
# aggregated file (one row per annotator), this concat duplicates every
# example in each split, possibly with conflicting labels — confirm intended.
data_labelled = pd.concat([data_labelled_1, data_labelled_2], ignore_index=True)
data_labelled.count()

In [None]:
# Task A uses three columns: the text, the 'label_sexist' label and the split name
selected_columns_taskA = ['text', 'label_sexist', 'split']
data_labelled_taskA = data_labelled.loc[:, selected_columns_taskA]
data_labelled_taskA.head()

In [None]:
# Split into Train, Validate and Test using the dataset's own 'split' column
def _rows_for_split(frame, split_name):
    """Return the rows of `frame` whose 'split' equals `split_name`, minus the 'split' column."""
    matching_rows = frame[frame['split'] == split_name]
    return matching_rows.drop('split', axis=1)


data_train_taskA = _rows_for_split(data_labelled_taskA, 'train')
print("Train Data:\n", data_train_taskA.count(), "\n")

data_val_taskA = _rows_for_split(data_labelled_taskA, 'dev')
print("Validation Data:\n", data_val_taskA.count(), "\n")

data_test_taskA = _rows_for_split(data_labelled_taskA, 'test')
print("Test Data:\n", data_test_taskA.count())

## Preprocessing

In [None]:
def clean_text(text):
    """Normalise a raw post into plain alphanumeric words separated by single spaces.

    Steps, in order:
      1. drop the dataset placeholders ``[URL]`` and ``[USER]``;
      2. drop ``@mentions`` and ``http(s)://`` links;
      3. turn every character outside ``a-z A-Z 0-9 . ! ? '`` into a space
         (this also covers tabs and all non-ASCII / mis-encoded characters);
      4. delete the remaining punctuation without inserting a space, so that
         ``don't`` becomes ``dont``;
      5. collapse runs of spaces and trim both ends.

    Args:
        text: the input string.

    Returns:
        The cleaned string (case is left untouched).
    """
    # Placeholders must go first: once the brackets are stripped below,
    # "[URL]" would survive as the literal word "URL".
    text = text.replace("[URL]", " ").replace("[USER]", " ")
    text = re.sub(r"@[A-Za-z0-9]+", " ", text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)
    # A-Z (not A-z): the wider range would also admit [ \ ] ^ _ and backtick.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse last, because deleting punctuation can leave adjacent spaces.
    return re.sub(r" +", " ", text).strip()

# Remove square-bracketed spans (the brackets and everything between them)
def remove_between_square_brackets(text):
    """Return `text` with every ``[...]`` span deleted, brackets included."""
    # Raw string so the backslashes reach the regex engine as written,
    # without relying on Python's invalid-escape fallback for '\['.
    return re.sub(r'\[[^]]*\]', '', text)

# Strip noisy spans from the text
def denoise_text(text):
    """Return `text` after passing it through remove_between_square_brackets."""
    return remove_between_square_brackets(text)

# Lowercase a text
def to_lowercase(input_text):
    """Return the result of calling ``lower()`` on `input_text`."""
    lowered = input_text.lower()
    return lowered

In [None]:
# Run the same three-step pipeline (clean -> denoise -> lowercase) on the
# 'text' column of every split.
# NOTE(review): clean_text already strips '[' and ']' characters, so
# denoise_text appears to have nothing left to remove at this point —
# confirm the intended order of the steps.
for _split_frame in (data_train_taskA, data_val_taskA, data_test_taskA):
    _split_frame['text'] = (
        _split_frame['text']
        .apply(clean_text)
        .apply(denoise_text)
        .apply(to_lowercase)
    )

In [None]:
class SeximsDataset(Dataset):
    """Dataset view over a DataFrame that has 'text' and 'label_sexist' columns.

    Each item is a dict ``{'text': <text>, 'label': <0 or 1>}`` where the label
    is 0 when 'label_sexist' equals 'not sexist' and 1 for any other value.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup, so the frame's index labels are irrelevant.
        row = self.data.iloc[index]
        return {
            'text': row['text'],
            'label': 0 if row['label_sexist'] == 'not sexist' else 1,
        }

In [None]:
# Wrap each preprocessed split in a SeximsDataset
train_dataset, val_dataset, test_dataset = (
    SeximsDataset(split_frame)
    for split_frame in (data_train_taskA, data_val_taskA, data_test_taskA)
)

# Train GPT-2 for Task A

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Pad on the right. GPT-2 uses absolute position embeddings, so left padding
# shifts the positions of the real tokens, and the sequence-classification
# head pools the last non-padding token of each row, which older transformers
# releases locate incorrectly when the padding is on the left.
tokenizer.padding_side = "right"
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
class Gpt2ClassificationCollator:
    """Collate function that turns a list of ``{'text', 'label'}`` records into one batch.

    Args:
        tokenizer: callable used to encode the list of texts.
        max_seq_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, max_seq_len=None):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, sequences):
        """Tokenize the texts with padding and truncation and attach a 'labels' tensor."""
        batch_texts = [item['text'] for item in sequences]
        batch_labels = [int(item['label']) for item in sequences]
        encoded = self.tokenizer(
            text=batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_seq_len,
        )
        encoded['labels'] = torch.tensor(batch_labels)
        return encoded

In [None]:
# Shared collate function; the collator forwards this value to the tokenizer
# as max_length with truncation enabled.
MAX_SEQ_LEN = 256
gpt2_classification_collator = Gpt2ClassificationCollator(
    tokenizer=tokenizer,
    max_seq_len=MAX_SEQ_LEN,
)

In [None]:
# Options shared by all three loaders; only the training loader shuffles.
_loader_options = dict(batch_size=32, collate_fn=gpt2_classification_collator)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **_loader_options)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, **_loader_options)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **_loader_options)

In [None]:
# Seed the random number generators before the model is created
# (presumably so that any freshly initialised weights are reproducible — confirm).
set_seed(36)
# GPT-2 configuration with a two-label classification head
model_config = GPT2Config.from_pretrained("gpt2", num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=model_config)
# Size the embedding matrix to the tokenizer's vocabulary.
# NOTE(review): the visible cells add no new tokens to the tokenizer (the pad
# token reuses EOS), so this looks like a no-op — confirm.
model.resize_token_embeddings(len(tokenizer))
# Use the EOS id as the padding id, mirroring tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [None]:
TOTAL_EPOCHS = 5

# Split the parameters into a weight-decayed group and an exempt group.
# Exempt: every bias and every LayerNorm weight. GPT-2 names its LayerNorm
# modules ln_1 / ln_2 / ln_f, so the BERT-style "LayerNorm.*" patterns alone
# never match a GPT-2 parameter; they are kept for other architectures.
param_optimizer = list(model.named_parameters())
no_decay = [
    "bias",
    "LayerNorm.bias",
    "LayerNorm.weight",
    "ln_1.weight",
    "ln_2.weight",
    "ln_f.weight",
]
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    }
]
# torch.optim.AdamW replaces the deprecated transformers.AdamW; the learning
# rate, epsilon and per-group weight decay are unchanged.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=1e-5,
    eps=1e-8,
)

# One scheduler step per batch: linear warm-up over the first 10% of the
# steps, then cosine decay over the rest.
num_training_steps = TOTAL_EPOCHS * len(train_dataloader)
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
def train(dataloader, optimizer, scheduler, device_):
    """Run one training pass over `dataloader` using the global `model`.

    Args:
        dataloader: iterable of batches (mappings of tensors) containing a 'labels' entry.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device_: device the batch tensors are moved to.

    Returns:
        Tuple ``(true_labels, predicted_labels, losses)``: two flat lists with
        one entry per example and a list with one loss value per batch.
    """
    global model
    model.train()

    gold, predicted, step_losses = [], [], []

    for batch in dataloader:
        # Record the labels while the batch is still on the CPU.
        gold.extend(batch['labels'].numpy().flatten().tolist())
        batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

        outputs = model(**batch)
        loss, logits = outputs[:2]
        logits_np = logits.detach().cpu().numpy()
        step_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        predicted.extend(logits_np.argmax(axis=-1).flatten().tolist())

    return gold, predicted, step_losses

def validation(dataloader, device_):
    """Evaluate the global `model` on `dataloader` without computing gradients.

    Args:
        dataloader: iterable of batches (mappings of tensors) containing a 'labels' entry.
        device_: device the batch tensors are moved to.

    Returns:
        Tuple ``(true_labels, predicted_labels, losses)``: two flat lists with
        one entry per example and a list with one loss value per batch.
    """
    global model
    model.eval()

    gold, predicted, step_losses = [], [], []

    for batch in dataloader:
        # Record the labels while the batch is still on the CPU.
        gold.extend(batch['labels'].numpy().flatten().tolist())
        batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            logits_np = logits.detach().cpu().numpy()
            step_losses.append(loss.item())

        predicted.extend(logits_np.argmax(axis=-1).flatten().tolist())

    return gold, predicted, step_losses

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Per-batch losses and per-epoch accuracies, accumulated across all epochs
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])

for epoch in range(TOTAL_EPOCHS):
    y, y_pred, train_loss = train(train_dataloader, optimizer, lr_scheduler, device)
    train_acc = accuracy_score(y, y_pred)

    y, y_pred, val_loss = validation(val_dataloader, device)
    val_acc = accuracy_score(y, y_pred)

    all_loss['train_loss'].extend(train_loss)
    all_loss['val_loss'].extend(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

    mean_train_loss = torch.tensor(train_loss).mean()
    mean_val_loss = torch.tensor(val_loss).mean()
    print(
        f'Epoch: {epoch}, train_loss: {mean_train_loss:.3f}, train_acc: {train_acc:.3f}, '
        f'val_loss: {mean_val_loss:.3f}, val_acc: {val_acc:.3f}'
    )

    # Write the model and tokenizer to the current directory after every epoch.
    model.save_pretrained('./')
    tokenizer.save_pretrained('./')

In [None]:
# Three stacked panels: per-batch training loss, per-batch validation loss,
# and per-epoch accuracy. Every panel gets a title, axis labels and a legend
# so it can be read on its own.
fig, (a, b, c) = plt.subplots(3, 1, figsize=(20, 15))

a.plot(all_loss['train_loss'], label='Train Loss')
a.set(title='Training loss', xlabel='Batch', ylabel='Loss')
a.legend()

b.plot(all_loss['val_loss'], label='Val Loss')
b.set(title='Validation loss', xlabel='Batch', ylabel='Loss')
b.legend()

c.plot(all_acc['train_acc'], label='Train Accuracy')
c.plot(all_acc['val_acc'], label='Val Accuracy')
c.set(title='Accuracy', xlabel='Epochs', ylabel='Accuracy')
c.legend()

fig.tight_layout();

In [None]:
# Evaluate the current model on the test split
test_y, test_y_pred, test_val_loss = validation(test_dataloader, device)
test_val_acc = accuracy_score(test_y, test_y_pred)
mean_test_loss = torch.tensor(test_val_loss).mean()
print(f'Test Loss: {mean_test_loss:.3f}, Test Accuracy: {test_val_acc:.3f}')