In [25]:
import numpy as np
import pandas as pd
import pickle
import argparse
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
import os
import random
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

#PYTORCH_ENABLE_MPS_FALLBACK=1
# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports a usable GPU, plain CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# device = torch.device("mps")  # manual override kept from the original cell
device

device(type='cpu')

In [26]:
class SexistDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (anything exposing ``.items()``).
        labels: Indexable collection of labels; its length defines the size
            of the dataset (one label per example is assumed).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection is the source of truth for the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Take position ``idx`` from every encoding field, then attach the
        # matching label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [27]:
class WeightedTrainer(Trainer):
    """``Trainer`` whose training/eval loss is a class-weighted cross-entropy.

    Args:
        class_weights: One weight per class, in class-index order (any
            sequence or array accepted by ``torch.as_tensor``).
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, **kwargs):
        # Build the criterion without pinning it to a device: the device the
        # model ends up on is decided by the Trainer / TrainingArguments, so
        # the weights are moved next to the logits in ``compute_loss``.
        weights = torch.as_tensor(class_weights, dtype=torch.float32)
        self.weighted_loss = torch.nn.CrossEntropyLoss(weight=weights)
        super().__init__(**kwargs)

    def compute_loss(self, model, inputs, num_items_in_batch=False, return_outputs=False):
        """Run the model on ``inputs`` and return the weighted cross-entropy loss.

        Args:
            model: Callable model; invoked as ``model(**inputs)`` after the
                labels have been removed from ``inputs``.
            inputs: Batch dict containing a ``"labels"`` entry. The entry is
                popped, so the dict is modified in place.
            num_items_in_batch: Accepted for compatibility with the caller;
                not used by this implementation.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.

        NOTE(review): upstream ``Trainer.compute_loss`` appears to declare
        ``return_outputs`` before ``num_items_in_batch``; the order here is
        kept as-is, so these two should only be passed by keyword.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Labels were not passed to the model, so element 0 is the logits
        # rather than a model-computed loss.
        logits = outputs[0]
        # Keep the class weights on the same device as the logits; a
        # mismatch would make the loss call raise.
        self.weighted_loss.to(logits.device)
        loss = self.weighted_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [28]:
def create_datasets(data_dir):
    """Load the train/valid/test CSV splits and tokenize them for BERT.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from ``data_dir``,
    drops every row containing a missing value in any column, and tokenizes
    the ``text`` column with the ``bert-base-uncased`` tokenizer extended by
    the ``[USER]``, ``[EMOJI]`` and ``[URL]`` special tokens.

    Args:
        data_dir: Directory containing the three CSV files. Each file must
            provide a ``text`` and a ``label_sexist`` column.

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset, vocab_size)``
        where the datasets are ``SexistDataset`` instances and ``vocab_size``
        is ``len(tokenizer)`` after the special tokens were added (the
        tokenizer itself is not returned).
    """

    def _read_split(split_name):
        # Returns (texts, labels) for one split with incomplete rows removed.
        frame = pd.read_csv(data_dir + "/" + split_name + ".csv").dropna()
        texts = frame['text'].astype("string").tolist()
        labels = frame['label_sexist'].astype("int").tolist()
        return texts, labels

    # Read all CSVs first so a missing file fails before the tokenizer loads.
    split_names = ("train", "valid", "test")
    raw_splits = [_read_split(split_name) for split_name in split_names]

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # add special tokens for URLs, emojis and mentions (--> see pre-processing)
    tokenizer.add_special_tokens({'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']})

    datasets = []
    for texts, labels in raw_splits:
        # Encodings stay on the CPU: the Trainer moves each batch to its own
        # device, so copying whole padded splits to an accelerator up front
        # only costs memory and can disagree with the Trainer's device.
        encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        datasets.append(SexistDataset(encodings, labels))

    train_dataset, valid_dataset, test_dataset = datasets
    return train_dataset, valid_dataset, test_dataset, len(tokenizer)

In [29]:
def calculate_class_weights(data_dir):
    """Compute 'balanced' class weights from the training split.

    Reads ``train.csv`` from ``data_dir`` and applies the same cleaning that
    ``create_datasets`` applies to the training data (drop rows with any
    missing value, cast ``label_sexist`` to ``int``), so the weights describe
    the rows that are actually trained on. Without this, a missing label
    would show up as an extra class and change the length of the weight
    vector.

    Args:
        data_dir: Directory containing ``train.csv`` with a ``label_sexist``
            column.

    Returns:
        Array of weights, one per distinct label value in ascending order,
        as returned by ``compute_class_weight('balanced', ...)``. The weights
        are also printed.
    """
    train_df = pd.read_csv(data_dir + "/train.csv").dropna()
    train_labels = train_df['label_sexist'].astype("int").to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    print("class weights are {}".format(class_weights))
    return class_weights


def train_model(train_dataset, valid_dataset, tok_len,  class_weights, output_dir, learning_rate, num_epochs, batch_size):
    """Fine-tune ``bert-base-uncased`` with a class-weighted loss on the CPU.

    Args:
        train_dataset: Dataset used for training.
        valid_dataset: Dataset registered as the trainer's ``eval_dataset``.
        tok_len: Tokenizer vocabulary size; the model's token embeddings are
            resized to this value.
        class_weights: Per-class weights handed to ``WeightedTrainer``.
        output_dir: Directory where checkpoints are written.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The ``WeightedTrainer`` instance after ``train()`` has completed.
    """
    # Local import: only specific names from transformers are imported at the
    # top of the notebook, and set_seed is not among them.
    from transformers import set_seed

    training_args = TrainingArguments(
        save_steps=2500,  # checkpoint interval, in optimizer steps
        output_dir=output_dir,  # output directory
        num_train_epochs=num_epochs,  # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        learning_rate=learning_rate,
        seed=123,
        # Force CPU training. ``use_cpu`` supersedes the deprecated
        # ``no_cuda=True`` / ``use_mps_device=False`` pair.
        use_cpu=True,
        dataloader_pin_memory=False,
    )
    # Report the device the Trainer will actually use.
    print(training_args.device)

    # The classification head and the embeddings added by the resize below
    # are randomly initialised. Seed *before* building the model; the Trainer
    # only seeds in its own constructor, which is too late for these weights.
    set_seed(training_args.seed)
    # No explicit .to(...): the Trainer places the model on its own device.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    model.resize_token_embeddings(tok_len)

    trainer = WeightedTrainer(
        model=model,
        class_weights=class_weights,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset
    )
    # To continue an interrupted run instead, call
    # trainer.train(resume_from_checkpoint=True).
    trainer.train()

    return trainer

In [30]:
# --- Paths -------------------------------------------------------------------
output_dir = "./Model/"
dataset_dir = "../Data/"
# (commented-out alternatives kept from the original cell)
# datasets = ["CAD_hate", "CAD_abuse", "Founta_hate", "Founta_abuse", "Davidson_hate", "Davidson_abuse"]
dataset = "data_sexism"

# --- Hyper-parameters --------------------------------------------------------
num_epochs, batch_size, learning_rate = 2, 16, 5e-5

# Per-dataset input and output directories.
dd_dir = f"{dataset_dir}{dataset}"
oo_dir = f"{output_dir}{dataset}"

# --- Data and class weights --------------------------------------------------
train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir)
class_weights = calculate_class_weights(dd_dir)

# --- Fine-tune and persist the model -----------------------------------------
trainer = train_model(
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    tok_len=tok_len,
    class_weights=class_weights,
    output_dir=oo_dir,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

trainer.save_model(oo_dir)

print("Training done, evaluating...")

# Validation split: argmax over the class axis of the predicted logits gives
# the predicted label for each example.
valid_preds = np.argmax(trainer.predict(valid_dataset).predictions, axis=1)
valid_labels = np.array(valid_dataset.labels)

cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
# Use a context manager so the report file is flushed and closed right away.
with open(oo_dir + "/cls_report_valid.pickle", "wb") as report_file:
    pickle.dump(cls_report_valid, report_file)

# Test split: same procedure as for the validation split.
test_preds = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
test_labels = np.array(test_dataset.labels)

cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
with open(oo_dir + "/cls_report_test.pickle", "wb") as report_file:
    pickle.dump(cls_report_test, report_file)

class weights are [0.66025278 2.06003531]
cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5532
1000,0.3951
1500,0.2906


Training done, evaluating...
