In [7]:
import numpy as np
import pandas as pd
import pickle
import argparse
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
import os
import random
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

#PYTORCH_ENABLE_MPS_FALLBACK=1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("mps")
device

device(type='cuda')

In [8]:
class SexistDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, **kwargs):
       # self.class_weights = torch.FloatTensor(class_weights)
        self.weighted_loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights)).to(device)
        super().__init__(**kwargs)

    def compute_loss(self, model, inputs, num_items_in_batch=False, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        loss = self.weighted_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        else:
            return loss

In [10]:
def create_datasets(data_dir):
    
    train_df = pd.read_csv(data_dir + "/train.csv")
    train_df = train_df.dropna()
    valid_df = pd.read_csv(data_dir + "/valid.csv")
    valid_df = valid_df.dropna()
    test_df = pd.read_csv(data_dir + "/test.csv")
    test_df = test_df.dropna()

    train_texts = train_df['text'].astype("string").tolist()
    valid_texts = valid_df['text'].astype("string").tolist()
    test_texts = test_df['text'].astype("string").tolist()

    train_labels = train_df['label_sexist'].astype("int").tolist()
    valid_labels = valid_df['label_sexist'].astype("int").tolist()
    test_labels = test_df['label_sexist'].astype("int").tolist()

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # add special tokens for URLs, emojis and mentions (--> see pre-processing)
    special_tokens_dict = {'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt").to(device)
    valid_encodings = tokenizer(valid_texts, padding=True, truncation=True, return_tensors="pt").to(device)
    test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt").to(device)

    train_dataset = SexistDataset(train_encodings, train_labels)
    valid_dataset = SexistDataset(valid_encodings, valid_labels)
    test_dataset = SexistDataset(test_encodings, test_labels)

    return train_dataset, valid_dataset, test_dataset, len(tokenizer)

In [13]:
from transformers import TrainerCallback

class LossLoggingCallback(TrainerCallback):
    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            self.losses.append(logs['loss'])

In [41]:
def calculate_class_weights(data_dir):
    dataset = pd.read_csv(data_dir + "/train.csv")
    train_labels = dataset.label_sexist.to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    print("class weights are {}".format(class_weights))
    return class_weights


def train_model(train_dataset, valid_dataset, tok_len,  class_weights, output_dir, learning_rate, num_epochs, batch_size):
    training_args = TrainingArguments(
        save_steps=2500,
        output_dir=output_dir,  # output directory
        num_train_epochs=num_epochs,  # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        learning_rate=learning_rate,
        seed=123,
        use_mps_device=False, 
        dataloader_pin_memory=False
    )
    print(device)
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
    model.resize_token_embeddings(tok_len)

    loss_logging_callback = LossLoggingCallback()

    trainer = WeightedTrainer(
        model=model,
        class_weights=class_weights,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset, 
        callbacks=[loss_logging_callback] 
    )
    trainer.train()
    
    #try:
    #    trainer.train(resume_from_checkpoint=True)
    #    print("resuming from checkpoint...")
    #except ValueError:
    #    print("No checkpoints found. training from scratch...")
    #    trainer.train()

    return trainer, loss_logging_callback.losses

In [17]:
output_dir = "./Model/"
dataset_dir = "./"
#datasets = ["CAD_hate", "CAD_abuse", "Founta_hate", "Founta_abuse", "Davidson_hate", "Davidson_abuse"]
dataset = "data_sexism"

num_epochs = 5
batch_size = 16
learning_rate = 5e-5

dd_dir = dataset_dir + dataset
oo_dir = output_dir + dataset

train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir)
class_weights = calculate_class_weights(dd_dir)

trainer, losses = train_model(train_dataset,
                        valid_dataset,
                        tok_len,
                        class_weights,
                        oo_dir,
                        learning_rate,
                        num_epochs,
                        batch_size)
    
trainer.save_model(oo_dir)

print("Training done, evaluating...")
valid_preds = np.argmax(trainer.predict(valid_dataset)[0], axis=1) #should be numpy ndarray
valid_labels = np.array(valid_dataset.labels)

cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
pickle.dump(cls_report_valid, open(oo_dir + "/cls_report_valid.pickle", "wb"))

test_preds = np.argmax(trainer.predict(test_dataset)[0], axis=1)
test_labels = np.array(test_dataset.labels)

cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
pickle.dump(cls_report_test, open(oo_dir + "/cls_report_test.pickle", "wb"))

class weights are [0.66025278 2.06003531]
cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5553
1000,0.4143
1500,0.3323
2000,0.2497
2500,0.1821
3000,0.1011
3500,0.0599
4000,0.0256


Training done, evaluating...


In [19]:
print(cls_report_valid)
print(cls_report_test)

{'0': {'precision': 0.9020486555697823, 'recall': 0.9306472919418758, 'f1-score': 0.9161248374512354, 'support': 1514.0}, '1': {'precision': 0.7602739726027398, 'recall': 0.6851851851851852, 'f1-score': 0.7207792207792207, 'support': 486.0}, 'accuracy': 0.871, 'macro avg': {'precision': 0.831161314086261, 'recall': 0.8079162385635306, 'f1-score': 0.8184520291152281, 'support': 2000.0}, 'weighted avg': {'precision': 0.867597407608791, 'recall': 0.871, 'f1-score': 0.8686558525999357, 'support': 2000.0}}
{'0': {'precision': 0.9062197873026103, 'recall': 0.928052805280528, 'f1-score': 0.9170063590412523, 'support': 3030.0}, '1': {'precision': 0.7569676700111483, 'recall': 0.7, 'f1-score': 0.7273701124799143, 'support': 970.0}, 'accuracy': 0.87275, 'macro avg': {'precision': 0.8315937286568793, 'recall': 0.8140264026402639, 'f1-score': 0.8221882357605833, 'support': 4000.0}, 'weighted avg': {'precision': 0.8700261488594309, 'recall': 0.87275, 'f1-score': 0.8710195692501279, 'support': 4000.

In [43]:
training_args = TrainingArguments(
        save_steps=2500,
        output_dir=output_dir,  # output directory
        num_train_epochs=num_epochs,  # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        learning_rate=learning_rate,
        seed=123,
        use_mps_device=False, 
        dataloader_pin_memory=False
    )

In [47]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

model = BertForSequenceClassification.from_pretrained("Model/data_sexism-s/checkpoint-2625")
model.resize_token_embeddings(len(tokenizer))
model.eval()
trainer = WeightedTrainer(
        model=model,
        class_weights=class_weights,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset
    )

In [48]:
valid_preds = np.argmax(trainer.predict(valid_dataset)[0], axis=1) #should be numpy ndarray
valid_labels = np.array(valid_dataset.labels)

cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
print(cls_report_valid)

test_preds = np.argmax(trainer.predict(test_dataset)[0], axis=1)
test_labels = np.array(test_dataset.labels)

cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
print(cls_report_test)

{'0': {'precision': 0.9003856041131105, 'recall': 0.9253632760898283, 'f1-score': 0.9127035830618893, 'support': 1514.0}, '1': {'precision': 0.7454954954954955, 'recall': 0.6810699588477366, 'f1-score': 0.7118279569892473, 'support': 486.0}, 'accuracy': 0.866, 'macro avg': {'precision': 0.822940549804303, 'recall': 0.8032166174687825, 'f1-score': 0.8122657700255682, 'support': 2000.0}, 'weighted avg': {'precision': 0.8627473077190301, 'recall': 0.866, 'f1-score': 0.8638908059262372, 'support': 2000.0}}


{'0': {'precision': 0.912672357189757, 'recall': 0.9174917491749175, 'f1-score': 0.9150757077024358, 'support': 3030.0}, '1': {'precision': 0.7379454926624738, 'recall': 0.7257731958762886, 'f1-score': 0.7318087318087318, 'support': 970.0}, 'accuracy': 0.871, 'macro avg': {'precision': 0.8253089249261154, 'recall': 0.821632472525603, 'f1-score': 0.8234422197555838, 'support': 4000.0}, 'weighted avg': {'precision': 0.8703010925418908, 'recall': 0.871, 'f1-score': 0.8706334660482126, 'support': 4000.0}}


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

model = BertForSequenceClassification.from_pretrained("Model/data_sexism-s/checkpoint-2500")
model.resize_token_embeddings(len(tokenizer))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30525, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
device='cpu'
output_dir = "./Model/"
dataset_dir = "./"
#datasets = ["CAD_hate", "CAD_abuse", "Founta_hate", "Founta_abuse", "Davidson_hate", "Davidson_abuse"]
dataset = "data_sexism"

num_epochs = 2
batch_size = 16
learning_rate = 5e-5

dd_dir = dataset_dir + dataset
oo_dir = output_dir + dataset

train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir)

In [26]:
test_dataset.encodings

{'input_ids': tensor([[  101,  6616,  1996,  ...,     0,     0,     0],
        [  101,  2092,  2059,  ...,     0,     0,     0],
        [  101,  1001,  3915,  ...,     0,     0,     0],
        ...,
        [  101,  2043,  2017,  ...,     0,     0,     0],
        [  101,  2017,  2323,  ...,     0,     0,     0],
        [  101,  2023, 12170,  ...,     0,     0,     0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}