In [1]:
# Mount Google Drive at /content/drive (Colab-only); the data and output
# paths used further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import pickle
import argparse
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

import os
import random
#os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pick the CUDA device when one is available, otherwise fall back to the CPU,
# and print the choice so it is visible in the cell output.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [3]:
class HateDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a container indexable by
            example position.
        labels: Indexable sequence of labels; its length is the dataset
            length (assumed to be aligned with ``encodings``).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # Take position ``idx`` of every encoding field, then attach the
        # matching label under the key "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample


In [4]:
class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is a class-weighted cross-entropy.

    Args:
        class_weights: One weight per class (array-like of numbers); used as
            the ``weight`` argument of ``torch.nn.CrossEntropyLoss``.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, **kwargs):
        weights = torch.as_tensor(class_weights, dtype=torch.float)
        # The loss module is deliberately NOT pinned to a device here; it is
        # moved next to the logits in compute_loss, so it always matches the
        # device the model actually runs on.
        self.weighted_loss = torch.nn.CrossEntropyLoss(weight=weights)
        super().__init__(**kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy for one batch.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass, so the first element of the model output is
                read as the logits.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted and ignored (e.g. extra arguments a newer
                ``Trainer`` may pass).

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        # Module.to() is in place and a no-op once the weights already live
        # on the logits' device.
        loss_fn = self.weighted_loss.to(logits.device)
        loss = loss_fn(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss


In [5]:
def _read_split(data_dir, split):
    """Read ``<data_dir>/<split>.csv`` and return ``(texts, labels)`` lists."""
    frame = pd.read_csv(data_dir + "/" + split + ".csv", engine="python")
    # Dropping rows with a missing value in ANY column also removes rows
    # whose label is missing, so no separate label-only pass is needed.
    frame = frame.dropna()
    texts = frame['text'].astype("string").tolist()
    labels = frame['label'].astype("int").tolist()
    return texts, labels


def create_datasets(data_dir):
    """Load, tokenize and wrap the train/valid/test splits found in ``data_dir``.

    Expects ``train.csv``, ``valid.csv`` and ``test.csv`` inside ``data_dir``,
    each with a ``text`` and a ``label`` column. Rows containing any missing
    value are discarded.

    Args:
        data_dir: Directory path (string) holding the three CSV files.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset, vocab_size)`` where the
        datasets are ``HateDataset`` instances and ``vocab_size`` is
        ``len(tokenizer)`` after the extra special tokens were added (needed
        to resize the model's token embeddings).
    """
    # Read every split before touching the tokenizer so that a missing CSV
    # fails fast.
    raw_splits = [_read_split(data_dir, split) for split in ("train", "valid", "test")]

    # tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    # add special tokens for URLs, emojis and mentions (--> see pre-processing)
    tokenizer.add_special_tokens({'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']})

    wrapped = []
    for texts, labels in raw_splits:
        # Each split is padded to its own longest sequence and kept as CPU
        # tensors.
        encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        wrapped.append(HateDataset(encodings, labels))

    train_dataset, valid_dataset, test_dataset = wrapped
    return train_dataset, valid_dataset, test_dataset, len(tokenizer)

In [6]:
def calculate_class_weights(data_dir):
    """Compute 'balanced' class weights from the training split.

    Args:
        data_dir: Directory path (string) containing ``train.csv`` with a
            ``label`` column.

    Returns:
        The array returned by scikit-learn's ``compute_class_weight`` in
        ``'balanced'`` mode, one weight per distinct label in sorted label
        order. The weights are also printed.
    """
    train_df = pd.read_csv(data_dir + "/train.csv", engine="python")
    # Apply the same filter create_datasets uses (drop rows with ANY missing
    # value) so the weights describe exactly the examples that are trained on,
    # not a superset that still contains rows with e.g. missing text.
    train_df = train_df.dropna()
    train_labels = train_df['label'].astype("int").to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    print("class weights are {}".format(class_weights))
    return class_weights


def train_model(train_dataset, valid_dataset, tok_len,  class_weights, output_dir, learning_rate, num_epochs, batch_size):
    """Fine-tune ``roberta-base`` for sequence classification with a weighted loss.

    Training resumes from the newest checkpoint in ``output_dir`` when one
    exists; otherwise it starts from the pretrained weights.

    Args:
        train_dataset: Dataset used for training.
        valid_dataset: Dataset evaluated every 250 steps during training.
        tok_len: Tokenizer vocabulary size; the token embedding matrix is
            resized to this value.
        class_weights: One loss weight per class; its length also sets the
            size of the classification head.
        output_dir: Directory for checkpoints and logs (``<output_dir>/logs``).
        learning_rate: Learning rate passed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The ``WeightedTrainer`` after training has finished.
    """
    # Imported locally because the notebook's import cell does not bring this
    # helper into scope.
    from transformers.trainer_utils import get_last_checkpoint

    training_args = TrainingArguments(
        save_steps=5000,
        output_dir=output_dir,  # output directory
        num_train_epochs=num_epochs,  # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=1000,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        learning_rate=learning_rate,
        seed=123,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=50,          # log every 50 steps
        eval_strategy="steps",
        eval_steps=250,
    )

    # model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    # model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    # The head must emit one logit per weighted class, otherwise the weighted
    # cross-entropy rejects the weight vector.
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(class_weights))
    model = model.to(DEVICE)
    model.resize_token_embeddings(tok_len)

    trainer = WeightedTrainer(
        model=model,
        class_weights=class_weights,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset
    )

    # Decide up front whether there is something to resume from. Wrapping the
    # whole train() call in `except ValueError` would also catch errors raised
    # in the middle of training and silently restart from scratch.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    if last_checkpoint is None:
        print("No checkpoints found. training from scratch...")
        trainer.train()
    else:
        # Announce the resume before training starts, not after it ends.
        print("resuming from checkpoint...")
        trainer.train(resume_from_checkpoint=last_checkpoint)

    return trainer

In [16]:
# Train one classifier per dataset, save it, and store validation/test
# classification reports next to the weights.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept so
# other cells that read it keep working.
dir = "/content/drive/MyDrive/NLP/"
dataset_dir = dir + "Data/"
# datasets = ["CAD_hate", "CAD_abuse", "Founta_hate", "Founta_abuse", "Davidson_hate", "Davidson_abuse"]
# datasets = ["CAD_hate", "CAD_abuse"]
# datasets = ["CAD_hate", "CAD_abuse", "Davidson_hate", "Davidson_abuse", "Dynamic_hate", "Measuring_dat_hate", "Measuring_dat_abuse"]
datasets = [ "Dynamic_hate", "Measuring_dat_hate", "Measuring_dat_abuse"]

output_dir = dir + "Weights/eng_classif_RoBerta/"
num_epochs = 5
batch_size = 16
learning_rate = 5e-5

for dataset in datasets:
    dd_dir = dataset_dir + dataset  # input folder holding train/valid/test CSVs
    oo_dir = output_dir + dataset   # output folder for weights and reports

    train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir)
    class_weights = calculate_class_weights(dd_dir)

    trainer = train_model(train_dataset,
                          valid_dataset,
                          tok_len,
                          class_weights,
                          oo_dir,
                          learning_rate,
                          num_epochs,
                          batch_size)

    trainer.save_model(oo_dir)

    print("Training done, evaluating...")
    # predict() returns the raw logits first; argmax over the class axis gives
    # the predicted label ids.
    valid_preds = np.argmax(trainer.predict(valid_dataset)[0], axis=1) #should be numpy ndarray
    valid_labels = np.array(valid_dataset.labels)

    cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
    # Use a context manager so the report is flushed and the handle closed
    # before moving on (important on the mounted Drive).
    with open(oo_dir + "/cls_report_valid.pickle", "wb") as report_file:
        pickle.dump(cls_report_valid, report_file)

    test_preds = np.argmax(trainer.predict(test_dataset)[0], axis=1)
    test_labels = np.array(test_dataset.labels)

    cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
    with open(oo_dir + "/cls_report_test.pickle", "wb") as report_file:
        pickle.dump(cls_report_test, report_file)

class weights are [1.08416754 0.92795941]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No checkpoints found. training from scratch...


Step,Training Loss,Validation Loss
250,0.6677,0.614588
500,0.5402,0.502172
750,0.532,0.486996
1000,0.5396,0.4779
1250,0.521,0.461424
1500,0.5387,0.446005
1750,0.4967,0.439376
2000,0.4776,0.432961
2250,0.4064,0.438442
2500,0.4038,0.454144


Training done, evaluating...


class weights are [0.78348698 1.38187471]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No checkpoints found. training from scratch...


Step,Training Loss,Validation Loss
250,0.4199,0.325069
500,0.3526,0.357508
750,0.3471,0.318284
1000,0.3535,0.38439
1250,0.3942,0.367367
1500,0.3858,0.359033
1750,0.3497,0.379709
2000,0.4307,0.344701
2250,0.3656,0.345894
2500,0.362,0.352684


Training done, evaluating...


class weights are [0.90415208 1.11857903]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No checkpoints found. training from scratch...


Step,Training Loss,Validation Loss
250,0.3748,0.340149
500,0.3485,0.351417
750,0.3611,0.366492
1000,0.3716,0.362881
1250,0.3769,0.377125
1500,0.3724,0.367065
1750,0.3435,0.410097
2000,0.3545,0.338106
2250,0.3684,0.395451
2500,0.2991,0.449498


KeyboardInterrupt: 

In [7]:
# Re-run of the training/evaluation driver for the dataset whose earlier run
# was interrupted; train_model picks up the saved checkpoint if one exists.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept so
# other cells that read it keep working.
dir = "/content/drive/MyDrive/NLP/"
dataset_dir = dir + "Data/"
# datasets = ["CAD_hate", "CAD_abuse", "Founta_hate", "Founta_abuse", "Davidson_hate", "Davidson_abuse"]
# datasets = ["CAD_hate", "CAD_abuse"]
# datasets = ["CAD_hate", "CAD_abuse", "Davidson_hate", "Davidson_abuse", "Dynamic_hate", "Measuring_dat_hate", "Measuring_dat_abuse"]
datasets = [ "Measuring_dat_abuse"]

output_dir = dir + "Weights/eng_classif_RoBerta/"
num_epochs = 5
batch_size = 16
learning_rate = 5e-5

for dataset in datasets:
    dd_dir = dataset_dir + dataset  # input folder holding train/valid/test CSVs
    oo_dir = output_dir + dataset   # output folder for weights and reports

    train_dataset, valid_dataset, test_dataset, tok_len = create_datasets(dd_dir)
    class_weights = calculate_class_weights(dd_dir)

    trainer = train_model(train_dataset,
                          valid_dataset,
                          tok_len,
                          class_weights,
                          oo_dir,
                          learning_rate,
                          num_epochs,
                          batch_size)

    trainer.save_model(oo_dir)

    print("Training done, evaluating...")
    # predict() returns the raw logits first; argmax over the class axis gives
    # the predicted label ids.
    valid_preds = np.argmax(trainer.predict(valid_dataset)[0], axis=1) #should be numpy ndarray
    valid_labels = np.array(valid_dataset.labels)

    cls_report_valid = classification_report(valid_labels, valid_preds, output_dict=True)
    # Close the file deterministically so the pickle is fully written to the
    # mounted Drive instead of relying on garbage collection.
    with open(oo_dir + "/cls_report_valid.pickle", "wb") as report_file:
        pickle.dump(cls_report_valid, report_file)

    test_preds = np.argmax(trainer.predict(test_dataset)[0], axis=1)
    test_labels = np.array(test_dataset.labels)

    cls_report_test = classification_report(test_labels, test_preds, output_dict=True)
    with open(oo_dir + "/cls_report_test.pickle", "wb") as report_file:
        pickle.dump(cls_report_test, report_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

class weights are [0.90415208 1.11857903]


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
You are resuming training from a checkpoint trained with 4.48.3 of Transformers but your current version is 4.49.0. This is not recommended and could yield to errors or unwanted behaviors.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhackimou11[0m ([33mhackimou11-ens[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
5250,0.3561,0.346131
5500,0.3749,0.360741
5750,0.3692,0.366305
6000,0.3681,0.38632
6250,0.3383,0.389345
6500,0.5764,0.60776
6750,0.557,0.492869
7000,0.3817,0.375013
7250,0.334,0.405391
7500,0.3692,0.381929


resuming from checkpoint...
Training done, evaluating...
