### Import Required Libraries

In [None]:
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
from torch.utils.data import random_split, DataLoader, RandomSampler, SequentialSampler, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn

import tqdm as notebook_tqdm

import numpy as np
import time
import datetime
import random

import matplotlib.pyplot as plt
from matplotlib import font_manager
import seaborn as sns

### Get Model Name and Tokenizer

In [2]:
model_name = 'google/muril-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

### Set Device to CUDA

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
torch.manual_seed(42)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


### Prepare Dataset

- Load Data

In [None]:
df = pd.read_csv("data/hinglish-hate/hate-dataset-train.csv")
df.head()

Unnamed: 0,data,label
0,an extremist hindu crying for no reason,hate
1,हमारे मूर्धन्य हमारा खुदा हैं एक hi होता हैं,non-hate
2,इसमें देखो कौन पैसे के लिए दौड़ता he हिन्दू एक...,non-hate
3,वही नारा अब हम लोगों को भी follow करना पड़ेगा,non-hate
4,तुम जैसे कुछ बूंद लोगों की वजह se सबकी pakista...,non-hate


- Tokenize Data for MuRIL

In [6]:
class MyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = self.encode_labels(labels)
        self.max_length = max_length

    def encode_labels(self, y):
        encoder = LabelEncoder()
        encoder.fit(y)
        y_encoded = encoder.transform(y)
        self.label_encoder = encoder  # save encoder for inverse_transform if needed
        print(f"Encoder has the following classes: {encoder.classes_}")
        print(f"The new data type for y is {type(y_encoded)}")
        return y_encoded

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)         # remove batch dim
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Usage
X = df.data.values
y = df.label.values

dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


- Split into Train and Eval Sets

In [7]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"{train_size} training samples")
print(f"{val_size} validation samples")

24230 training samples
2693 validation samples


### Load Model

- Set custom metrics and class weights

In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted', zero_division=1)
    recall = recall_score(labels, preds, average='weighted', zero_division=1)
    f1 = f1_score(labels, preds, average='weighted', zero_division=1)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


labels_class = df['label']
print(np.unique(labels_class))
class_weights = compute_class_weight("balanced", classes=np.unique(labels_class), y=labels_class)

class_weights = torch.tensor(class_weights, dtype=torch.float32)
class_weights = class_weights.to(device)

['hate' 'non-hate']


- Load Model

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = 2,
        output_hidden_states = False,
)

model.to(device)

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(197285, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

- Set Hyperparameters

In [10]:
batch_size = 32
epochs = 4
learning_rate = 2e-5

- Define custom loss function, optimizer and scheduler

In [11]:
class ClassificationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=batch_size):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


optimizer = AdamW(model.parameters(),
                lr = learning_rate,
                )


scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=(len(train_dataset) // batch_size) * epochs
)


training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    logging_dir="/kaggle/working/logs",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)


trainer = ClassificationTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler)
)

- Train the model

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6456,0.558583,0.694022,0.723726,0.694022,0.696692
2,0.4942,0.524126,0.749722,0.748458,0.749722,0.748937
3,0.4182,0.528113,0.759376,0.761458,0.759376,0.760174
4,0.3632,0.567582,0.759005,0.763862,0.759005,0.760448


TrainOutput(global_step=3032, training_loss=0.4720120574679412, metrics={'train_runtime': 1342.0011, 'train_samples_per_second': 72.221, 'train_steps_per_second': 2.259, 'total_flos': 6375180871372800.0, 'train_loss': 0.4720120574679412, 'epoch': 4.0})

### Evaulation with Test Set

- Load Test Data

In [None]:
test_df = pd.read_csv("data/hinglish-hate/test-task1.csv")
X = test_df.data.values
y = test_df.label.values
test_dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


- Predict Labels

In [14]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-0.75391215,  0.7427963 ],
       [-0.27820474,  0.27296558],
       [-0.9175914 ,  0.9036926 ],
       ...,
       [-0.7722663 ,  0.7625929 ],
       [ 1.1774191 , -1.1539279 ],
       [ 1.2186581 , -1.1933494 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.508640706539154, 'test_accuracy': 0.7671136203246295, 'test_precision': 0.7729729596193579, 'test_recall': 0.7671136203246295, 'test_f1': 0.7691553758175734, 'test_runtime': 5.2149, 'test_samples_per_second': 271.722, 'test_steps_per_second': 8.629})

In [None]:
output = "classifier/"
model.save_pretrained(output)
tokenizer.save_pretrained(output)

### (Optional) Continue Training

In [None]:
trainer.train(resume_from_checkpoint=True)