### Import necessary libraries

In [1]:
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_cosine_with_hard_restarts_schedule_with_warmup, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.optim import AdamW
import pandas as pd
from torch.utils.data import random_split, DataLoader, RandomSampler, SequentialSampler, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from peft import get_peft_model, LoraConfig, PeftModelForSequenceClassification
import torch.nn as nn

import tqdm as notebook_tqdm

import numpy as np
import time
import datetime
import random

import matplotlib.pyplot as plt
from matplotlib import font_manager
import seaborn as sns

2025-06-11 06:30:01.568756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749623401.738345      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749623401.787659      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Set device to CUDA

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

torch.manual_seed(42)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


### Get model and tokenizer

In [3]:
model_name = 'google/muril-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

### Prepare Data

- Load Data

In [4]:
df = pd.read_csv("/kaggle/input/hate-dataset-train/hate-dataset-train.csv")
df.head()

Unnamed: 0,data,label
0,an extremist hindu crying for no reason,hate
1,हमारे मूर्धन्य हमारा खुदा हैं एक hi होता हैं,non-hate
2,इसमें देखो कौन पैसे के लिए दौड़ता he हिन्दू एक...,non-hate
3,वही नारा अब हम लोगों को भी follow करना पड़ेगा,non-hate
4,तुम जैसे कुछ बूंद लोगों की वजह se सबकी pakista...,non-hate


- Tokenize Data for MuRIL

In [5]:
class MyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = self.encode_labels(labels)
        self.max_length = max_length

    def encode_labels(self, y):
        encoder = LabelEncoder()
        encoder.fit(y)
        y_encoded = encoder.transform(y)
        self.label_encoder = encoder  # save encoder for inverse_transform if needed
        print(f"Encoder has the following classes: {encoder.classes_}")
        print(f"The new data type for y is {type(y_encoded)}")
        return y_encoded

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)         # remove batch dim
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Usage
X = df.data.values
y = df.label.values

dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


- Assign Class Weights

In [6]:
labels_class = df['label']
print(np.unique(labels_class))
class_weights = compute_class_weight("balanced", classes=np.unique(labels_class), y=labels_class)

class_weights = torch.tensor(class_weights, dtype=torch.float32)
class_weights = class_weights.to(device)
class_weights

['hate' 'non-hate']


tensor([1.2938, 0.8150], device='cuda:0')

- Split into Train and Eval

In [7]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"{train_size} training samples")
print(f"{val_size} validation samples")

24230 training samples
2693 validation samples


### Training

- Set metrics

In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted', zero_division=1)
    recall = recall_score(labels, preds, average='weighted', zero_division=1)
    f1 = f1_score(labels, preds, average='weighted', zero_division=1)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

- Load Model and LoRA

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
)

for name, param in model.named_parameters():
	if 'classifier' not in name: # classifier layer
		param.requires_grad = False

loraConfig = LoraConfig(task_type="SEQ_CLS",
                         inference_mode=False,
                         lora_alpha=128,
                         lora_dropout=0.1,
                         r=128,
                         use_rslora=True)

model = get_peft_model(model, loraConfig, adapter_name='hinglish_hate_1')
model.add_adapter("hinglish_hate_2", loraConfig)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(197285, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (hinglish_hate_1): Dropout(p=0.1, inplace=False)
                      (hinglish_hate_2): Dropout(p=0.1, inplace=False)
              

- Set Hyperparameters

In [15]:
batch_size = 32
epochs = 20
learning_rate = 2e-5

- Create custom loss func, optimizer, scheduler and training arguments

In [16]:
class ClassificationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=batch_size):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [17]:
optimizer = AdamW(model.parameters(),
                lr = learning_rate,
                )


scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=(len(train_dataset) // batch_size) * epochs
)

training_args = TrainingArguments(
    output_dir="/kaggle/working/results/adapter_1",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    logging_dir="/kaggle/working/logs/adapter_2",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    label_names=["labels"]
)


trainer = ClassificationTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

- Start training

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.685,0.669378,0.686595,0.685879,0.686595,0.665622
2,0.6535,0.648435,0.711474,0.712985,0.711474,0.695311
3,0.6385,0.629063,0.725214,0.723761,0.725214,0.715141
4,0.6117,0.60828,0.740438,0.737669,0.740438,0.735103
5,0.595,0.596281,0.731155,0.729766,0.731155,0.730313
6,0.5752,0.588934,0.742666,0.739746,0.742666,0.739318
7,0.5621,0.577018,0.733754,0.734131,0.733754,0.733934
8,0.5434,0.575028,0.723357,0.734131,0.723357,0.725776
9,0.5343,0.566638,0.746008,0.744778,0.746008,0.745257
10,0.5158,0.565328,0.741552,0.744161,0.741552,0.742535


Could not locate the best model at /kaggle/working/results/adapter_1/checkpoint-8338/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=10612, training_loss=0.5627552241682332, metrics={'train_runtime': 3296.7921, 'train_samples_per_second': 146.991, 'train_steps_per_second': 4.598, 'total_flos': 2.47725186476544e+16, 'train_loss': 0.5627552241682332, 'epoch': 14.0})

### Evaluation

- Load and prepare Test Data

In [19]:
test_df = pd.read_csv("/kaggle/input/hinglish-hate/test-task1.csv")
X = test_df.data.values
y = test_df.label.values
test_dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


- Predict and evaluate accuracy

In [20]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-0.5519283 ,  0.57854617],
       [ 0.45793357, -0.43980888],
       [-0.5446111 ,  0.5731474 ],
       ...,
       [-0.55422044,  0.5816745 ],
       [ 0.66868454, -0.65999323],
       [ 0.6703883 , -0.66175264]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.5722740888595581, 'test_accuracy': 0.7424135497529993, 'test_precision': 0.7467601580238625, 'test_recall': 0.7424135497529993, 'test_f1': 0.7441305872034747, 'test_runtime': 5.5188, 'test_samples_per_second': 256.76, 'test_steps_per_second': 8.154})

- Save LoRA Adapter

In [None]:
output_dir = "/kaggle/working/hinglish_hate_1/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

### (Optional) Resume Training

In [None]:
trainer.train(resume_from_checkpoint=True)

### Continual Learning

In [21]:
df = pd.read_csv("/kaggle/input/hinglish-hate-task-2/cont_train.csv")

X = df.data.values
y = df.label.values

dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


In [22]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"{train_size} training samples")
print(f"{val_size} validation samples")

4325 training samples
481 validation samples


In [23]:
labels_class = df['label']
print(np.unique(labels_class))
class_weights = compute_class_weight("balanced", classes=np.unique(labels_class), y=labels_class)

class_weights = torch.tensor(class_weights, dtype=torch.float32)
class_weights = class_weights.to(device)
class_weights

['hate' 'non-hate']


tensor([1.1492, 0.8851], device='cuda:0')

In [24]:
model.set_adapter("hinglish_hate_2")

In [26]:
batch_size = 32
epochs = 20
learning_rate = 2e-5

In [27]:
optimizer = AdamW(model.parameters(),
                lr = learning_rate,
                )


scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=(len(train_dataset) // batch_size) * epochs
)

training_args = TrainingArguments(
    output_dir="/kaggle/working/results/adapter_2",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    save_total_limit=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    logging_dir="/kaggle/working/logs/adapter_2",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    label_names=["labels"]
)


trainer = ClassificationTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6922,0.677835,0.677755,0.736023,0.677755,0.675219
2,0.6778,0.653785,0.758836,0.761052,0.758836,0.759615
3,0.6467,0.637106,0.787942,0.803114,0.787942,0.789532
4,0.6393,0.627797,0.787942,0.795294,0.787942,0.789319
5,0.63,0.6226,0.7921,0.792177,0.7921,0.788876
6,0.6187,0.614457,0.790021,0.794073,0.790021,0.791059
7,0.6129,0.610106,0.781705,0.809565,0.781705,0.782974
8,0.6041,0.598958,0.804574,0.803896,0.804574,0.804107
9,0.5967,0.597016,0.800416,0.799463,0.800416,0.798775
10,0.5867,0.585164,0.819127,0.82033,0.819127,0.819553


Could not locate the best model at /kaggle/working/results/adapter_2/checkpoint-2584/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=2720, training_loss=0.5988502491922939, metrics={'train_runtime': 853.8898, 'train_samples_per_second': 101.301, 'train_steps_per_second': 3.185, 'total_flos': 6316911924480000.0, 'train_loss': 0.5988502491922939, 'epoch': 20.0})

In [117]:
test_df = pd.read_csv("/kaggle/input/hinglish-hate-task-2/test.csv")
X = test_df.data.values
y = test_df.label.values
test_dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


In [48]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 0.24458964, -0.24023998],
       [-0.20946077,  0.2403012 ],
       [-0.20823666,  0.2399553 ],
       ...,
       [-0.21001852,  0.24005468],
       [-0.20890388,  0.23997977],
       [ 0.24473189, -0.24077846]], dtype=float32), label_ids=array([0, 1, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.5735505223274231, 'test_accuracy': 0.8161397670549085, 'test_precision': 0.8163474785365424, 'test_recall': 0.8161397670549085, 'test_f1': 0.8162294088548435, 'test_runtime': 4.6781, 'test_samples_per_second': 256.942, 'test_steps_per_second': 8.123})

In [70]:
model.base_model.set_adapter(['hinglish_hate_1', 'hinglish_hate_2'])

In [121]:
test_df = pd.read_csv("/kaggle/input/hinglish-hate/test-task1.csv")
X = test_df.data.values
y = test_df.label.values
test_dataset = MyDataset(X, y, tokenizer)

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


In [73]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-0.19778466,  0.22222993],
       [-0.20312594,  0.229599  ],
       [-0.19992775,  0.22588298],
       ...,
       [-0.199821  ,  0.22575593],
       [ 0.21515879, -0.20615792],
       [ 0.21106333, -0.20119008]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.6248688697814941, 'test_accuracy': 0.7311220889202541, 'test_precision': 0.7237488869488463, 'test_recall': 0.7311220889202541, 'test_f1': 0.7227431452317656, 'test_runtime': 5.9209, 'test_samples_per_second': 239.324, 'test_steps_per_second': 7.6})

In [77]:
output = "/kaggle/working/muril_bert_adapters/"
model.save_pretrained(output)
tokenizer.save_pretrained(output)

('/kaggle/working/muril_bert_adapters/tokenizer_config.json',
 '/kaggle/working/muril_bert_adapters/special_tokens_map.json',
 '/kaggle/working/muril_bert_adapters/vocab.txt',
 '/kaggle/working/muril_bert_adapters/added_tokens.json',
 '/kaggle/working/muril_bert_adapters/tokenizer.json')

In [115]:
from peft import PeftConfig

muril_model = "/kaggle/input/muril_classifier/pytorch/default/1"
hinglish_hate_1 = "/kaggle/working/muril_bert_adapters/hinglish_hate_1"
hinglish_hate_2 = "/kaggle/working/muril_bert_adapters/hinglish_hate_2"

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, output_hidden_states=False)

peft_model = PeftModelForSequenceClassification.from_pretrained(base_model, hinglish_hate_1, num_labels=2)
peft_model.load_adapter(hinglish_hate_1, adapter_name="hinglish_hate_1")
peft_model.load_adapter(hinglish_hate_2, adapter_name='hinglish_hate_2')

#peft_model.set_adapter("hinglish_hate_2")
peft_model.base_model.set_adapter(["hinglish_hate_1", "hinglish_hate_2"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [119]:
new_trainer = ClassificationTrainer(
    model=peft_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [112]:
#peft_model.base_model.set_adapter("hinglish_hate_2")
new_trainer = ClassificationTrainer(
    model=peft_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

new_trainer.predict(test_dataset) # Task 2

PredictionOutput(predictions=array([[ 0.11254247, -0.06880226],
       [-0.15410027,  0.20360343],
       [-0.15202044,  0.20362139],
       ...,
       [-0.1563042 ,  0.20506175],
       [-0.17010796,  0.24416836],
       [ 0.08897122, -0.04002063]], dtype=float32), label_ids=array([0, 1, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.6264745593070984, 'test_model_preparation_time': 0.0055, 'test_accuracy': 0.800332778702163, 'test_precision': 0.8034933340167373, 'test_recall': 0.800332778702163, 'test_f1': 0.7974021787007133, 'test_runtime': 4.6899, 'test_samples_per_second': 256.297, 'test_steps_per_second': 8.103})

In [114]:
new_trainer.predict(test_dataset) # Task 1

PredictionOutput(predictions=array([[-0.1522802 ,  0.18991649],
       [-0.148069  ,  0.1983913 ],
       [-0.14530145,  0.18925565],
       ...,
       [-0.15332216,  0.20294136],
       [ 0.11365376, -0.06761795],
       [ 0.10139805, -0.0549578 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.6552545428276062, 'test_model_preparation_time': 0.0055, 'test_accuracy': 0.7247706422018348, 'test_precision': 0.736249792641435, 'test_recall': 0.7247706422018348, 'test_f1': 0.6898122815696602, 'test_runtime': 5.5008, 'test_samples_per_second': 257.599, 'test_steps_per_second': 8.181})

In [122]:
new_trainer.predict(test_dataset) # Task 1 with adapter

PredictionOutput(predictions=array([[-0.54628086,  0.5706933 ],
       [-0.54292077,  0.5693583 ],
       [-0.5469816 ,  0.57290226],
       ...,
       [-0.5471181 ,  0.5730184 ],
       [ 0.6275594 , -0.6185687 ],
       [ 0.6197144 , -0.60985243]], dtype=float32), label_ids=array([1, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.5989140868186951, 'test_model_preparation_time': 0.0076, 'test_accuracy': 0.7311220889202541, 'test_precision': 0.7237850086407459, 'test_recall': 0.7311220889202541, 'test_f1': 0.7229389964481092, 'test_runtime': 5.9132, 'test_samples_per_second': 239.635, 'test_steps_per_second': 7.61})

In [120]:
new_trainer.predict(test_dataset) # Task 2 with adapter

PredictionOutput(predictions=array([[ 0.62307155, -0.61346555],
       [-0.53667545,  0.5602557 ],
       [-0.5402038 ,  0.5646883 ],
       ...,
       [-0.53787404,  0.56118286],
       [-0.5273331 ,  0.5574368 ],
       [ 0.6257093 , -0.61647457]], dtype=float32), label_ids=array([0, 1, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.5297549962997437, 'test_model_preparation_time': 0.0076, 'test_accuracy': 0.7662229617304492, 'test_precision': 0.7814071614726438, 'test_recall': 0.7662229617304492, 'test_f1': 0.7667628969106643, 'test_runtime': 5.0574, 'test_samples_per_second': 237.669, 'test_steps_per_second': 7.514})