In [None]:
import pandas as pd
import torch
from torch import nn
from transformers import AdamW, set_seed, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification,BertTokenizer,BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [None]:
def read_data(data_path, test_size=0.3, random_state=42):
    """Load the tweet CSV and split it into a training and a test frame.

    Args:
        data_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        test_size: Share of rows held out for testing; forwarded to
            ``train_test_split``. Defaults to 0.3, the value that used to be
            hard-coded here.
        random_state: Seed forwarded to ``train_test_split`` so the same rows
            land in the same split on every run. Defaults to 42.

    Returns:
        tuple: ``(data_train, data_test)`` DataFrames.
    """
    data = pd.read_csv(data_path)
    data_train, data_test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return data_train, data_test

In [None]:
def preprocess_data(tokenizer, data, max_length=150):
    """Tokenize the tweets in ``data`` and pack them into a ``TensorDataset``.

    Every tweet is padded or truncated to exactly ``max_length`` tokens.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a Hugging Face
            tokenizer) whose result contains ``"input_ids"`` and
            ``"attention_mask"`` and, optionally, ``"token_type_ids"``.
        data: DataFrame with a ``"Tweet"`` text column and a ``"Label"`` column.
        max_length: Fixed sequence length used for padding and truncation.
            Defaults to 150.

    Returns:
        TensorDataset: tensors ordered ``(input_ids, attention_mask, labels)``,
        or ``(input_ids, attention_mask, token_type_ids, labels)`` when the
        tokenizer produces token type ids.

    Raises:
        KeyError: If ``data`` lacks the ``"Tweet"`` or ``"Label"`` column.
    """
    texts = data["Tweet"].tolist()
    y = data["Label"]

    encode = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encode["input_ids"]
    attention_masks = encode["attention_mask"]
    labels = torch.tensor(y.tolist())

    # The attention mask must always sit at index 1 and the token type ids (if
    # any) at index 2: train_model and evaluate_model read batch[1] as the
    # attention mask and batch[2] as the token type ids. Packing them the other
    # way round silently feeds each tensor into the wrong model argument.
    if "token_type_ids" in encode:
        token_type_ids = encode["token_type_ids"]
        return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

    return TensorDataset(input_ids, attention_masks, labels)


In [None]:
"""
Training loops with case checking whether there is token_type_ids or not
"""

def train_model(model, dataloader_train, epochs, device):
    """Fine-tune ``model`` on ``dataloader_train`` and print per-epoch metrics.

    Each batch must be laid out as ``(input_ids, attention_mask, labels)`` or,
    when it has four tensors, ``(input_ids, attention_mask, token_type_ids,
    labels)``: index 1 is always read as the attention mask, index 2 as the
    token type ids, and the last tensor as the labels.

    Args:
        model: Sequence-classification model whose forward pass accepts
            ``attention_mask``/``labels`` (and optionally ``token_type_ids``)
            and returns an object with ``.loss`` and ``.logits``.
        dataloader_train: Sized iterable of training batches in the layout above.
        epochs: Number of full passes over ``dataloader_train``.
        device: Device the model and every batch are moved to.

    Returns:
        None. One line with the epoch's mean loss and accuracy is printed per
        epoch; the model is updated in place.

    Raises:
        ValueError: If ``dataloader_train`` yields no batches.
    """
    if len(dataloader_train) == 0:
        raise ValueError("dataloader_train is empty; nothing to train on")

    model.to(device)
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. That class
    # applied no weight decay by default, whereas torch defaults to 0.01, so
    # the decay is switched off explicitly to keep the same training recipe.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8, weight_decay=0.0)
    # Linear schedule without warm-up, stepped once per batch.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(dataloader_train) * epochs,
    )

    for epoch in range(epochs):
        model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for batch in dataloader_train:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[-1].to(device)

            optional_inputs = {}
            if len(batch) == 4:  # Token type IDs are included
                optional_inputs["token_type_ids"] = batch[2].to(device)

            # Passing labels makes the model compute the loss itself.
            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                labels=labels,
                **optional_inputs,
            )
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            # Cap the gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Weight the running metrics by the number of samples so a shorter
            # final batch does not count as much as a full one.
            n_batch = labels.size(0)
            predictions = outputs.logits.argmax(dim=1)
            loss_sum += loss.item() * n_batch
            n_correct += (predictions == labels).sum().item()
            n_seen += n_batch

        train_acc = n_correct / n_seen
        train_loss = loss_sum / n_seen

        print(f'Epoch {epoch + 1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f}')

In [None]:
"""
Evalaution loops with case checking whether there is token_type_ids or not "
"""

def evaluate_model(model, dataloader_test, device):
    """Run ``model`` over ``dataloader_test`` and print a classification report.

    Each batch must be laid out as ``(input_ids, attention_mask, labels)`` or,
    when it has four tensors, ``(input_ids, attention_mask, token_type_ids,
    labels)``: index 1 is always read as the attention mask, index 2 as the
    token type ids, and the last tensor as the labels.

    Args:
        model: Sequence-classification model whose forward pass accepts
            ``attention_mask`` (and optionally ``token_type_ids``) and returns
            an object with ``.logits``.
        dataloader_test: Iterable of evaluation batches in the layout above.
        device: Device the model and its inputs are moved to.

    Returns:
        None. The report from ``classification_report`` is printed.
    """
    model.to(device)
    model.eval()
    test_gold = []
    test_pred = []

    with torch.no_grad():
        for batch in dataloader_test:
            optional_inputs = {}
            if len(batch) == 4:  # Token type IDs are included
                optional_inputs["token_type_ids"] = batch[2].to(device)

            # Labels are not passed to the model: only the logits are needed
            # here, so there is no reason to compute a loss.
            outputs = model(
                batch[0].to(device),
                attention_mask=batch[1].to(device),
                **optional_inputs,
            )

            # The arg-max of the raw logits is the predicted class.
            test_pred.extend(outputs.logits.argmax(dim=1).tolist())
            test_gold.extend(batch[-1].tolist())
    print(classification_report(test_gold, test_pred))

In [None]:
# Fix the random seed before anything stochastic runs, so that repeated
# executions of the notebook are comparable.
set_seed(seed=36)

In [None]:
# Read the tweet CSV once; every model below is trained and evaluated on the
# same train/test split.
data_train, data_test = read_data(data_path="covid19-tweets.csv")

In [None]:
# One tokenizer per checkpoint. The first three are resolved through
# AutoTokenizer, in the order roberta -> distilbert -> nepalibert.
tokenizer_roberta, tokenizer_distilbert, tokenizer_nepalibert = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        "amitness/roberta-base-ne",
        "Sakonii/distilbert-base-nepali",
        "Rajan/NepaliBERT",
    )
)

# The multilingual BERT baseline is loaded through the explicit BERT class.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [None]:
# One classifier per checkpoint, each configured with three output labels.
# The first three are resolved through AutoModelForSequenceClassification,
# in the order roberta -> distilbert -> nepalibert.
model_roberta, model_distillbert, model_nepalibert = (
    AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
    for checkpoint in (
        "amitness/roberta-base-ne",
        "Sakonii/distilbert-base-nepali",
        "Rajan/NepaliBERT",
    )
)

# The multilingual BERT baseline is loaded through the explicit BERT class.
model_bert = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels=3,
)

In [None]:
# Tokenize both splits once per tokenizer (train first, then test), because
# each model needs the text encoded with its own tokenizer.
dataset_train_roberta, dataset_test_roberta = (
    preprocess_data(tokenizer_roberta, split) for split in (data_train, data_test)
)

dataset_train_distillbert, dataset_test_distillbert = (
    preprocess_data(tokenizer_distilbert, split) for split in (data_train, data_test)
)

dataset_train_nepalibert, dataset_test_nepalibert = (
    preprocess_data(tokenizer_nepalibert, split) for split in (data_train, data_test)
)

dataset_train_bert, dataset_test_bert = (
    preprocess_data(tokenizer_bert, split) for split in (data_train, data_test)
)

In [None]:
# Create the data loaders
batch_size = 16

# Training loaders draw their samples in random order. Test loaders iterate
# sequentially: the evaluation metrics do not depend on sample order, and a
# random sampler there would only draw from the global random number generator
# and thereby alter the random stream of whatever is trained afterwards.
dataloader_train_roberta = DataLoader(dataset_train_roberta, sampler=RandomSampler(dataset_train_roberta), batch_size=batch_size)
dataloader_test_roberta = DataLoader(dataset_test_roberta, batch_size=batch_size, shuffle=False)

dataloader_train_distillbert = DataLoader(dataset_train_distillbert, sampler=RandomSampler(dataset_train_distillbert), batch_size=batch_size)
dataloader_test_distillbert = DataLoader(dataset_test_distillbert, batch_size=batch_size, shuffle=False)

dataloader_train_nepalibert = DataLoader(dataset_train_nepalibert, sampler=RandomSampler(dataset_train_nepalibert), batch_size=batch_size)
dataloader_test_nepalibert = DataLoader(dataset_test_nepalibert, batch_size=batch_size, shuffle=False)

dataloader_train_bert = DataLoader(dataset_train_bert, sampler=RandomSampler(dataset_train_bert), batch_size=batch_size)
dataloader_test_bert = DataLoader(dataset_test_bert, batch_size=batch_size, shuffle=False)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
"""
Training  for amitness/roberta-base-ne
"""
print("Training for amitness/roberta-base-ne")
model_roberta.to(device)
train_model(model_roberta, dataloader_train_roberta, epochs=10, device=device)

"""
Evaluation for amitness/roberta-base-ne
"""
print("Evaluation for amitness/roberta-base-ne")
evaluate_model(model_roberta, dataloader_test_roberta, device=device)

In [None]:
"""
Training for Sakonii/distilbert-base-nepali
"""
print("Training for Sakonii/distilbert-base-nepali")
model_distillbert.to(device)
train_model(model_distillbert, dataloader_train_distillbert, epochs=10, device=device)

"""
Evaluation for Sakonii/distilbert-base-nepali
"""
print("Evaluation for Sakonii/distilbert-base-nepali")
evaluate_model(model_distillbert, dataloader_test_distillbert, device=device)

In [None]:
"""
Training for Rajan/NepaliBERT
"""
print("Training for Rajan/NepaliBERT")
model_nepalibert.to(device)
train_model(model_nepalibert, dataloader_train_nepalibert, epochs=10, device=device)

"""
Evaluation for Rajan/NepaliBERT
"""
print("Evaluation for Rajan/NepaliBERT")
evaluate_model(model_nepalibert, dataloader_test_nepalibert, device=device)

In [None]:
"""
Training for bert-base-multilingual-uncased
"""
print("Training for bert-base-multilingual-uncased")
model_bert.to(device)
train_model(model_bert, dataloader_train_bert, epochs=10, device=device)

"""
Evaluation for bert-base-multilingual-uncased
"""
print("Evaluation for bert-base-multilingual-uncased")
evaluate_model(model_bert, dataloader_test_bert, device=device)
