In [None]:
!pip install transformers
!sudo apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/google-bert/bert-base-multilingual-cased

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset
from sklearn import metrics
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Read the source CSV into a DataFrame; the path is relative to the notebook.
df = pd.read_csv(filepath_or_buffer="../data/hatebr_and_rationales.csv")

In [None]:
# Keep only the two columns used for modelling: the input text, and the
# 'label final' column cast to int and exposed under the name 'labels'.
data = pd.DataFrame(
    {
        'normalized_text': df['normalized_text'],
        'labels': df['label final'].astype(int),
    }
)

In [None]:
## Split into train / test / validation sets.
# Target fractions. TRAIN_SIZE is informational only: neither split call
# below reads it, the train share is whatever remains after the hold-out.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1

# Stage 1: separate the combined test+validation hold-out from the train set.
holdout_fraction = TEST_SIZE + VAL_SIZE
train_df, holdout_df = train_test_split(
    data,
    test_size=holdout_fraction,
    random_state=0,
)

# Stage 2: divide the hold-out between test and validation. The validation
# share is expressed relative to the hold-out, not to the full dataset.
test_df, val_df = train_test_split(
    holdout_df,
    test_size=VAL_SIZE / holdout_fraction,
    random_state=0,
)

In [None]:
# Tokenizer for the multilingual BERT checkpoint, loaded by its hub id.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-multilingual-cased'
)

# Wrap the train and validation DataFrames as `datasets.Dataset` objects so
# they can be tokenized with `.map`.
train_dataset = Dataset.from_pandas(df=train_df)
val_dataset = Dataset.from_pandas(df=val_df)

def tokenize_data(examples):
    """Tokenize a batch of examples for the sequence classifier.

    Sequences are truncated to at most 512 tokens but are deliberately left
    unpadded. Padding is deferred to the DataCollatorWithPadding that the
    Trainer applies at batch time, so each batch is padded only to its own
    longest sequence instead of every example being padded to 512 tokens
    up front.

    Args:
        examples: Batch mapping with a "normalized_text" entry holding the
            texts to tokenize.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["normalized_text"], truncation=True, max_length=512)

# Apply the tokenizer to both splits, one batch of rows at a time.
tokenized_train = train_dataset.map(function=tokenize_data, batched=True)
tokenized_val = val_dataset.map(function=tokenize_data, batched=True)

In [None]:
# Load the pretrained multilingual BERT weights into a sequence-classification
# model. No `num_labels` is passed, so the library's default label count applies.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-multilingual-cased'
)

In [None]:
# Training configuration, grouped by concern. Evaluation, checkpointing and
# logging are all scheduled once per epoch.
training_args = TrainingArguments(
    # --- output locations / reporting ---
    output_dir="/working/results",
    logging_dir='/working/logs',
    report_to=[],  # no external experiment trackers
    # --- optimisation ---
    # NOTE(review): 2e-4 is well above the 2e-5 to 5e-5 range commonly used
    # when fine-tuning BERT -- confirm this value is intentional.
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- per-epoch schedule ---
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    # --- best-model selection: lowest evaluation loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer wired to the tokenized train split, with the validation split used
# for the per-epoch evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Run fine-tuning.
trainer.train()

# Persist the model held in `model` after training.
model.save_pretrained('/working/mbert/model')

In [None]:
# Wrap the held-out test DataFrame as a Dataset and tokenize it in batches
# with the same tokenize_data function used for the other splits.
test_dataset = Dataset.from_pandas(df=test_df)
tokenized_test = test_dataset.map(function=tokenize_data, batched=True)

def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    The logits are turned into class probabilities with a softmax over the
    last axis; the predicted class is the arg-max of those probabilities.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys "accuracy", "precision", "recall", "f1" and "auc".
        Precision, recall and F1 are macro-averaged with ``zero_division=0``;
        AUC is computed from the probabilities in column 1, which is treated
        as the positive class.
    """
    raw_scores, y_true = eval_pred

    # Softmax over the class axis, then pick the most probable class per row.
    class_probs = torch.tensor(raw_scores).softmax(dim=-1)
    y_pred = class_probs.argmax(axis=-1)

    # Options shared by the three macro-averaged scores.
    macro_kwargs = dict(zero_division=0, average='macro')
    f1 = metrics.f1_score(y_true, y_pred, **macro_kwargs)
    recall = metrics.recall_score(y_true, y_pred, **macro_kwargs)
    precision = metrics.precision_score(y_true, y_pred, **macro_kwargs)
    acc = metrics.accuracy_score(y_true, y_pred)

    # ROC AUC needs scores rather than hard predictions: use column 1.
    positive_scores = class_probs[:, 1].numpy()
    auc = metrics.roc_auc_score(y_true, positive_scores)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Rebind `trainer` to an evaluation-only Trainer: same model and arguments,
# but pointed at the tokenized test split and reporting through compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

eval_results = trainer.evaluate()

# Report every returned metric on its own line.
print("Evaluation results:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value}")

In [None]:
import os
import shutil

# Directory where checkpoints are saved
checkpoint_dir = "/working/results"

# Candidate checkpoints: only sub-directories named 'checkpoint-*'. Any other
# file or folder that happens to live in the output directory is ignored so it
# can never be archived by mistake.
checkpoint_paths = [
    os.path.join(checkpoint_dir, entry)
    for entry in os.listdir(checkpoint_dir)
    if entry.startswith("checkpoint-")
]
checkpoint_paths = [path for path in checkpoint_paths if os.path.isdir(path)]
if not checkpoint_paths:
    raise FileNotFoundError(
        f"No 'checkpoint-*' directory found in {checkpoint_dir}"
    )

# Pick the most recently modified checkpoint directory.
# NOTE(review): the most recent checkpoint is not necessarily the one with the
# best evaluation loss -- confirm that archiving the latest one is intended.
latest_checkpoint = max(checkpoint_paths, key=os.path.getmtime)

# Zip the checkpoint directory. make_archive appends '.zip' to the base name,
# so the file written is '<checkpoint_name>.zip'.
checkpoint_name = "../models/mbert-fine-tuned/model.ckpt"
shutil.make_archive(checkpoint_name, 'zip', latest_checkpoint)

In [None]:
# import zipfile

# # Unzip the .ckpt file
# with zipfile.ZipFile('/kaggle/working/model.ckpt.zip', 'r') as zip_ref:
#     zip_ref.extractall('/kaggle/working/extracted_checkpoint')

# # Load the model from the extracted directory
# from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/extracted_checkpoint')