In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
files = Path("./datasets/").glob("*.json")
dfs = []
for file in files:
    df = pd.read_json(file)
    df["label"] = file.stem
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=42)
df.head()

Unnamed: 0,sentences,label
234,Tyo mero din ho ki hoina?,nepali
118,Timi ta ekdam ramro dekhinchhau.,nepali
346,Nihon ni iku yotei ga arimasu.,japanese
498,What inspires you the most?,english
402,Watashi wa nihon no tabemono ga oishii to omoi...,japanese


In [18]:
df.to_csv("./datasets/dataset.csv", index=False)

In [3]:
from transformers import AutoTokenizer

model_name = "prajjwal1/bert-tiny"
# Load the tokenizer for BERT-Small
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Tokenise the utterances
def tokenize_function(examples):
    return tokenizer(
        examples["sentences"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

In [4]:
from datasets import Dataset

# Convert the dataframe to a Hugging Face Dataset
raw_dataset = Dataset.from_pandas(df).class_encode_column("label")
dataset = raw_dataset.map(tokenize_function, batched=True)

Casting to class labels:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

In [5]:
class_encoder = dataset.features["label"]

In [6]:
dataset = dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="label")

In [7]:
val_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42, stratify_by_column="label")["test"]
dataset["validation"] = val_dataset

In [8]:
import torch
from transformers import BertForSequenceClassification
import torch.nn as nn

In [9]:
class CustomModelForSequenceClassification(BertForSequenceClassification):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        print("Clf layer set to my layer")
        model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        model.classifier = nn.Sequential(
            nn.Linear(model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, model.config.num_labels)
        )
        return model

# Load the BERT-Small model
model = CustomModelForSequenceClassification.from_pretrained(
    model_name, num_labels=3, torch_dtype=torch.float32, problem_type="single_label_classification"
)

Clf layer set to my layer


Some weights of CustomModelForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./roman-classifier",  # Directory to save model checkpoints
    eval_strategy="steps",
    eval_steps=20,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    save_strategy="best",
    logging_dir="./logs",
    logging_steps=20,
    weight_decay=0.01,
    metric_for_best_model="f1",
)


In [12]:
from transformers import Trainer, TrainerCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
20,1.1134,1.093611,0.288462,0.08321,0.288462,0.129162
40,1.0838,1.061313,0.346154,0.549843,0.346154,0.23778
60,1.0644,1.030684,0.730769,0.565304,0.730769,0.633002
80,1.0327,0.991786,0.730769,0.565304,0.730769,0.633002
100,0.9961,0.952484,0.730769,0.565304,0.730769,0.633002
120,0.9651,0.914024,0.730769,0.565304,0.730769,0.633002
140,0.9202,0.87369,0.865385,0.895782,0.865385,0.849153
160,0.8905,0.83103,0.884615,0.907692,0.884615,0.873718
180,0.8559,0.78862,0.903846,0.920424,0.903846,0.896935
200,0.8142,0.7445,0.942308,0.948718,0.942308,0.940242


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=850, training_loss=0.5541931040146771, metrics={'train_runtime': 32.5622, 'train_samples_per_second': 795.402, 'train_steps_per_second': 26.104, 'total_flos': 4358907840000.0, 'train_loss': 0.5541931040146771, 'epoch': 50.0})

In [16]:
trainer.save_model("./roman-classifier")

In [17]:
model.save_pretrained("./roman-classifier")