In [2]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
files = Path("./datasets/").glob("*.json")
dfs = []
for file in files:
    df = pd.read_json(file)
    df["label"] = file.stem
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=42)
df.head()

Unnamed: 0,sentences,label
234,Tyo mero din ho ki hoina?,nepali
118,Timi ta ekdam ramro dekhinchhau.,nepali
346,Nihon ni iku yotei ga arimasu.,japanese
498,What inspires you the most?,english
402,Watashi wa nihon no tabemono ga oishii to omoi...,japanese


In [4]:
df.to_csv("./datasets/dataset.csv", index=False)

In [5]:
from transformers import AutoTokenizer

model_name = "prajjwal1/bert-tiny"
# Load the tokenizer for BERT-Small
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Tokenise the utterances
def tokenize_function(examples):
    return tokenizer(
        examples["sentences"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

In [6]:
from datasets import Dataset

# Convert the dataframe to a Hugging Face Dataset
raw_dataset = Dataset.from_pandas(df).class_encode_column("label")
dataset = raw_dataset.map(tokenize_function, batched=True)

Casting to class labels:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

In [7]:
class_encoder = dataset.features["label"]

In [8]:
dataset = dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="label")

In [9]:
val_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42, stratify_by_column="label")["test"]
dataset["validation"] = val_dataset

In [10]:
import torch
from transformers import BertForSequenceClassification
import torch.nn as nn

In [31]:
from typing import Union
from transformers.modeling_outputs import SequenceClassifierOutput

class BertWithCustomHead(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)

        # Define the custom classification head
        self.custom_dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Sequential(
            nn.Linear(self.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, self.config.num_labels),
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        return_dict=True,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        # Get outputs from the base BERT model
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=return_dict,
        )

        # Extract the [CLS] pooled output
        pooled_output = outputs.pooler_output

        # Pass pooled output through the custom classification head
        pooled_output = self.custom_dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Return logits and loss if labels are provided
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        if not return_dict:
            return (loss, logits) if loss is not None else logits

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [45]:
model = BertWithCustomHead.from_pretrained(model_name, num_labels=3, torch_dtype=torch.float32, problem_type="single_label_classification")

Some weights of BertWithCustomHead were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'classifier.5.bias', 'classifier.5.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [47]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback

BATCH_SIZE = 32
WARMUP_STEPS = 100
n_epochs_for_warmup = round(BATCH_SIZE * WARMUP_STEPS / len(dataset["train"]))

training_args = TrainingArguments(
    output_dir="./roman-classifier",  # Directory to save model checkpoints
    eval_strategy="steps",
    eval_steps=20,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=n_epochs_for_warmup,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
20,No log,1.098612,0.5,0.337838,0.5,0.38
40,No log,1.098219,0.519231,0.557418,0.519231,0.473752
60,No log,1.097596,0.5,0.51,0.5,0.36725
80,No log,1.096686,0.480769,0.505656,0.480769,0.331442
100,No log,1.095181,0.461538,0.213018,0.461538,0.291498


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=102, training_loss=1.0978881237553615, metrics={'train_runtime': 5.4338, 'train_samples_per_second': 571.972, 'train_steps_per_second': 18.771, 'total_flos': 523068940800.0, 'train_loss': 1.0978881237553615, 'epoch': 6.0})

In [48]:
MAX_EPOCHS = 100
training_args = TrainingArguments(
    output_dir="./roman-classifier",  # Directory to save model checkpoints
    eval_strategy="steps",
    eval_steps=20,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCHS,
    save_strategy="best",
    logging_dir="./logs",
    logging_steps=20,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
20,1.0953,1.09309,0.615385,0.54021,0.615385,0.526461
40,1.093,1.090851,0.730769,0.57996,0.730769,0.635835
60,1.0909,1.088029,0.75,0.587838,0.75,0.651639
80,1.0881,1.084136,0.75,0.587838,0.75,0.651639
100,1.0831,1.078781,0.75,0.587838,0.75,0.651639
120,1.0787,1.072038,0.75,0.587838,0.75,0.651639
140,1.0708,1.063475,0.75,0.587838,0.75,0.651639
160,1.0641,1.052493,0.75,0.587838,0.75,0.651639
180,1.052,1.039159,0.75,0.587838,0.75,0.651639
200,1.0374,1.02279,0.826923,0.874126,0.826923,0.794773


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=480, training_loss=0.9284395416577657, metrics={'train_runtime': 26.4279, 'train_samples_per_second': 1960.047, 'train_steps_per_second': 64.326, 'total_flos': 2462530483200.0, 'train_loss': 0.9284395416577657, 'epoch': 28.235294117647058})

In [50]:
trainer.evaluate(dataset["test"])

{'eval_loss': 0.9240568280220032,
 'eval_accuracy': 0.9482758620689655,
 'eval_precision': 0.9534482758620689,
 'eval_recall': 0.9482758620689655,
 'eval_f1': 0.9467634603750755,
 'eval_runtime': 0.2888,
 'eval_samples_per_second': 200.853,
 'eval_steps_per_second': 6.926,
 'epoch': 28.235294117647058}

In [51]:
trainer.save_model("./roman-classifier")

In [53]:
config = model.config