### 💡 Instructions
- Upload your `dataset/` and `models/` folders to the Colab environment.
- Adjust paths to match your directory structure (e.g., `'dataset/train.csv'`).
- Run cells to train, evaluate, or predict using the ensemble model.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers scikit-learn tqdm




In [3]:
import os
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/NLP Project')

from dataset.dataset import get_dataloaders
from Models.plain_bert import PlainBertClassifier
from Models.roberta_classifier import RobertaClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


In [4]:
def train_model(model, train_loader, val_loader, tokenizer, model_name, device, epochs=3, lr=2e-5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training {model_name} Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs['loss']
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")
        torch.save(model.state_dict(), f"{model_name}_best.pt")


In [7]:
def predict(model, dataloader, device):
    model.eval()
    predictions, true_labels, masks = [], [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            preds = torch.argmax(F.softmax(logits, dim=-1), dim=-1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
            masks.extend(attention_mask.cpu().tolist())

    return predictions, true_labels, masks

def ensemble_predictions(preds_list):
    final_preds = []
    for tokens in zip(*preds_list):
        batch_preds = []
        for token_preds in zip(*tokens):
            vote = torch.mode(torch.tensor(token_preds)).values.item()
            batch_preds.append(vote)
        final_preds.append(batch_preds)
    return final_preds

def compute_metrics(predictions, labels, mask):
    all_preds, all_labels = [], []
    for pred, label, attn in zip(predictions, labels, mask):
        for p, l, m in zip(pred, label, attn):
            if m == 1:
                all_preds.append(p)
                all_labels.append(l)
    return {
        "precision": precision_score(all_labels, all_preds, average='macro', zero_division=0),
        "recall": recall_score(all_labels, all_preds, average='macro', zero_division=0),
        "f1": f1_score(all_labels, all_preds, average='macro', zero_division=0),
    }


In [8]:
def write_submission(predictions, filename):
    df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP Project/dataset/eval_w_o_labels.csv")
    result = []

    for i, row in df.iterrows():
        tokenized = eval(row['tokenized_sentence'])
        labels = predictions[i][:len(tokenized)]
        language = row['language']
        idx = row['id']

        idiom_indices = []
        current = []
        for j, label in enumerate(labels):
            if label == 1:
                if current:
                    idiom_indices.extend(current)
                    current = []
                current = [j]
            elif label == 2:
                if current:
                    current.append(j)
            else:
                if current:
                    idiom_indices.extend(current)
                    current = []
        if current:
            idiom_indices.extend(current)

        result.append({
            "id": idx,
            "indices": idiom_indices if idiom_indices else [-1],
            "language": language
        })

    pd.DataFrame(result).to_csv(filename, index=False)
    print(f"✅ Submission written to: {filename}")


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

save_dir = "/content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts"
os.makedirs(save_dir, exist_ok=True)

train_loader, val_loader, tokenizer = get_dataloaders(
    train_path="/content/drive/MyDrive/Colab Notebooks/NLP Project/dataset/train.csv",
    val_path="/content/drive/MyDrive/Colab Notebooks/NLP Project/dataset/eval.csv",
    batch_size=8
)

model_a = PlainBertClassifier().to(device)
model_b = RobertaClassifier().to(device)

train_model(model_a, train_loader, val_loader, tokenizer, os.path.join(save_dir, "plain_bert"), device)
train_model(model_b, train_loader, val_loader, tokenizer, os.path.join(save_dir, "roberta"), device)

"""model_a.load_state_dict(torch.load("plain_bert_best.pt"))
model_b.load_state_dict(torch.load("roberta_best.pt"))"""

val_loader_nolabels = DataLoader(val_loader.dataset, batch_size=8)

preds_a, labels, mask = predict(model_a, val_loader_nolabels, device)
metrics_a = compute_metrics(preds_a, labels, mask)

preds_b, _, _ = predict(model_b, val_loader_nolabels, device)
metrics_b = compute_metrics(preds_b, labels, mask)

final_preds = ensemble_predictions([preds_a, preds_b])
write_submission(preds_a, filename="/content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/plain_bert_prediction.csv")
write_submission(preds_b, filename="/content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/roberta_prediction.csv")
write_submission(final_preds, filename="/content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/ensemble_prediction.csv")

print("✅ Submission written to Submissions/prediction.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/plain_bert Epoch 1: 100%|██████████| 1440/1440 [01:22<00:00, 17.52it/s]


Epoch 1 Loss: 0.1776


Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/plain_bert Epoch 2: 100%|██████████| 1440/1440 [01:21<00:00, 17.75it/s]


Epoch 2 Loss: 0.0980


Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/plain_bert Epoch 3: 100%|██████████| 1440/1440 [01:21<00:00, 17.72it/s]


Epoch 3 Loss: 0.0703


Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/roberta Epoch 1: 100%|██████████| 1440/1440 [01:31<00:00, 15.73it/s]


Epoch 1 Loss: 0.3442


Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/roberta Epoch 2: 100%|██████████| 1440/1440 [01:31<00:00, 15.73it/s]


Epoch 2 Loss: 0.2153


Training /content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/roberta Epoch 3: 100%|██████████| 1440/1440 [01:31<00:00, 15.74it/s]


Epoch 3 Loss: 0.1697


Evaluating: 100%|██████████| 180/180 [00:02<00:00, 60.12it/s]
Evaluating: 100%|██████████| 180/180 [00:02<00:00, 60.06it/s]


✅ Submission written to: /content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/plain_bert_prediction.csv
✅ Submission written to: /content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/roberta_prediction.csv


OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/Colab Notebooks/NLP Project/submission'

In [6]:
print("📊 Plain BERT Metrics:")
print(metrics_a)

print("📊 RoBERTa Metrics:")
print(metrics_b)

ensemble_preds = ensemble_predictions([preds_a, preds_b])
metrics_ensemble = compute_metrics(ensemble_preds, labels, mask)

print("📊 Ensemble Metrics:")
print(metrics_ensemble)

📊 Plain BERT Metrics:


NameError: name 'metrics_a' is not defined

In [None]:
"""!cp plain_bert_best.pt /content/drive/MyDrive/Colab\ Notebooks/NLP\ Project/Models/
!cp roberta_best.pt /content/drive/MyDrive/Colab\ Notebooks/NLP\ Project/Models/"""


In [10]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/NLP Project')

from Models.plain_bert import PlainBertClassifier
from Models.roberta_classifier import RobertaClassifier

In [11]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Plain BERT
model_a = PlainBertClassifier().to(device)
model_a.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/plain_bert_best.pt", map_location=device))

# RoBERTa
model_b = RobertaClassifier().to(device)
model_b.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/NLP Project/Models/saved_pts/roberta_best.pt", map_location=device))

<All keys matched successfully>

In [12]:
model_a.eval()
model_b.eval()


RobertaClassifier(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [14]:
val_loader_nolabels = DataLoader(val_loader.dataset, batch_size=8)

preds_a, labels, mask = predict(model_a, val_loader_nolabels, device)
metrics_a = compute_metrics(preds_a, labels, mask)

preds_b, _, _ = predict(model_b, val_loader_nolabels, device)
metrics_b = compute_metrics(preds_b, labels, mask)

final_preds = ensemble_predictions([preds_a, preds_b])
write_submission(final_preds, filename="/content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/ensemble_prediction.csv")


Evaluating: 100%|██████████| 180/180 [00:03<00:00, 58.92it/s]
Evaluating: 100%|██████████| 180/180 [00:03<00:00, 59.64it/s]


✅ Submission written to: /content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/ensemble_prediction.csv


In [13]:
"""val_loader_nolabels = DataLoader(val_loader.dataset, batch_size=8)

preds_a = predict(model_a, val_loader_nolabels, device)
preds_b = predict(model_b, val_loader_nolabels, device)

final_preds = ensemble_predictions([preds_a, preds_b])

write_submission(final_preds, filename='/content/drive/MyDrive/Colab Notebooks/NLP Project/Submissions/ensemble_prediction.csv')"""


Evaluating: 100%|██████████| 180/180 [00:03<00:00, 59.29it/s]
Evaluating: 100%|██████████| 180/180 [00:03<00:00, 52.41it/s]


RuntimeError: a Tensor with 2 elements cannot be converted to Scalar

In [None]:
print("📊 Plain BERT Metrics:")
print(metrics_a)

print("📊 RoBERTa Metrics:")
print(metrics_b)

ensemble_preds = ensemble_predictions([preds_a, preds_b])
metrics_ensemble = compute_metrics(ensemble_preds, labels, mask)

print("📊 Ensemble Metrics:")
print(metrics_ensemble)