In [None]:
!pip install transformers sentencepiece scikit-learn pandas torch --quiet


In [None]:
from google.colab import files

# Upload both `train.jsonl` and `test.jsonl`
uploaded = files.upload()

# Colab does not overwrite an existing file: a re-upload is saved under a
# renamed copy such as "train (2).jsonl", which would leave a stale
# "train.jsonl" on disk for the loading cell to pick up. Write the freshly
# uploaded bytes back to the original file names so later cells always read
# the data that was just uploaded.
for original_name, content in uploaded.items():
    with open(original_name, "wb") as fh:
        fh.write(content)


Saving test.jsonl to test (1).jsonl
Saving train.jsonl to train (2).jsonl


In [None]:
# 📚 Imports
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # ✅ Correct import
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm


In [None]:
# 💻 Device
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [None]:
# 📂 Load JSONL files
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are not
        # records; skipping them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]

# Parse both splits, then wrap the raw records in DataFrames.
train_data, test_data = (load_jsonl(name) for name in ("train.jsonl", "test.jsonl"))

# Cast the training labels to integers while building the frame.
train_df = pd.DataFrame(train_data).astype({"label": int})
test_df = pd.DataFrame(test_data)

# Show how many training examples fall into each class.
print(train_df['label'].value_counts())

label
1    2391
0    1404
2    1205
Name: count, dtype: int64


In [None]:

# 🔠 Tokenizer
# Load the pretrained tokenizer published under "microsoft/deberta-v3-base";
# the same checkpoint name is used for the classification model below.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

In [None]:
# 🧺 Dataset class
class RedditDataset(Dataset):
    """Dataset that tokenizes (context, target) text pairs from a DataFrame.

    Each item is a dict with fixed-length ``input_ids`` and ``attention_mask``
    tensors of shape ``(max_len,)``. When the frame has a ``label`` column the
    item also carries a scalar ``labels`` tensor of dtype ``torch.long``.

    Args:
        df: DataFrame with ``context`` and ``target`` columns and, optionally,
            a ``label`` column.
        tokenizer: Callable tokenizer invoked on the text pair with
            ``return_tensors="pt"``.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        row = self.df.iloc[index]
        encoded = self.tokenizer(
            row["context"],
            row["target"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Drop only the leading batch dimension added by return_tensors="pt".
        # A bare .squeeze() would also collapse the sequence dimension when
        # max_len == 1 and hand the collate function 0-d tensors.
        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0)
        }
        # Unlabelled frames (e.g. the test split) simply omit "labels".
        if "label" in row:
            item["labels"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

    def __len__(self):
        return len(self.df)

In [None]:
# 🔁 5-Fold Stratified Split
# Shuffled, seeded 5-way split that stratifies on the label column.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train indices, validation indices) pairs once so the same
# folds can be iterated again later.
splits = [
    (train_idx, val_idx)
    for train_idx, val_idx in kfold.split(train_df, train_df["label"])
]

In [None]:

# 🧠 Training Function
def train_model(train_dataset, val_dataset, epochs=3, lr=2e-5, batch_size=16,
                checkpoint_path="best_model.pt"):
    """Fine-tune a 3-class DeBERTa-v3-base classifier and return the best epoch.

    After every epoch the model is scored on ``val_dataset`` with weighted F1.
    The weights of the best-scoring epoch are kept in memory, written to
    ``checkpoint_path`` and loaded back into the model before it is returned,
    so the caller receives the best epoch rather than simply the last one.

    Args:
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        val_dataset: Dataset with the same item structure, used for scoring.
        epochs: Number of passes over ``train_dataset``.
        lr: AdamW learning rate.
        batch_size: Batch size for both data loaders.
        checkpoint_path: File the best state dict is saved to. Note that every
            call using the same path overwrites the previous file.

    Returns:
        The model on ``device`` carrying the best-epoch weights.
    """
    model = DebertaV2ForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-base", num_labels=3
    ).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Start below any reachable F1 so the first epoch is always checkpointed,
    # even when its score is exactly 0.
    best_f1 = float("-inf")
    best_state = None
    for epoch in range(epochs):
        model.train()
        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                logits = model(input_ids, attention_mask=attention_mask).logits
                preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                targets.extend(labels.cpu().numpy())

        f1 = f1_score(targets, preds, average='weighted')
        print(f"Epoch {epoch+1} F1: {f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            # Snapshot on the CPU: state_dict() returns references to the live
            # parameters, which later optimizer steps would otherwise mutate.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            torch.save(best_state, checkpoint_path)

    # Roll back to the best epoch; without this a later, worse epoch would be
    # the one that gets returned.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model


In [None]:
# 🔁 Train 5 models (1 per fold)
# One fine-tuned model per fold, collected in fold order.
models = []
for fold, (train_idx, val_idx) in enumerate(splits):
    print(f"\n📚 Fold {fold+1}")
    # Build the training and validation views of this fold from its row indices.
    train_ds, val_ds = (
        RedditDataset(train_df.iloc[rows], tokenizer) for rows in (train_idx, val_idx)
    )
    model = train_model(train_ds, val_ds)
    models.append(model)


📚 Fold 1


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/250 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|█▏        | 31/250 [00:25<03:04,  1.19it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▍       | 62/250 [00:52<02:42,  1.15it/s]Be aware, overflowing tokens are not returned for the setting y

Epoch 1 F1: 0.3092


  2%|▏         | 5/250 [00:03<03:20,  1.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 30%|███       | 76/250 [01:04<02:28,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 38%|███▊      | 95/250 [01:21<02:11,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 47%|████▋     | 117/250 [01:39<01:53,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 2 F1: 0.5186


  1%|          | 3/250 [00:02<03:07,  1.32it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 14%|█▍        | 35/250 [00:29<03:06,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 40/250 [00:33<03:02,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▌       | 63/250 [00:53<02:40,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

Epoch 3 F1: 0.5449

📚 Fold 2


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 27%|██▋       | 67/250 [00:57<02:36,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 31%|███       | 78/250 [01:06<02:27,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 40%|████      | 100/250 [01:25<02:07,  1.17it/s]Be aware, overflowing tokens are not returned for the

Epoch 1 F1: 0.3092


  1%|          | 3/250 [00:02<03:06,  1.32it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 13%|█▎        | 33/250 [00:27<03:07,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 57%|█████▋    | 142/250 [02:01<01:32,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 66%|██████▌   | 165/250 [02:21<01:12,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunc

Epoch 2 F1: 0.5550


  9%|▉         | 22/250 [00:18<03:15,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▉         | 23/250 [00:19<03:15,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 42/250 [00:35<03:01,  1.14it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 35%|███▌      | 88/250 [01:15<02:17,  1.18it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 3 F1: 0.6007

📚 Fold 3


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 43/250 [00:36<03:00,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|█▉        | 49/250 [00:41<02:54,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs wit

Epoch 1 F1: 0.4520


  7%|▋         | 18/250 [00:14<03:18,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|▊         | 19/250 [00:15<03:18,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▍        | 37/250 [00:31<03:05,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 40/250 [00:33<03:02,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 2 F1: 0.5275


  8%|▊         | 19/250 [00:15<03:17,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|█▏        | 29/250 [00:24<03:11,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 53/250 [00:45<02:50,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|██▌       | 65/250 [00:55<02:38,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 3 F1: 0.5516

📚 Fold 4


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▎         | 9/250 [00:07<03:22,  1.19it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 10%|▉         | 24/250 [00:20<03:14,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with

Epoch 1 F1: 0.3092


  1%|          | 2/250 [00:01<02:47,  1.48it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 40/250 [00:34<03:02,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 43/250 [00:36<02:59,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|█▉        | 49/250 [00:41<02:54,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

Epoch 2 F1: 0.3114


 10%|█         | 25/250 [00:20<03:13,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 14%|█▎        | 34/250 [00:28<03:07,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▌       | 63/250 [00:53<02:40,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 32%|███▏      | 81/250 [01:09<02:24,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 3 F1: 0.3092

📚 Fold 5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 3/250 [00:02<03:02,  1.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  2%|▏         | 5/250 [00:03<03:18,  1.23it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  6%|▌         | 14/250 [00:11<03:21,  1.17it/s]Be aware, overflowing tokens are not returned for the se

Epoch 1 F1: 0.4448


 14%|█▍        | 35/250 [00:29<03:06,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▍        | 37/250 [00:31<03:05,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|██▎       | 58/250 [00:49<02:45,  1.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 30%|██▉       | 74/250 [01:03<02:30,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 2 F1: 0.4915


  0%|          | 0/250 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|▋         | 18/250 [00:14<03:18,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|██▋       | 66/250 [00:56<02:37,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 32%|███▏      | 79/250 [01:07<02:25,  1.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation stra

Epoch 3 F1: 0.5367


In [None]:
# 🧪 Predict on test set using all 5 models
# Wrap the unlabelled test frame; batches keep the frame's row order.
test_ds = RedditDataset(test_df, tokenizer)
test_loader = DataLoader(test_ds, batch_size=16)

# final_logits[i] holds the stacked logits of models[i] for every test row.
final_logits = []
for model in models:
    model.eval()
    with torch.no_grad():
        logits_all = [
            model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits.cpu().numpy()
            for batch in test_loader
        ]
    final_logits.append(np.vstack(logits_all))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# 📊 Average predictions
# Average the per-model logits over the model axis, then pick the
# highest-scoring class for each test row.
avg_logits = np.stack(final_logits).mean(axis=0)
test_preds = avg_logits.argmax(axis=1)

In [None]:
# 📝 Save submission file
# Pair each test ID with its predicted class and write the result to CSV
# without the index column.
submission = test_df[["ID"]].assign(label=test_preds)
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,ID,label
0,5000,2
1,5001,0
2,5002,1
3,5003,1
4,5004,0


In [None]:
# Colab-only helper: hand the written submission file to the browser as a download.
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>