In [18]:
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, AdamW
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm


In [19]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [20]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them rather than letting json.loads raise.
            if line:
                records.append(json.loads(line))
    return records

# Read both splits from JSON Lines files in the working directory.
train_data = load_jsonl("train.jsonl")
test_data = load_jsonl("test.jsonl")

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Cast labels to integers up front so every later consumer sees one dtype.
train_df["label"] = train_df["label"].astype(int)

# Kept as the last expression of the cell so the class distribution is
# actually rendered; a bare expression in the middle of a cell is discarded.
train_df["label"].value_counts()


In [21]:
# Tokenizer for the same checkpoint name that the classifier is loaded from.
pretrained_name = "microsoft/deberta-v3-base"
tokenizer = DebertaV2Tokenizer.from_pretrained(pretrained_name)

class RedditDataset(Dataset):
    """Map-style dataset that tokenizes a (context, target) text pair on access.

    Args:
        df: DataFrame with "context" and "target" columns, plus an optional
            "label" column. Rows are addressed by position (``iloc``).
        tokenizer: Callable invoked as ``tokenizer(context, target, ...)`` that
            returns a mapping with "input_ids" and "attention_mask" tensors.
        max_len: Length every encoded pair is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded pair at ``index`` (and its label when present).

        Returns:
            Dict with "input_ids" and "attention_mask"; "labels" is added as a
            0-d ``torch.long`` tensor only when the row has a "label" entry.
        """
        row = self.df.iloc[index]
        inputs = self.tokenizer(
            row["context"],
            row["target"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors="pt"; a bare squeeze() would also drop the
            # sequence axis whenever it happens to have length 1.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0)
        }
        # Membership on a row Series checks its index, i.e. the column names.
        if "label" in row:
            # Explicit integer dtype: the loss expects class indices.
            item["labels"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)


In [22]:
# Stratified 5-fold split on the label column; the fixed random_state makes
# the shuffled fold assignment repeatable across runs.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_labels = train_df["label"]
splits = [
    (train_idx, val_idx)
    for train_idx, val_idx in kfold.split(train_df, fold_labels)
]


In [23]:
def train_model(train_dataset, val_dataset, num_epochs=3, batch_size=16, lr=2e-5,
                checkpoint_path="best_model.pt"):
    """Fine-tune DeBERTa-v3-base on one split and return it with its best weights.

    After every epoch the model is scored on ``val_dataset`` with weighted F1.
    The weights of the best-scoring epoch are written to ``checkpoint_path``
    and restored into the model before it is returned, so the caller gets the
    best epoch rather than whatever the final epoch produced.

    Args:
        train_dataset: Dataset yielding dicts with "input_ids",
            "attention_mask" and "labels".
        val_dataset: Dataset with the same item layout, used for scoring.
        num_epochs: Number of passes over ``train_dataset``.
        batch_size: Batch size for both loaders.
        lr: AdamW learning rate.
        checkpoint_path: File the best state dict is saved to. Calls that
            share a path overwrite each other's file.

    Returns:
        The fine-tuned 3-label classifier, on ``device``, holding the weights
        of the epoch with the highest validation F1.
    """
    model = DebertaV2ForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-base", num_labels=3
    ).to(device)
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are pinned to that optimizer's defaults so the update rule
    # stays the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Start below any reachable score so the first epoch is always
    # checkpointed, even when its F1 is exactly 0.0.
    best_f1 = -1.0
    best_state = None
    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
                targets.extend(batch["labels"].tolist())

        f1 = f1_score(targets, preds, average='weighted')
        print(f"Epoch {epoch+1} F1: {f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            # Snapshot on CPU: state_dict() returns live references that later
            # optimizer steps would otherwise overwrite.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            torch.save(best_state, checkpoint_path)

    # Roll back to the best epoch; without this the returned model carries
    # the last epoch's weights even when an earlier epoch scored higher.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model


In [24]:
# Train one model per cross-validation fold; each trained model is appended
# to `models` in fold order.
models = []
for fold, fold_indices in enumerate(splits):
    train_idx, val_idx = fold_indices
    print(f"\n📚 Fold {fold+1}")

    # Slice the fold's rows by position, then wrap them for the DataLoader.
    train_rows = train_df.iloc[train_idx]
    val_rows = train_df.iloc[val_idx]
    train_ds = RedditDataset(train_rows, tokenizer)
    val_ds = RedditDataset(val_rows, tokenizer)

    model = train_model(train_ds, val_ds)
    models.append(model)



📚 Fold 1


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 11%|█         | 28/250 [04:36<35:12,  9.52s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 22%|██▏       | 55/250 [08:53<31:27,  9.68s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▍       | 62/250 [10:00<29:50,  9.53s/it]Be aware, overflowing tokens are not returned for the 

Epoch 1 F1: 0.4015


 15%|█▍        | 37/250 [05:46<33:16,  9.37s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 24%|██▍       | 61/250 [09:32<29:36,  9.40s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|██▋       | 66/250 [10:19<28:47,  9.39s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|██▊       | 70/250 [10:56<28:07,  9.38s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Epoch 2 F1: 0.5385


  1%|          | 2/250 [00:20<42:07, 10.19s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|▋         | 17/250 [02:53<37:53,  9.76s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 18%|█▊        | 46/250 [08:01<35:26, 10.42s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 27%|██▋       | 68/250 [11:42<29:27,  9.71s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

Epoch 3 F1: 0.5316

📚 Fold 2


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 8/250 [01:49<55:02, 13.65s/it]


KeyboardInterrupt: 

In [None]:
# Loader over the test frame; no shuffling, so row order matches test_df.
test_ds = RedditDataset(test_df, tokenizer)
test_loader = DataLoader(test_ds, batch_size=16)

# One stacked logit matrix per model in `models`.
final_logits = []

with torch.no_grad():
    for model in models:
        model.eval()
        logits_all = [
            model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits.cpu().numpy()
            for batch in test_loader
        ]
        # Join the per-batch arrays along the row axis.
        final_logits.append(np.concatenate(logits_all, axis=0))

# Average predictions
avg_logits = np.asarray(final_logits).mean(axis=0)
test_preds = avg_logits.argmax(axis=1)


In [None]:
# Submission table: each test ID next to its predicted label, written to CSV
# without the index column; the head is shown as the cell's output.
submission = pd.DataFrame({"ID": test_df["ID"]}).assign(label=test_preds)
submission.to_csv("submission.csv", index=False)
submission.head()