In [None]:
import pandas as pd
import torch
import torch.nn as nn
from collections import defaultdict
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

# Import learning rate scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning import LightningModule
from sklearn.model_selection import KFold
import numpy as np

from sentence_transformers import SentenceTransformer

# Create a regressor

In [None]:
# --- Cross-validation / batching ---
K_FOLDS = 10  # number of KFold splits; one model is trained per fold
BATCH_SIZE = 32  # batch size of the train/validation DataLoaders

# --- MLP regressor architecture ---
EMBEDDING_DIM = 768  # presumably the encoder's sentence-embedding width — TODO confirm
INPUT_SIZE = 2 * EMBEDDING_DIM  # question and answer embeddings are concatenated
LAYER_NUM = 7  # hidden Linear blocks stacked after the input projection
DROPOUT_RATE = 0.2

# --- Optimizer (Adam) ---
LEARNING_RATE = 0.01
WEIGHT_DECAY = 0.01

# --- ReduceLROnPlateau scheduler ---
LRS_PATIENCE = 10
LRS_FACTOR = 0.5

# --- EarlyStopping callback ---
ES_PATIENCE = 50

# Upper bound on the number of training epochs per fold
MAX_EPOCHS = 1000

In [None]:
# Read the labelled training set, using the "ID" column as the row index
TRAIN_CSV_PATH = "/kaggle/input/nlp-2025-midterm-kaggle-asas/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col="ID")
train_df

In [None]:
# Assign a validation fold to each row.
# KFold.split yields *positional* indices, while train_df is indexed by the "ID"
# column, so the fold ids are collected in a positional array instead of being
# written through label-based .loc (which would mark the wrong rows, or raise,
# unless the IDs happen to be exactly 0..n-1).
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
for fold, (_train_idx, val_idx) in enumerate(kfold.split(train_df)):
    fold_ids[val_idx] = fold
train_df["fold"] = fold_ids

# Every row must end up in exactly one validation fold
assert (train_df["fold"] >= 0).all(), "some rows were not assigned to a fold"

In [None]:
class ParaphraseMMPNETDataset(Dataset):
    """Map-style dataset yielding raw question/answer text (and optionally the score).

    No embedding is computed here: each item only carries the two strings, and
    the consumer is responsible for encoding them.

    Args:
        df: Frame with "question" and "answer" columns, plus a "score" column
            when ``has_score`` is True.
        has_score: Whether items should include the regression target.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        has_score: bool = True,
    ) -> None:
        self.df = df
        self.has_score = has_score

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``inputs`` or ``(inputs, score)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        model_inputs = {"question": row["question"], "answer": row["answer"]}

        if not self.has_score:
            return model_inputs

        # np.float32(...) accepts NumPy and plain Python scalars alike, whereas
        # calling .astype on the value fails when pandas hands back a Python number.
        return model_inputs, np.float32(row["score"])

In [None]:
class MLPRegressor(LightningModule):
    """MLP regression head on top of frozen sentence embeddings.

    Question and answer texts are encoded with a SentenceTransformer, the two
    embeddings are concatenated, and an MLP ending in a sigmoid maps them to a
    prediction that is scaled by 5 (i.e. into the 0..5 range).

    Args:
        input_size: Width of the concatenated (question + answer) embedding.
        layer_num: Number of hidden Linear/LeakyReLU/Dropout blocks.
        dropout_rate: Dropout probability used in every hidden block.
    """

    def __init__(self, input_size: int, layer_num: int, dropout_rate: float) -> None:
        super().__init__()
        hidden_size = input_size // 2

        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(),
        )
        # Sub-module names are unchanged so state_dict keys stay compatible.
        for i in range(layer_num):
            self.model.add_module(f"linear_{i}", nn.Linear(hidden_size, hidden_size))
            self.model.add_module(f"relu_{i}", nn.LeakyReLU())
            self.model.add_module(f"dropout_{i}", nn.Dropout(dropout_rate))
            if i % 3 == 0:
                # Batch norm is only inserted after blocks 0, 3, 6, ...
                self.model.add_module(f"batchnorm_{i}", nn.BatchNorm1d(hidden_size))
        self.model.add_module("output", nn.Linear(hidden_size, 1))
        self.model.add_module("sigmoid", nn.Sigmoid())

        self.embedder_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
        # forward() consumes the encoder output as NumPy arrays, so no gradient
        # ever reaches the encoder; mark it frozen explicitly to make that clear.
        self.embedder_model.requires_grad_(False)

        # Initialize the weights of the MLP head only
        self.model.apply(self._init_weights)

        self.loss_fn = nn.MSELoss(reduction="mean")

    def _init_weights(self, layer):
        """Xavier-uniform weights and zero biases for every Linear layer."""
        if isinstance(layer, nn.Linear):
            torch.nn.init.xavier_uniform_(layer.weight)
            torch.nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Encode ``x["question"]`` / ``x["answer"]`` and return predictions of shape (batch, 1)."""
        question_embeds = self.embedder_model.encode(x["question"], show_progress_bar=False)
        answer_embeds = self.embedder_model.encode(x["answer"], show_progress_bar=False)
        embeddings = np.concatenate([question_embeds, answer_embeds], axis=-1)
        # Sigmoid output lies in (0, 1); scale it by 5
        return self.model(torch.tensor(embeddings, device=self.device)) * 5.0

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE training loss and the current learning rate."""
        x, y = batch
        # squeeze(-1) drops only the feature axis, so a batch of one sample keeps
        # shape (1,) instead of collapsing to a 0-d tensor.
        y_hat = self(x).squeeze(-1)
        loss = self.loss_fn(y_hat, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

        # Log the learning rate
        self.log(
            "learning_rate",
            self.trainer.optimizers[0].param_groups[0]["lr"],
            on_step=True,
            on_epoch=False,
            prog_bar=True,
        )

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the MSE validation loss (monitored as "val_loss")."""
        x, y = batch
        y_hat = self(x).squeeze(-1)
        loss = self.loss_fn(y_hat, y)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam over the MLP head, with ReduceLROnPlateau driven by "val_loss"."""
        # Only the head is trainable; the encoder never receives gradients.
        optimizer = torch.optim.Adam(self.model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        # `verbose` is deprecated since PyTorch 2.2; the learning rate is already
        # logged on every training step, so nothing is lost by dropping it.
        scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=LRS_FACTOR, patience=LRS_PATIENCE)
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "val_loss",
        }

In [None]:
# Train one model per fold; the trainers are kept so that the best checkpoint
# path of every fold can be looked up afterwards.
trainers = []
for current_fold in range(K_FOLDS):
    model = MLPRegressor(input_size=INPUT_SIZE, layer_num=LAYER_NUM, dropout_rate=DROPOUT_RATE)

    train_fold_df = train_df[train_df["fold"] != current_fold]
    val_fold_df = train_df[train_df["fold"] == current_fold]

    # Create the datasets and dataloaders
    train_dataset = ParaphraseMMPNETDataset(train_fold_df)
    val_dataset = ParaphraseMMPNETDataset(val_fold_df)
    # drop_last stays on for training: BatchNorm1d cannot process a single-sample
    # batch in training mode.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    # Validation must see every row of the fold: val_loss drives checkpointing,
    # LR scheduling and early stopping, so no samples are dropped here.
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Create logger
    logger = CSVLogger(
        "logs/paraphrase-multilingual-mpnet-base-v2", name=f"paraphrase-multilingual-mpnet-base-v2-{current_fold}"
    )

    # Create a checkpoint callback (keeps only the epoch with the lowest val_loss)
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=f"./logs/paraphrase-multilingual-mpnet-base-v2/paraphrase-multilingual-mpnet-base-v2-{current_fold}/checkpoints",
        filename="paraphrase-multilingual-mpnet-base-v2-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        mode="min",
    )

    # Create an early stopping callback
    early_stopping = EarlyStopping(monitor="val_loss", patience=ES_PATIENCE, mode="min")

    # Create a trainer
    trainer = Trainer(max_epochs=MAX_EPOCHS, logger=logger, callbacks=[checkpoint_callback, early_stopping])

    # Fit the model
    trainer.fit(model, train_loader, val_loader)

    # Append the trainer
    trainers.append(trainer)

In [None]:
# Restore, for every fold, the checkpoint its ModelCheckpoint callback ranked best
regressor_kwargs = dict(
    input_size=INPUT_SIZE,
    layer_num=LAYER_NUM,
    dropout_rate=DROPOUT_RATE,
)

best_models = []
for current_fold in range(K_FOLDS):
    best_ckpt_path = trainers[current_fold].checkpoint_callback.best_model_path
    model = MLPRegressor.load_from_checkpoint(best_ckpt_path, **regressor_kwargs)
    best_models.append(model)

# Create submission

In [None]:
# Read the unlabelled test set and wrap it in a dataset that yields text only
test_df = pd.read_csv("/kaggle/input/nlp-2025-midterm-kaggle-asas/test.csv", index_col="ID")

test_dataset = ParaphraseMMPNETDataset(test_df, has_score=False)
test_loader = DataLoader(test_dataset, batch_size=4)

# One list of per-row predictions for each fold model
fold_predictions = []
for model in best_models:
    model.eval()
    model.freeze()

    model_predictions = []
    with torch.no_grad():
        for x in test_loader:
            # reshape(-1) always yields a 1-D tensor. A bare squeeze() turns a
            # final single-row batch into a 0-d tensor, whose tolist() is a plain
            # float that list.extend() cannot consume.
            y_hat = model(x).reshape(-1)
            model_predictions.extend(y_hat.tolist())

    fold_predictions.append(model_predictions)

# Average the predictions across the fold models
predictions = np.mean(fold_predictions, axis=0)

test_df["score"] = predictions
test_df[["score"]].to_csv("submission.csv")