# Load Libraries and Define Configs

In [None]:
import os
import numpy as np
import pandas as pd
import gc
import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt

# Set the environment variable for CUDA devices: make GPUs 0 and 1 visible to
# this process (get_embeddings below places its model on "cuda:1").
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Load the Data

In [None]:
# Training essays; the "score" column is the target used for stratification
# and model fitting further down.
train = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")
print("Train shape", train.shape)
# Only an explicit print shows the preview here: a bare `train.head()` in the
# middle of a cell is evaluated and then thrown away.
print(train.head())

# Test essays to be scored for the submission.
test = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")
print("Test shape", test.shape)
print(test.head())

# Stratified 15-Fold Split

In [None]:
FOLDS = 15
train["fold"] = -1  # -1 marks rows that have not been assigned to a fold yet
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
fold_col = train.columns.get_loc("fold")
for fold, (train_index, val_index) in enumerate(skf.split(train, train["score"])):
    # split() yields positional row indices, so write by position (iloc)
    # rather than by label (loc); the two only coincide while `train` keeps
    # its default RangeIndex.
    train.iloc[val_index, fold_col] = fold
print('Train samples per fold:')
train.fold.value_counts().sort_index()

# Generate Embeddings

In [None]:
# Helper function for mean pooling
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: object exposing ``last_hidden_state``; the tensor is
            detached and moved to the CPU before pooling.
        attention_mask: mask with one entry per token (presumably 1 for real
            tokens and 0 for padding); it is broadcast over the embedding
            dimension, so it must live on the CPU as well.

    Returns:
        Tensor with the masked sum over dimension 1 divided by the number of
        unmasked tokens (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

In [None]:
# Dataset class for embedding generation
class EmbedDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``full_text`` column of a DataFrame.

    Every sample is padded/truncated to ``max_length`` tokens so that samples
    can be stacked into a batch by a DataLoader.
    """

    def __init__(self, df, tokenizer, max_length):
        # Re-number the rows 0..n-1 so integer sample indices address them.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` as a dict of tensors."""
        essay = self.df.loc[idx, "full_text"]
        encoded = self.tokenizer(
            essay,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch dimension of size 1; drop it
        # so the DataLoader can add its own batch dimension.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        return sample

In [None]:
# Functions to generate embeddings
def _embed_dataframe(df, model, tokenizer, max_length, batch_size, device, show_progress=False):
    """Return L2-normalised mean-pooled embeddings, one row per row of ``df``."""
    dataset = EmbedDataset(df, tokenizer, max_length)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    batches = tqdm(loader, total=len(loader)) if show_progress else loader

    feats = []
    for batch in batches:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)
        # mean_pooling works on CPU tensors, so hand it a CPU copy of the mask.
        pooled = mean_pooling(model_output, attention_mask.detach().cpu())
        pooled = F.normalize(pooled, p=2, dim=1)
        # Keep the batch dimension: squeezing it would turn a final batch of
        # size 1 into a flat vector and corrupt the stacked feature matrix.
        feats.append(pooled.detach().cpu().numpy())

    # An empty frame yields the same empty array as a skipped split.
    return np.concatenate(feats, axis=0) if feats else np.array([])


def get_embeddings(model_name='', max_length=1024, batch_size=32, compute_train=True, compute_test=True, device="cuda:1"):
    """Embed the module-level ``train`` and ``test`` essays with one model.

    Args:
        model_name: model identifier; "/" is replaced with "_" to locate the
            local copy under /kaggle/input/download-huggingface/.
        max_length: number of tokens every essay is padded/truncated to.
        batch_size: essays per forward pass.
        compute_train: embed ``train`` when True.
        compute_test: embed ``test`` when True.
        device: torch device the model and batches are moved to.

    Returns:
        ``(train_feats, test_feats)``. Each is an array with one embedding
        row per essay, or an empty array when that split was not requested.
    """
    # Necessary since this competition does not allow Internet access
    path = "/kaggle/input/download-huggingface/"
    disk_name = path + model_name.replace("/","_")

    model = AutoModel.from_pretrained(disk_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(disk_name, trust_remote_code=True)

    try:
        model.to(device)
        model.eval()

        all_train_text_feats = np.array([])
        if compute_train:
            all_train_text_feats = _embed_dataframe(
                train, model, tokenizer, max_length, batch_size, device, show_progress=True
            )

        all_test_text_feats = np.array([])
        if compute_test:
            all_test_text_feats = _embed_dataframe(
                test, model, tokenizer, max_length, batch_size, device
            )
    finally:
        # Clear memory even if embedding failed, so the next model can load.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    return all_train_text_feats, all_test_text_feats

In [None]:
# List of models to generate embeddings.
# Each entry is (model name, max token length, batch size), unpacked in that
# order by the embedding loop below. "/" in the model name is replaced with
# "_" when building on-disk paths.
models = [
    ('microsoft/deberta-base', 1024, 32),
    ('microsoft/deberta-large', 1024, 8),
    ('microsoft/deberta-v3-large', 1024, 8),
    ('allenai/longformer-base-4096', 1024, 32),
    ('google/bigbird-roberta-base', 1024, 32),
    ('google/bigbird-roberta-large', 1024, 8),
]

In [None]:
# One embedding matrix per model, in the order the models are listed.
all_train_embeds = []
all_test_embeds = []

for model, max_length, batch_size in models:
    model_path_name = model.replace("/","_")
    full_path = '/kaggle/input/d/joaodpscorreia/essay-train-embeddings/' + model_path_name + '.npy'

    # Train embeddings are reused from the attached dataset when available;
    # test embeddings are always computed.
    train_cached = os.path.exists(full_path)
    if not train_cached:
        print(f"Computing train embeddings for {model}")

    train_embed, test_embed = get_embeddings(
        model_name=model,
        max_length=max_length,
        batch_size=batch_size,
        compute_train=not train_cached,
        compute_test=True,
    )

    if train_cached:
        train_embed = np.load(full_path)
        print(f"Loaded train embeddings for {model}")
    else:
        # Persist freshly computed train embeddings to the working directory.
        save_path = '/kaggle/working/' + model_path_name + '.npy'
        np.save(save_path, train_embed)

    all_train_embeds.append(train_embed)
    all_test_embeds.append(test_embed)

In [None]:
# Join the per-model embedding matrices along axis 1, so every row carries the
# features of all models; each name is rebound from a list of arrays to a
# single array.
all_train_embeds = np.concatenate(all_train_embeds, axis=1)
all_test_embeds = np.concatenate(all_test_embeds, axis=1)
gc.collect()  # the per-model lists are no longer referenced by these names
print('Our concatenated train embeddings have shape', all_train_embeds.shape)

# Model Training

In [None]:
# Function to compute Quadratic Weighted Kappa score
def comp_score(y_true, y_pred):
    """Return the quadratic weighted kappa between labels and predictions.

    Predictions are first clipped to the range [1, 6] and rounded to zero
    decimals before being compared with ``y_true``.
    """
    bounded = y_pred.clip(1, 6)
    rounded = bounded.round(0)
    return cohen_kappa_score(y_true, rounded, weights='quadratic')

In [None]:
# Train LightGBM Model
# Out-of-fold predictions (one per train row) and per-fold test predictions
# (one column per fold).
oof = np.zeros(len(train), dtype='float32')
test_preds = np.zeros((len(test), FOLDS), dtype='float32')

# Settings shared by every fold.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}
num_round = 500
early_stopping = 50
X_test = all_test_embeds

for fold in range(FOLDS):
    print(f'Fold {fold + 1}')

    # Boolean row masks: the current fold is held out, the rest is fitted on.
    valid_index = train["fold"] == fold
    train_index = train["fold"] != fold

    X_train, X_valid = all_train_embeds[train_index], all_train_embeds[valid_index]
    y_train = train.loc[train_index, 'score'].values
    y_valid = train.loc[valid_index, 'score'].values

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Fresh callbacks per fold; the evaluation log period equals the
    # early-stopping patience.
    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping),
        lgb.log_evaluation(early_stopping),
    ]
    bst = lgb.train(params, train_data, num_round, valid_sets=[valid_data], callbacks=callbacks)

    # Predict with the iteration count selected by early stopping.
    best_iter = bst.best_iteration
    preds = bst.predict(X_valid, num_iteration=best_iter)
    oof[valid_index] = preds
    test_preds[:, fold] = bst.predict(X_test, num_iteration=best_iter)

    score = comp_score(y_valid, preds)
    print(f"QWK score: {score}")

overall_score = comp_score(train.score.values, oof)
print('Overall CV QWK score =', overall_score)

# Find optimal QWK thresholds

In [None]:
# Function to find optimal thresholds for QWK score
def find_thresholds(true, pred, steps=50, step=0.001):
    """Tune the five cut points that turn continuous predictions into scores 1-6.

    Starting from [1.5, 2.5, 3.5, 4.5, 5.5], each threshold is moved in turn,
    first upwards and then downwards, in increments of ``step``. A move is
    kept whenever it strictly improves the quadratic weighted kappa; the
    search in one direction ends after ``steps`` consecutive non-improving
    moves, or when the threshold would reach a neighbouring threshold.

    Args:
        true: ground-truth scores.
        pred: continuous predictions aligned with ``true``.
        steps: consecutive non-improving moves tolerated per direction.
        step: size of a single threshold move.

    Returns:
        ``(best, threshold, xs, ys)``: the kappa at the tuned thresholds, the
        thresholds rounded to 3 decimals, and per-threshold lists of every
        tried value (``xs``) with the kappa it produced (``ys``).
    """
    labels = [1, 2, 3, 4, 5, 6]

    def qwk_at(cuts):
        # Bin the predictions at the given cut points and score the result.
        binned = pd.cut(pred, [-np.inf] + cuts + [np.inf], labels=labels).astype('int32')
        return cohen_kappa_score(true, binned, weights="quadratic")

    threshold = [1.5, 2.5, 3.5, 4.5, 5.5]
    xs = [[] for _ in threshold]
    ys = [[] for _ in threshold]
    best = qwk_at(threshold)

    last = len(threshold) - 1
    for k in range(len(threshold)):
        for sign in (1, -1):
            v = threshold[k]
            threshold2 = threshold.copy()
            # Only entry k changes inside the loop below, so its neighbours
            # are fixed bounds for this direction.
            lower = threshold2[k - 1] if k > 0 else -np.inf
            upper = threshold2[k + 1] if k < last else np.inf
            stop = 0
            while stop < steps:
                v += sign * step
                # pd.cut rejects duplicate or non-increasing bin edges, so
                # stop before crossing into a neighbouring threshold.
                if not lower < v < upper:
                    break
                threshold2[k] = v
                metric = qwk_at(threshold2)
                xs[k].append(v)
                ys[k].append(metric)
                if metric <= best:
                    stop += 1
                else:
                    stop = 0
                    best = metric
                    threshold = threshold2.copy()

    best = qwk_at(threshold)
    threshold = [np.round(t, 3) for t in threshold]
    return best, threshold, xs, ys

# Tune the cut points on the out-of-fold predictions; each search direction
# gives up after 500 consecutive non-improving steps.
best, thresholds, xs, ys = find_thresholds(train.score.values, oof, steps=500)
print('Best thresholds are:', thresholds)
print('achieve Overall CV QWK score =', best)

In [None]:
# Display Thresholds
# One figure per threshold: QWK for every value tried during the search,
# zoomed to +/- diff around the starting value, with the tuned threshold
# drawn as a dashed line.
diff = 0.5
for k in range(5):
    tried = np.array(xs[k])
    scores = np.array(ys[k])
    m = k + 1.5  # starting value of threshold k
    left, right = m - diff, m + diff

    plt.figure(figsize=(10, 3))
    plt.scatter(tried, scores, s=3)
    plt.xlim((left, right))

    # Scale the y axis to the scores that fall inside the visible x window.
    i = np.where((tried > left) & (tried < right))[0]
    visible = scores[i]
    mn = np.min(visible)
    mx = np.max(visible)
    plt.ylim((mn, mx))

    plt.plot([thresholds[k], thresholds[k]], [mn, mx], '--', color='black', label='optimal threshold')
    plt.title(f"Threshold between {k + 1} and {k + 2}", size=16)
    plt.xlabel('Threshold value', size=10)
    plt.ylabel('QWK CV score', size=10)
    plt.legend()
    plt.show()

# Create Submission CSV

In [None]:
# Create Submission
# Average the per-fold predictions into a single value per test essay.
test_preds = test_preds.mean(axis=1)
print('Test preds shape:', test_preds.shape)
# Bin the averaged predictions into scores 1-6 using the tuned thresholds.
bin_edges = [-np.inf] + thresholds + [np.inf]
test_preds_pp = pd.cut(test_preds, bin_edges, labels=[1, 2, 3, 4, 5, 6]).astype('int32')

In [None]:
# Start from the sample submission file and overwrite its "score" column with
# the thresholded test predictions.
sub = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
sub["score"] = test_preds_pp
sub.score = sub.score.astype('int32')  # store the scores as 32-bit integers
sub.to_csv("submission.csv", index=False)
print("Submission shape", sub.shape)
sub.head()