# Imports and Configurations

In [None]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, cohen_kappa_score
from tokenizers import AddedToken

In [None]:
# Environment setup: expose GPUs 0 and 1 to CUDA and silence every Python warning.
# (Seeding is done separately by seed_everything.)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
warnings.simplefilter('ignore')

In [None]:
# Constants
# VERSION is embedded in every artifact name this notebook writes or reads
# (trainer output dir, saved fold models, out-of-fold CSVs).
VERSION = 1
# Directory prefix holding already-trained fold models. While this is not None
# the training and overall-CV cells are skipped and inference loads the fold
# models from here; set it to None to train from scratch, in which case
# inference loads the freshly saved models from /kaggle/working/ instead.
LOAD_MODEL_FROM = "/kaggle/input/deberta-v3-small-essay-finetuned/"

In [None]:
# Paths and Configuration
class PATHS:
    """Filesystem locations of the competition CSVs and the pretrained backbone."""

    # Competition data: training essays, test essays and the submission template.
    train_path = (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
        'train.csv'
    )
    test_path = (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
        'test.csv'
    )
    sub_path = (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
        'sample_submission.csv'
    )

    # Checkpoint directory passed to the `from_pretrained` calls when training.
    model_path = (
        "/kaggle/input/huggingfacedebertav3variants/"
        "deberta-v3-small"
    )

In [None]:
class CFG:
    """Hyperparameters shared by the cross-validation, training and inference cells."""

    # Reproducibility and cross-validation layout.
    seed = 42
    n_splits = 5

    # Tokenization: essays are truncated to this many tokens.
    max_length = 1024

    # Number of distinct essay scores. NOTE(review): not read by the visible
    # code -- the training loop sets `config.num_labels = 1` itself.
    num_labels = 6

    # Per-device batch sizes handed to TrainingArguments.
    train_batch_size = 4
    eval_batch_size = 8

    # Optimisation schedule.
    train_epochs = 4
    lr = 1e-5
    weight_decay = 0.01
    warmup_ratio = 0.0

In [None]:
# Seed Function
def seed_everything(seed):
    """Seed every random number generator used here and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one: two devices are exposed.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes and may pick different kernels from run to run,
    # which defeats the determinism requested above, so it must stay off.
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, up front, with the configured seed.
seed_everything(CFG.seed)

# Tokenization Class

In [None]:
class Tokenizer:
    """Turn essay DataFrames into tokenized `datasets.Dataset` objects.

    Wraps a tokenizer callable together with the truncation length so the
    same settings are applied to every split.
    """

    def __init__(self, tokenizer, max_length):
        """Store the tokenizer callable and the maximum sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer

    def get_dataset(self, df):
        """Build a Dataset from the `essay_id`, `full_text` and `label` columns of *df*."""
        columns = ('essay_id', 'full_text', 'label')
        return Dataset.from_dict({name: df[name].tolist() for name in columns})

    def tokenize_function(self, example):
        """Tokenize the `full_text` field of *example*, truncating to `max_length`."""
        encode = self.tokenizer
        return encode(
            example['full_text'],
            max_length=self.max_length,
            truncation=True,
        )

    def __call__(self, train_df, valid_df):
        """Return `(tokenized_train, tokenized_valid)` for the two DataFrames."""
        # Build both raw datasets first, then tokenize each in batched mode.
        raw_sets = [self.get_dataset(frame) for frame in (train_df, valid_df)]
        tokenized = [
            raw.map(self.tokenize_function, batched=True) for raw in raw_sets
        ]
        return tuple(tokenized)

# Metric Function

In [None]:
def compute_metric(eval_pred):
    """Compute quadratic weighted kappa for a `(predictions, labels)` pair.

    Predictions are clipped to [0, 5] and rounded before being compared with
    the labels.

    Returns:
        A dict with the single key ``'qwk'``.
    """
    raw_scores, targets = eval_pred
    bounded = raw_scores.clip(0, 5).round()
    return {'qwk': cohen_kappa_score(targets, bounded, weights='quadratic')}

# Load Data and Set Fold

In [None]:
# Read the training essays and derive a float32 label that starts at 0 (score - 1).
data = pd.read_csv(PATHS.train_path)
data['label'] = (data['score'] - 1).astype('float32')

# Stratify on the raw score, then record the validation fold each row belongs to
# in a new "fold" column.
skf = StratifiedKFold(
    n_splits=CFG.n_splits,
    shuffle=True,
    random_state=CFG.seed,
)
fold_splits = skf.split(data, data["score"])
for i, (_, val_index) in enumerate(fold_splits):
    data.loc[val_index, "fold"] = i

# Training Arguments

In [None]:
# One TrainingArguments instance is shared by every fold and by inference.
training_args = TrainingArguments(
    # Output location and logging.
    output_dir=f'output_v{VERSION}',
    report_to='none',
    logging_first_step=True,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint and
    # restoring the one with the best QWK when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='qwk',
    # Batching and run length.
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    num_train_epochs=CFG.train_epochs,
    # Optimizer and learning-rate schedule.
    optim='adamw_torch',
    learning_rate=CFG.lr,
    weight_decay=CFG.weight_decay,
    lr_scheduler_type='linear',
    warmup_ratio=CFG.warmup_ratio,
    # Mixed-precision training.
    fp16=True,
)

# K Fold Training

In [None]:
# Train one model per fold. Skipped entirely when LOAD_MODEL_FROM points at
# previously trained fold models.
if LOAD_MODEL_FROM is None:
    # The tokenizer setup is identical for every fold, so build it once.
    tokenizer = AutoTokenizer.from_pretrained(PATHS.model_path)
    # Register newline and double-space as explicit tokens -- presumably so
    # paragraph breaks and spacing survive tokenization (confirm with author).
    tokenizer.add_tokens([AddedToken("\n", normalized=False), AddedToken(" " * 2, normalized=False)])
    custom_tokenizer = Tokenizer(tokenizer, CFG.max_length)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    score_labels = list(range(1, 7))

    for fold in range(CFG.n_splits):
        # Rows assigned to `fold` are held out for validation; the rest train.
        train_df = data[data['fold'] != fold]
        valid_df = data[data['fold'] == fold].copy()
        tokenized_train, tokenized_valid = custom_tokenizer(train_df, valid_df)

        # Config and model are rebuilt from the pretrained checkpoint for every
        # fold so that no state carries over between folds.
        config = AutoConfig.from_pretrained(PATHS.model_path)
        config.attention_probs_dropout_prob = 0.0  # dropout disabled
        config.hidden_dropout_prob = 0.0
        config.num_labels = 1  # single output: the score is predicted as one number

        model = AutoModelForSequenceClassification.from_pretrained(PATHS.model_path, config=config)
        # Account for the tokens added to the tokenizer above.
        model.resize_token_embeddings(len(tokenizer))

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metric,
        )

        trainer.train()

        # Confusion Matrix
        y_true = valid_df['score'].values
        raw_predictions = trainer.predict(tokenized_valid).predictions
        # Clip to the valid label range [0, 5] before rounding, exactly as
        # compute_metric and the submission do. Without the clip, out-of-range
        # outputs become scores such as 0 or 7, which the confusion matrix
        # silently drops and which end up in the saved out-of-fold file.
        predictions = raw_predictions.clip(0, 5).round() + 1

        cm = confusion_matrix(y_true, predictions, labels=score_labels)
        ConfusionMatrixDisplay(cm, display_labels=score_labels).plot()
        plt.show()

        # Persist model and tokenizer side by side so inference can reload both.
        fold_dir = f'/kaggle/working/deberta-v3-small_AES2_fold_{fold}_v{VERSION}'
        trainer.save_model(fold_dir)
        tokenizer.save_pretrained(fold_dir)

        # Save this fold's out-of-fold predictions for the overall CV score.
        valid_df['pred'] = predictions
        valid_df.to_csv(f'/kaggle/working/valid_df_fold_{fold}_v{VERSION}.csv', index=False)

# Overall CV Score

In [None]:
# Only meaningful right after training: the per-fold CSVs are written by the
# training loop.
if LOAD_MODEL_FROM is None:
    # Stitch the per-fold validation predictions into one out-of-fold table.
    dfs = pd.concat(
        [
            pd.read_csv(f'/kaggle/working/valid_df_fold_{k}_v{VERSION}.csv')
            for k in range(CFG.n_splits)
        ]
    )
    dfs.to_csv(f'/kaggle/working/valid_df_v{VERSION}.csv', index=False)
    print('Valid OOF shape:', dfs.shape)

    # Quadratic weighted kappa over all folds, with predictions forced into 1..6.
    oof_pred = dfs.pred.values.clip(1, 6).round()
    oof_qwk = cohen_kappa_score(dfs.score.values, oof_pred, weights='quadratic')
    print('Valid OOF QWK:', oof_qwk)

# Infer Test Data

In [None]:
# Load the competition test set and report its size.
test_df = pd.read_csv(PATHS.test_path)
print('Test shape:', test_df.shape)
# Last expression of the cell: the notebook renders the first rows.
test_df.head()

In [None]:
# Predict the test set with every fold model and average the outputs.
all_predictions = []
# Tokenizer.get_dataset requires a 'label' column; the test set has none, so
# add a placeholder.
test_df['label'] = 0.0

# Fold models come from LOAD_MODEL_FROM when it is set, otherwise from the
# directory the training loop saved them to.
model_root = LOAD_MODEL_FROM if LOAD_MODEL_FROM else '/kaggle/working/'

for fold in range(CFG.n_splits):
    model_dir = f'{model_root}deberta-v3-small_AES2_fold_{fold}_v{VERSION}'

    # Each fold directory carries its own saved tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    custom_tokenizer = Tokenizer(tokenizer, CFG.max_length)
    # Tokenize the test set once per fold. Going through
    # custom_tokenizer(test_df, test_df) would tokenize it twice and throw the
    # second copy away.
    tokenized_test = custom_tokenizer.get_dataset(test_df).map(
        custom_tokenizer.tokenize_function, batched=True
    )

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_test,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
    )

    predictions = trainer.predict(tokenized_test).predictions
    all_predictions.append(predictions)

# Element-wise mean over the fold models.
predictions = np.mean(all_predictions, axis=0)
print('Predictions shape:', predictions.shape)

# Create Submission CSV

In [None]:
# Start from the sample submission and overwrite its score column.
submission = pd.read_csv(PATHS.sub_path)

# Clip the averaged predictions to the label range [0, 5], round to the nearest
# label and shift back to the 1..6 score scale.
final_scores = predictions.clip(0, 5).round() + 1
submission["score"] = final_scores
submission = submission.astype({'score': 'int32'})

submission.to_csv('submission.csv', index=False)
print('Submission shape:', submission.shape)
# Last expression of the cell: the notebook renders the first rows.
submission.head()