In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import cohen_kappa_score
import os
import random
import torch
import numpy as np
import wandb

# Initialise Weights & Biases in 'disabled' mode; this notebook does not
# log anything to W&B (the Trainer below is also created with report_to="none").
wandb.init(mode='disabled')



In [2]:
# Seed applied to every random number generator used in this cell.
TORCH_SEED = 52

def seed_everything(TORCH_SEED):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    TORCH_SEED : int
        Seed applied to Python's ``random`` module, NumPy's legacy global
        generator and PyTorch (CPU and all CUDA devices). It is also
        exported as ``PYTHONHASHSEED``.

    Notes
    -----
    The parameter intentionally keeps the same name as the module-level
    constant so existing keyword callers keep working; inside the function
    it refers to the argument, not the global.
    """
    random.seed(TORCH_SEED)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(TORCH_SEED)
    np.random.seed(TORCH_SEED)
    torch.manual_seed(TORCH_SEED)
    # Safe to call without a GPU: PyTorch ignores it when CUDA is unavailable.
    torch.cuda.manual_seed_all(TORCH_SEED)
    # Ask cuDNN for deterministic kernels and disable its autotuner, which
    # may otherwise pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Use the constant rather than repeating the literal so the two cannot drift.
seed_everything(TORCH_SEED)

In [3]:
# Load the competition's training and test tables, in that order.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
    )
)

In [4]:
# Fit the encoder on the raw scores, then replace the column with the
# encoded class ids. `le` is kept so predictions can be mapped back to the
# original scores when the submission is written.
# NOTE: this overwrites df['score'] in place, so re-running the cell would
# refit the encoder on already-encoded values.
le = LabelEncoder().fit(df['score'])
df['score'] = le.transform(df['score'])
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,2
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,3
3,001bdc0,"We all heard about Venus, the planet without a...",3
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",2


In [5]:
# Rename the columns to 'text' / 'label' and index both frames by essay id.
df = df.rename(columns={'full_text': 'text', 'score': 'label'}).set_index('essay_id')
test_df = test_df.rename(columns={'full_text': 'text'}).set_index('essay_id')

In [6]:
# Stratified 80/20 split on the encoded label. The seed comes from the
# notebook-wide TORCH_SEED constant instead of a repeated literal, so changing
# the seed in one place changes it everywhere.
train_split, val_split = train_test_split(
    df,
    test_size=0.2,
    random_state=TORCH_SEED,
    stratify=df['label'],
)

In [7]:
from datasets import Dataset

# Wrap each pandas frame in a `datasets.Dataset`, preserving the
# train / validation / test order.
dataset_train, dataset_val, dataset_test = map(
    Dataset.from_pandas,
    (train_split, val_split, test_df),
)

In [8]:
# Checkpoint directory that both the tokenizer and the classification model
# are loaded from further down. Presumably a RoBERTa model already trained on
# this task (judging by the path only) — confirm against the dataset.
model_name = '/kaggle/input/roberta-essay-scoring/roberta-trained/checkpoint-2598'

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer stored alongside the checkpoint in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with the global tokenizer.

    Truncation is enabled; padding is not applied here (the data collator
    created later pads each batch).

    Parameters
    ----------
    examples : mapping
        Batch passed in by ``Dataset.map``; only ``examples["text"]`` is read.

    Returns
    -------
    Whatever the tokenizer returns for the batch of texts.
    """
    essay_texts = examples["text"]
    encoded = tokenizer(essay_texts, truncation=True)
    return encoded

In [11]:
# Tokenize the validation and test sets in batches (validation first).
tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (dataset_val, dataset_test)
)

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; it pads batches using the tokenizer,
# which is why preprocess_function does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2024-05-04 17:24:20.828037: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 17:24:20.828135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 17:24:20.946000: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute quadratic weighted kappa for a Trainer evaluation.

    Parameters
    ----------
    eval_pred : pair
        ``(predictions, labels)``; the predicted class is taken as the
        argmax over the last axis of ``predictions``.

    Returns
    -------
    dict
        ``{'quadratic weighted kappa': <score>}``.
    """
    logits, gold_labels = eval_pred
    predicted_classes = logits.argmax(-1)
    qwk = cohen_kappa_score(gold_labels, predicted_classes, weights='quadratic')
    return {'quadratic weighted kappa': qwk}

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the 6-class classifier from the checkpoint. The device is picked at
# run time so the notebook still runs (slowly) on a CPU-only session instead
# of raising on a hard-coded .to('cuda').
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
).to('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
# Inference-only configuration: outputs go to the working directory,
# evaluation runs in batches of 16 per device, and no experiment tracker
# is used.
eval_args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=16,
    report_to="none",
)

# The Trainer is used here purely as a batched prediction harness.
trainer_kwargs = dict(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
# Predicted (encoded) class per test essay: argmax over the last axis of the
# model outputs returned by the Trainer.
preds_test = np.argmax(trainer.predict(tokenized_test).predictions, axis=-1)

In [17]:
# Build the submission from the essay ids that were actually predicted
# (test_df is indexed by essay_id and dataset_test was created from it in the
# same row order) rather than pasting predictions positionally onto
# sample_submission.csv, which silently assumes that file lists the same
# essays in the same order as test.csv.
assert len(preds_test) == len(test_df), "one prediction per test essay expected"

submission = pd.DataFrame({
    "essay_id": test_df.index,
    # Map encoded class ids back to the original score values.
    "score": le.inverse_transform(preds_test),
})
submission.to_csv("submission.csv", index=False)