In [1]:
!pip install transformers datasets



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import cohen_kappa_score
import os
import random
import torch
import numpy as np
import wandb

# Start Weights & Biases in 'disabled' mode.
# NOTE(review): presumably this stops the Trainer's W&B integration from
# asking for an API key / uploading runs — confirm this is the intent.
wandb.init(mode='disabled')



In [3]:
# Single seed shared by every random number generator seeded in this cell.
TORCH_SEED = 52


def seed_everything(TORCH_SEED):
    """Seed Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        TORCH_SEED: integer seed applied to every generator. The parameter
            keeps its original name (it shadows the module-level constant)
            so existing keyword callers continue to work.
    """
    random.seed(TORCH_SEED)
    # Only affects interpreters started after this point; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(TORCH_SEED)
    np.random.seed(TORCH_SEED)
    torch.manual_seed(TORCH_SEED)
    torch.cuda.manual_seed_all(TORCH_SEED)
    # Seeding alone does not make cuDNN kernels reproducible: request
    # deterministic algorithms and disable the auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Use the named constant rather than repeating the literal value.
seed_everything(TORCH_SEED)

In [4]:
# Read the competition's training and test CSV files, then preview the first
# rows of both frames in a single display call.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
    )
)
display(df.head(), test_df.head())

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [5]:
# Column dtypes, non-null counts and memory usage of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [6]:
# Distinct raw score values present in the training data.
df['score'].unique()

array([3, 4, 2, 1, 5, 6])

In [7]:
# Fit the encoder on the raw scores, then overwrite the column with the
# encoded class ids. `le` is kept so the predictions can be mapped back to
# the original score values when the submission is built.
le = LabelEncoder().fit(df['score'])
df['score'] = le.transform(df['score'])
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,2
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,3
3,001bdc0,"We all heard about Venus, the planet without a...",3
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",2


In [8]:
# Rename the columns to 'text' / 'label' and index both frames by essay_id.
df = df.rename(columns={'full_text': 'text', 'score': 'label'}).set_index('essay_id')
test_df = test_df.rename(columns={'full_text': 'text'}).set_index('essay_id')

In [9]:
# 80/20 train/validation split, stratified on the encoded label. The split
# reuses the notebook-wide TORCH_SEED constant instead of repeating the
# literal seed value.
train_split, val_split = train_test_split(
    df,
    test_size=0.2,
    random_state=TORCH_SEED,
    stratify=df['label'],
)

In [10]:
from datasets import Dataset

# Wrap each pandas frame in a `datasets.Dataset`, in train / val / test order.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_split, val_split, test_df)
)

In [11]:
# Hub identifier of the base checkpoint.
# NOTE(review): no later cell visible in this notebook reads `model_name`;
# the tokenizer and the model are both loaded from the local path
# '/kaggle/input/google-bert-3' instead. Confirm the two refer to the same
# checkpoint.
model_name = 'google-bert/bert-base-uncased'

In [12]:
from transformers import AutoTokenizer

# The tokenizer is loaded from a local Kaggle input directory rather than
# from the Hub.
tokenizer_dir = '/kaggle/input/google-bert-3'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [13]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Args:
        examples: mapping with a "text" entry holding the texts to tokenize.

    Returns:
        Whatever the tokenizer returns for those texts, with truncation
        enabled and no padding requested here.
    """
    essays = examples["text"]
    encoded = tokenizer(essays, truncation=True)
    return encoded

In [14]:
# Apply the batched tokenization step to the train, validation and test
# datasets, in that order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (dataset_train, dataset_val, dataset_test)
)

Map:   0%|          | 0/13845 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below. The tokenization step only truncates,
# so padding of each batch is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2024-04-26 10:57:59.092735: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 10:57:59.092876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 10:57:59.226946: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
import numpy as np

def compute_metrics(eval_pred):
    """Score predictions with quadratic weighted Cohen's kappa.

    Args:
        eval_pred: pair that unpacks to (predictions, labels); the
            predictions object must support ``.argmax(-1)``.

    Returns:
        A dict with the single key 'quadratic weighted kappa'.
    """
    raw_predictions, gold_labels = eval_pred
    # The predicted class is the index of the largest value on the last axis.
    predicted_classes = raw_predictions.argmax(-1)
    kappa = cohen_kappa_score(gold_labels, predicted_classes, weights='quadratic')
    return {'quadratic weighted kappa': kappa}

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell no longer fails on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The size of the classification head follows the classes seen by the fitted
# label encoder instead of a hard-coded count.
model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/input/google-bert-3',
    num_labels=len(le.classes_),
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bert-3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Training configuration: three epochs, with evaluation and a checkpoint at
# the end of every epoch. weight_decay (1e-3) and warmup_steps (500) are
# commented out in the original cell, so the library defaults apply to both.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Bundle the model, the tokenized splits, the collator and the metric
# callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# Kept as the last expression so the returned TrainOutput is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Quadratic weighted kappa
1,1.137,0.971359,0.709694
2,0.8707,0.949012,0.750086
3,0.7952,0.897613,0.766886


TrainOutput(global_step=2598, training_loss=0.9105858630267724, metrics={'train_runtime': 2405.9418, 'train_samples_per_second': 17.264, 'train_steps_per_second': 1.08, 'total_flos': 1.0926670984284672e+16, 'train_loss': 0.9105858630267724, 'epoch': 3.0})

In [19]:
# Load the sample submission file; its 'score' column is overwritten with the
# model's predictions in the next cell.
submission = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')
submission.head()

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4


In [20]:
# Predict class ids for the test essays. tokenized_test was built from
# test_df without reordering, so prediction i is taken to belong to
# test_df.index[i].
predictions = trainer.predict(tokenized_test)
predicted_class_id = predictions[0].argmax(axis=1).tolist()

# Decode the class ids back to the original score values and key them by
# essay_id, then fill the submission by essay_id instead of by row position.
# This keeps the scores correct even if sample_submission.csv and test.csv
# list the essays in a different order.
predicted_scores = pd.Series(
    le.inverse_transform(predicted_class_id),
    index=test_df.index,
)
submission["score"] = submission["essay_id"].map(predicted_scores)
assert submission["score"].notna().all(), (
    "sample_submission.csv contains essay_ids that are missing from test.csv"
)
submission.head()

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,4


In [21]:
# Write the submission file; index=False keeps the DataFrame's row index out
# of the CSV.
submission.to_csv("submission.csv", index=False)