In [1]:
import numpy as np 
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset



In [2]:
!pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"

Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l- done
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=2ce5256545d69782d96ec5cf5a1a0491514c16cfbaa1735584e104cac048405b
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


# Data loading and Data prep

In [3]:
prompts = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')

In [4]:
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summaries_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [5]:
prompts.head(2)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...


In [6]:
summaries.head(2)

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755


In [7]:
df = prompts.merge(summaries, on = ['prompt_id'],how ='left')
test = prompts_test.merge(summaries_test, on = ['prompt_id'],how ='left')

In [8]:
from autocorrect import Speller

spell = Speller(lang='en')

df['text_correct'] = df['text'].apply(lambda x: "".join([spell(i) for i in x]))
df['question'] = df["prompt_title"] + "\n" + df["prompt_question"] + "\n" + df["prompt_text"]

In [9]:
test['text_correct'] = test['text'].apply(lambda x: "".join([spell(i) for i in x]))
test['question'] = test["prompt_title"] + "\n" + test["prompt_question"] + "\n" + test["prompt_text"]

In [10]:
model_name = "/kaggle/input/deberta-v3-base/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, problem_type="regression")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,text_correct,question
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,1 element of an ideal tragedy is that it shoul...,On Tragedy\nSummarize at least 3 elements of a...
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,The three elements of an ideal tragedy are: H...,On Tragedy\nSummarize at least 3 elements of a...
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,Aristotle states that an ideal tragedy should ...,On Tragedy\nSummarize at least 3 elements of a...
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,One element of an Ideal tragedy is having a co...,On Tragedy\nSummarize at least 3 elements of a...
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,The 3 ideal of tragedy is how complex you need...,On Tragedy\nSummarize at least 3 elements of a...


# Data prep for training

In [12]:
class CustomDataset(Dataset):
    def __init__(self, df, has_labels=True):
        self.df = df
        self.prompt_titles = df["question"].values.tolist()
        self.texts = df["text_correct"].values.tolist()
        self.encoded_examples = tokenizer(
            text=self.prompt_titles,
            text_pair=self.texts,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )

        
        self.has_labels = has_labels
        
        if self.has_labels:
            self.labels_list = df[["content", "wording"]].values.tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = {
            "input_ids": self.encoded_examples["input_ids"][idx],
            "attention_mask": self.encoded_examples["attention_mask"][idx],
            "token_type_ids": self.encoded_examples["token_type_ids"][idx]
        }
        
        if self.has_labels:
            item["labels"] = torch.tensor(self.labels_list[idx])
        
        return item

In [13]:
from sklearn.model_selection import train_test_split

df_train, df_valid = train_test_split(df, test_size=0.2, random_state=2023)

train_dataset = CustomDataset(df_train)
valid_dataset = CustomDataset(df_valid)
test_dataset  = CustomDataset(test, has_labels=False)

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Training

In [16]:
def compute_mcrmse(eval_pred):

    predictions, labels = eval_pred
    squared_errors = np.square(predictions - labels)
    mean_squared_errors = np.mean(squared_errors, axis=0)
    
    rmse = np.sqrt(mean_squared_errors)
    
    mcrmse_value = np.mean(rmse)
    
    content_rmse = rmse[0]
    wording_rmse = rmse[1]
    
    return {
        "mcrmse": mcrmse_value,
        "content_rmse": content_rmse,
        "wording_rmse": wording_rmse
    }

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",             
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=4,    
    learning_rate=1e-5,            
    lr_scheduler_type="linear",      
    warmup_ratio=0.01,               
    num_train_epochs=3,              
    save_strategy="epoch",           
    logging_strategy="epoch",        
    evaluation_strategy="epoch",     
    load_best_model_at_end=True,     
    metric_for_best_model="mcrmse",           
    fp16=False,                      
    report_to='none',
    save_total_limit=1
)

In [18]:
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_mcrmse,
)
trainer.train()

trainer.save_model("best_model")

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mcrmse,Content Rmse,Wording Rmse
1,0.5233,0.324159,0.564862,0.49352,0.636205
2,0.2789,0.307704,0.550864,0.485651,0.616078
3,0.237,0.283475,0.527465,0.454973,0.599957




# Scoring

In [19]:
predictions = trainer.predict(test_dataset)
content_list = predictions.predictions[:, 0].tolist()
wording_list = predictions.predictions[:, 1].tolist()

In [20]:
submission = test[['student_id']]
submission['content'] = content_list
submission['wording'] = wording_list

submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['content'] = content_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['wording'] = wording_list


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.040858,0.049885
1,222222cccccc,0.052598,0.039814
2,111111eeeeee,0.044912,0.051069
3,333333dddddd,0.053708,0.039713


In [21]:
submission.to_csv('submission.csv', index=False)