In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

In [6]:
COMPETITION_DIR = '/kaggle/input/feedback-prize-effectiveness'


def load_essay_texts(essay_ids, essay_dir):
    """Return the essay text for every id in ``essay_ids``.

    Args:
        essay_ids: pandas Series of essay ids; ids may repeat across rows.
        essay_dir: directory that holds one ``<essay_id>.txt`` file per essay.

    Returns:
        A Series aligned with ``essay_ids`` holding the content of the
        matching text file for each row.
    """
    def read_essay(essay_id):
        # The context manager closes the handle; a bare open(...).read()
        # leaves that to the garbage collector.
        with open(f'{essay_dir}/{essay_id}.txt') as handle:
            return handle.read()

    # Read each distinct file once, then broadcast to all rows sharing the id.
    texts = {essay_id: read_essay(essay_id) for essay_id in essay_ids.unique()}
    return essay_ids.map(texts)


train_data = pd.read_csv(f'{COMPETITION_DIR}/train.csv')
test_data = pd.read_csv(f'{COMPETITION_DIR}/test.csv')

train_data['text'] = load_essay_texts(train_data['essay_id'], f'{COMPETITION_DIR}/train')
test_data['text'] = load_essay_texts(test_data['essay_id'], f'{COMPETITION_DIR}/test')
train_data.head(2)

In [7]:
# One encoder per column, so each fitted mapping stays available afterwards
# instead of being overwritten by the next fit.
effectiveness_encoder = LabelEncoder()
train_data['discourse_effectiveness'] = effectiveness_encoder.fit_transform(train_data['discourse_effectiveness'])

# Fit the discourse-type mapping on the training data only and reuse it for
# the test data. Refitting on the test frame would assign different integer
# codes whenever the test set does not contain every category seen in training.
# transform() raises ValueError if the test data holds an unseen category.
discourse_type_encoder = LabelEncoder()
train_data['discourse_type'] = discourse_type_encoder.fit_transform(train_data['discourse_type'])
test_data['discourse_type'] = discourse_type_encoder.transform(test_data['discourse_type'])

# Backward-compatible alias for code that still refers to `encoder`.
encoder = discourse_type_encoder

In [8]:
class EssayDataset(Dataset):
    """Dataset that tokenizes "<discourse> <sep> <essay>" for each row of ``df``.

    Args:
        df: frame with 'discourse_type', 'discourse_text' and 'text' columns;
            a 'discourse_effectiveness' column is optional and, when present,
            is returned as the sample's target.
        tokenizer: object exposing ``sep_token`` and ``encode_plus``.
        max_len: length every sample is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Pull the columns out as arrays once so __getitem__ indexes by position.
        self.discourse_type = df['discourse_type'].values
        self.discourse = df['discourse_text'].values
        self.essay = df['text'].values
        if 'discourse_effectiveness' in self.df:
            self.target = df['discourse_effectiveness'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Discourse text first, then the separator token, then the whole essay.
        pieces = (self.discourse[idx], self.tokenizer.sep_token, self.essay[idx])
        encoded = self.tokenizer.encode_plus(
            " ".join(pieces),
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        sample = {
            "ids": encoded['input_ids'].squeeze(0),
            "mask": encoded['attention_mask'].squeeze(0),
        }
        if 'discourse_effectiveness' in self.df:
            sample["target"] = self.target[idx]
        sample["dense_feature"] = self.discourse_type[idx]
        return sample

In [9]:
# Hugging Face checkpoint name; used here for the tokenizer and passed to
# FeedBackModel when the model is built.
model_name = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer  # last expression of the cell, so the notebook displays its repr

In [10]:
max_len = 512

# Split by essay rather than by row. Rows that share an essay_id carry the
# identical full essay text, so a row-level split would let the same essay
# appear in both training and validation and make val_loss (which drives
# checkpoint selection) look better than it is.
train_essay_ids, valid_essay_ids = train_test_split(
    train_data['essay_id'].unique(), test_size=0.2, random_state=42
)
train_data_ = train_data[train_data['essay_id'].isin(train_essay_ids)]
valid_data_ = train_data[train_data['essay_id'].isin(valid_essay_ids)]

train_dataset = EssayDataset(train_data_, tokenizer, max_len)
valid_dataset = EssayDataset(valid_data_, tokenizer, max_len)
test_dataset = EssayDataset(test_data, tokenizer, max_len)

In [11]:
class FeedBackModel(nn.Module):
    """Sequence classifier: backbone logits plus one dense feature -> 3 classes.

    The pretrained backbone is loaded with a 10-way classification head. Its
    10 logits pass through dropout, are concatenated with the scalar
    ``dense_feature`` of each sample, and a final linear layer maps the
    resulting 11 values to 3 output logits.

    Args:
        model_name: checkpoint name or path handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
    """

    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        self.deberta = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10, output_attentions=False, output_hidden_states=False)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(10+1, 3)

    def forward(self, batch):
        """Return logits of shape (batch, 3).

        Args:
            batch: mapping with 'ids', 'mask' and 'dense_feature' tensors.
        """
        input_ids, attention_masks, dense_feature = batch['ids'], batch['mask'], batch['dense_feature']
        out = self.deberta(input_ids, attention_mask=attention_masks, output_hidden_states=False)
        output = self.dropout(out.logits)
        # The dense feature arrives as one value per sample (an integer code
        # when it comes from a label-encoded column); make it a column in the
        # logits' dtype so the concatenation never mixes dtypes.
        dense_feature = dense_feature.reshape((-1, 1)).to(output.dtype)
        # Concatenate the dropped-out logits. Using out.logits here would
        # silently bypass the dropout layer.
        output = torch.cat([output, dense_feature], dim=-1)
        return self.fc(output)

In [12]:
class Classifier(pl.LightningModule):
    """Lightning wrapper that trains ``model`` with cross-entropy loss.

    Args:
        hparams: dict providing "batch_size", "lr", "weight_decay" and
            "total_steps".
        model: module called with a batch dict and returning class logits.

    The dataloader hooks read the module-level ``train_dataset``,
    ``valid_dataset`` and ``test_dataset`` objects.
    """

    def __init__(self, hparams, model):
        super(Classifier, self).__init__()
        self.save_hyperparameters(ignore=['model'])

        self.model = model
        self.batch_size = hparams["batch_size"]
        self.lr = hparams["lr"]
        self.wd = hparams['weight_decay']
        self.steps = hparams['total_steps']

    def forward(self, batch):
        return self.model(batch)

    def configure_optimizers(self):
        """AdamW with a cosine schedule; the first 10% of steps are warmup."""
        optimizer = optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=self.wd)
        warmup_steps = int(self.steps * 0.1)
        cosine = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=self.steps,
        )
        # The scheduler is stepped after every optimizer step, not per epoch.
        scheduler_config = {"scheduler": cosine, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler_config]

    def _loss_and_accuracy(self, batch):
        """Run the model on ``batch`` and return (cross-entropy loss, accuracy)."""
        logits = self.forward(batch)
        predicted = logits.argmax(dim=1).flatten()
        labels = batch['target'].flatten()
        loss = F.cross_entropy(logits, labels)
        accuracy = (predicted == labels).sum() / len(labels)
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, on_epoch=True, on_step=False)
        self.log("train_acc", accuracy, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        self.log("val_loss", loss)
        self.log("val_acc", accuracy)

    def _build_loader(self, dataset, shuffle):
        """Create a DataLoader over ``dataset`` with the shared loader settings."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=2, pin_memory=True)

    def train_dataloader(self):
        return self._build_loader(train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._build_loader(valid_dataset, shuffle=False)

    def test_dataloader(self):
        return self._build_loader(test_dataset, shuffle=False)

In [13]:
pl.seed_everything(42)

# Training configuration consumed by Classifier and the Trainer.
hparams = dict(batch_size=16, lr=2e-5, weight_decay=1e-2, epochs=1)

# A loader built only to count batches per epoch; the total number of
# training steps is what the learning-rate scheduler needs.
loader = DataLoader(train_dataset, batch_size=hparams["batch_size"])
hparams["total_steps"] = hparams["epochs"] * len(loader)

In [14]:
feedback_model = FeedBackModel(model_name)
lightning = Classifier(hparams, feedback_model)

# Checkpoints go to ./ckpts/ under the name "best", ranked by the logged
# val_loss (lower is better).
checkpoint_callback = ModelCheckpoint(
    dirpath="./ckpts/",
    filename='best',
    monitor="val_loss",
    mode='min',
)

# NOTE(review): `gpus=` is accepted only by some pytorch_lightning versions;
# confirm against the installed release before upgrading.
trainer = pl.Trainer(
    max_epochs=hparams["epochs"],
    gpus=1,
    precision=16,
    gradient_clip_val=1.0,
    val_check_interval=0.5,
    callbacks=[checkpoint_callback],
)

In [15]:
# Train the model; Classifier supplies its own train and validation dataloaders.
trainer.fit(lightning)

In [16]:
# Run inference over the test dataloader, asking Lightning to load the 'best' checkpoint.
# NOTE(review): Classifier defines no predict_step, so this presumably relies on
# the LightningModule default that calls forward(batch) — confirm for the installed version.
predictions = trainer.predict(dataloaders=lightning.test_dataloader(), ckpt_path='best')

In [17]:
# `predictions` holds one logits tensor per batch; join them along the batch
# axis, then turn the logits into class probabilities. The cast to float32
# comes first because the Trainer was configured with precision=16.
preds = torch.cat(list(predictions))
preds = preds.type(torch.float32)
preds = F.softmax(preds, dim=1)

# Hand pandas plain NumPy data instead of torch tensors, so the columns are
# ordinary float columns regardless of how pandas coerces tensor objects.
probabilities = preds.cpu().numpy()

sample = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/sample_submission.csv")
# Column i of the model output is label code i. The codes are assumed to map
# to Adequate / Effective / Ineffective in that order — verify against the
# encoder fitted on 'discourse_effectiveness'.
sample['Adequate'] = probabilities[:, 0]
sample['Effective'] = probabilities[:, 1]
sample['Ineffective'] = probabilities[:, 2]
print(sample.head())
sample.to_csv("deberta_submission.csv", index=False)