In [None]:
# Mount Google Drive inside the Colab VM at /content/drive (prompts for authorisation when needed).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/dataset/feedback_effectiveness/

In [None]:
!pip install transformers
!pip install tokenizers
!pip install sentencepiece
!pip install protobuf

In [None]:
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, CosineAnnealingWarmRestarts

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoConfig, AutoModel

import numpy as np

In [None]:
class global_CFG:
    """Notebook-wide settings: input CSV paths, ensemble weights, inference batch size and device."""
    # Relative paths; they resolve against the working directory set by the %cd cell.
    path_to_train_csv = './train_stratified.csv'
    path_to_validation_csv = './validation_stratified.csv'

    # Weights applied to each model's logits when the predictions are blended.
    w_xsmall = 0.14
    w_small = 0.86
    # NOTE(review): w_base is not referenced anywhere in the visible notebook.
    w_base = 0.0

    # Batch size of the DataLoaders used for inference.
    infer_batch_size = 16

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the train and validation splits from the CSV paths configured in global_CFG.
df_train = pd.read_csv(global_CFG.path_to_train_csv)
df_val = pd.read_csv(global_CFG.path_to_validation_csv)

def encode_target(df):
    """Replace the string labels in ``discourse_effectiveness`` with integer class ids.

    The mapping is Ineffective -> 0, Adequate -> 1, Effective -> 2. Any value
    outside the mapping becomes NaN.

    Args:
        df: DataFrame with a ``discourse_effectiveness`` column; modified in place.

    Returns:
        The same DataFrame object, with the column overwritten.
    """
    label_to_id = {'Ineffective': 0, 'Adequate': 1, 'Effective': 2}
    encoded = df['discourse_effectiveness'].map(label_to_id)
    df['discourse_effectiveness'] = encoded
    return df

# Convert the string labels of both splits to integer class ids (the frames are modified in place).
df_train = encode_target(df_train)
df_val = encode_target(df_val)

In [None]:
def add_whole_text(df):
    """Add a ``whole_text`` column holding the concatenated discourse texts of each row's essay.

    For every row, ``whole_text`` is the ``discourse_text`` values of all rows that
    share its ``essay_id``, joined in row order with no separator.

    The texts are joined once per essay and mapped back onto the rows, instead of
    re-filtering the whole frame for every row (which was quadratic in the row count).

    Args:
        df: DataFrame with ``essay_id`` and ``discourse_text`` columns; modified in place.

    Returns:
        The same DataFrame object, with the ``whole_text`` column added.
    """
    # sort=False keeps first-appearance order of essays; rows inside a group keep frame order.
    essay_text = df.groupby('essay_id', sort=False)['discourse_text'].agg(''.join)
    # Rows whose essay_id matches nothing (e.g. NaN) get an empty string, matching the
    # result of joining an empty selection.
    df['whole_text'] = df['essay_id'].map(essay_text).fillna('')
    return df

# Attach the concatenated essay text to every row of both splits.
df_train = add_whole_text(df_train)
df_val = add_whole_text(df_val)

In [None]:
# Columns carried along as model inputs / identifiers, and the name of the target column.
x_cols = ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'whole_text']
y_col = 'discourse_effectiveness'

# Split each frame into its feature columns and its target column.
X_train = df_train[x_cols]
y_train = df_train[y_col]
X_val = df_val[x_cols]
y_val = df_val[y_col]

In [None]:
class ArgumentsDataset(Dataset):
    """Dataset yielding ``discourse_text + '[SEP]' + whole_text`` strings, optionally with labels.

    Args:
        X: Table-like object exposing ``'discourse_text'`` and ``'whole_text'`` columns.
        y: Labels aligned with the rows of ``X``, or ``None`` for unlabeled data.
    """

    def __init__(self, X, y):
        # Samples are requested by position (0 .. len-1), while ``series[idx]`` is a label
        # lookup. Resetting the index makes the two agree even when the frame was
        # filtered, shuffled or split before being handed over.
        self.inputs = self._positional(X['discourse_text'])
        self.whole_text = self._positional(X['whole_text'])
        self.label = self._positional(y)

    @staticmethod
    def _positional(values):
        """Return ``values`` with a fresh 0..n-1 index if it is a pandas Series, else unchanged."""
        if isinstance(values, pd.Series):
            return values.reset_index(drop=True)
        return values

    def __len__(self):
        """Number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the joined text at position ``idx``, plus its label when labels were given."""
        input_ = self.inputs[idx] + '[SEP]' + self.whole_text[idx]
        if self.label is None:
            return input_

        label = self.label[idx]
        return input_, label


# Wrap both splits as datasets; these module-level objects are read by infer_single_model.
train_dataset = ArgumentsDataset(X_train, y_train)
validation_dataset = ArgumentsDataset(X_val, y_val)

In [None]:
class deberta_xsmall_CFG:
    """Settings for the DeBERTa-v3-xsmall ensemble member."""
    # Hugging Face model id used to build both the tokenizer and the encoder.
    model_path = 'microsoft/deberta-v3-xsmall'
    # Checkpoint file read with torch.load; its 'model_state_dict' entry is loaded into the model.
    trained_model_path = 'checkpoints/microsoft-deberta-v3-xsmall_epoch_9_batchsize_5'
    
    # Passed to the tokenizer as max_length (longer inputs are truncated).
    max_len = 2500
    # Dropout probability applied before the final linear layer.
    dropout = 0.5

    # Output width of the first head layer (fc1) and input width of the second (fc2).
    layer_size = 512

class DebertaXSmallModel(nn.Module):
    """Pretrained encoder followed by mean pooling and a two-layer classification head.

    Head sizes and dropout come from ``deberta_xsmall_CFG``.

    Args:
        model_name: Model id or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of logits produced per example.
    """

    def __init__(self, model_name, num_labels=3):
        super().__init__()
        self.model_name = model_name
        self.model = AutoModel.from_pretrained(model_name)
        # Take the config of the encoder that was actually loaded, so fc1's input width
        # always matches it (previously the config came from a hard-coded path,
        # regardless of ``model_name``).
        self.config = self.model.config
        self.dropout = nn.Dropout(deberta_xsmall_CFG.dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, deberta_xsmall_CFG.layer_size)
        self.fc2 = nn.Linear(deberta_xsmall_CFG.layer_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch and return ``{'logits': tensor}`` with ``num_labels`` scores per example."""
        # Only the first output (last hidden state) is used, so the per-layer hidden
        # states are not requested.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        last_hidden = outputs[0]
        # Plain mean over dim 1; attention_mask is not applied here, so padded positions
        # contribute to the average.
        # NOTE(review): presumably the checkpoint was trained with this same pooling -
        # confirm before switching to a masked mean.
        pooled = last_hidden.mean(dim=1)

        hidden = torch.tanh(self.fc1(pooled))
        logits = self.fc2(self.dropout(hidden))

        return {'logits': logits}

In [None]:
class deberta_small_CFG:
    """Settings for the DeBERTa-v3-small ensemble member."""
    # Hugging Face model id used to build both the tokenizer and the encoder.
    model_path = 'microsoft/deberta-v3-small'
    # Checkpoint file read with torch.load; its 'model_state_dict' entry is loaded into the model.
    trained_model_path = 'checkpoints/microsoft-deberta-v3-small_epoch_10_batchsize_5'
    
    # Passed to the tokenizer as max_length (longer inputs are truncated).
    max_len = 1500
    # Dropout probability applied before the final linear layer.
    dropout = 0.5

    # Output width of the first head layer (fc1) and input width of the second (fc2).
    layer_size = 256

class DebertaSmallModel(nn.Module):
    """Pretrained encoder followed by mean pooling and a two-layer classification head.

    Head sizes and dropout come from ``deberta_small_CFG``.

    Args:
        model_name: Model id or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of logits produced per example.
    """

    def __init__(self, model_name, num_labels=3):
        super().__init__()
        self.model_name = model_name
        self.model = AutoModel.from_pretrained(model_name)
        # Take the config of the encoder that was actually loaded, so fc1's input width
        # always matches it (previously the config came from a hard-coded path,
        # regardless of ``model_name``).
        self.config = self.model.config
        self.dropout = nn.Dropout(deberta_small_CFG.dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, deberta_small_CFG.layer_size)
        self.fc2 = nn.Linear(deberta_small_CFG.layer_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch and return ``{'logits': tensor}`` with ``num_labels`` scores per example."""
        # Only the first output (last hidden state) is used, so the per-layer hidden
        # states are not requested.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        last_hidden = outputs[0]
        # Plain mean over dim 1; attention_mask is not applied here, so padded positions
        # contribute to the average.
        # NOTE(review): presumably the checkpoint was trained with this same pooling -
        # confirm before switching to a masked mean.
        pooled = last_hidden.mean(dim=1)

        hidden = torch.tanh(self.fc1(pooled))
        logits = self.fc2(self.dropout(hidden))

        return {'logits': logits}

In [None]:
def eval_dataloader(dataloader, model):
    """Run ``model`` over every batch of ``dataloader`` and collect the logits in a DataFrame.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``inputs`` provides
            ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'`` tensors.
            The labels are not used.
        model: Callable returning a dict with a ``'logits'`` tensor.

    Returns:
        DataFrame with one row per example, in dataloader order, and one column per class.
    """
    # Logit column i belongs to class id i. This notebook encodes the labels as
    # Ineffective=0, Adequate=1, Effective=2, so the names must follow that order.
    # NOTE(review): assumes the checkpoints were trained with the same encoding - confirm.
    cols = ['Ineffective', 'Adequate', 'Effective']

    # Gather per-batch arrays and concatenate once; np.append inside the loop
    # re-copied everything collected so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for inputs, _ in tqdm(dataloader):
            input_ids = inputs['input_ids'].to(global_CFG.device)
            attention_mask = inputs['attention_mask'].to(global_CFG.device)
            token_type_ids = inputs['token_type_ids'].to(global_CFG.device)

            with torch.cuda.amp.autocast():
                # Call the module itself (not .forward) so registered hooks run.
                outputs = model(input_ids, attention_mask, token_type_ids)
                logits = outputs['logits']

            batch_logits.append(logits.detach().cpu().numpy())
            torch.cuda.empty_cache()

    if not batch_logits:
        # Empty dataloader: return an empty frame that still has the class columns.
        return pd.DataFrame(columns=cols)

    return pd.DataFrame(np.concatenate(batch_logits, axis=0), columns=cols)

In [None]:
def infer_single_model(model_cfg, model_lambda):
    """Load one fine-tuned model and compute its logits on the validation and train sets.

    Args:
        model_cfg: Config object exposing ``trained_model_path``, ``model_path`` and ``max_len``.
        model_lambda: Callable that builds the model from ``model_cfg.model_path``.

    Returns:
        Tuple ``(validation_infer_df, train_infer_df)`` of logit DataFrames.
    """
    import warnings

    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(model_cfg.trained_model_path, map_location=global_CFG.device)
    model = model_lambda(model_cfg.model_path)

    # strict=False is kept, but a mismatch is no longer silent: missing keys mean some
    # weights stay at their initial values instead of coming from the checkpoint.
    load_result = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        warnings.warn(
            f'{model_cfg.trained_model_path}: state dict does not fully match the model '
            f'(missing keys: {list(load_result.missing_keys)}, '
            f'unexpected keys: {list(load_result.unexpected_keys)})'
        )

    model.to(device=global_CFG.device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(model_cfg.model_path)
    max_len = model_cfg.max_len

    def make_loader(dataset):
        # shuffle=False keeps predictions in the same row order as the source frames.
        return DataLoader(
            dataset,
            batch_size=global_CFG.infer_batch_size,
            shuffle=False,
            collate_fn=lambda b: collate_fn(b, tokenizer, max_len),
        )

    train_dataloader = make_loader(train_dataset)
    validation_dataloader = make_loader(validation_dataset)

    validation_infer_df = eval_dataloader(validation_dataloader, model)
    train_infer_df = eval_dataloader(train_dataloader, model)
    return validation_infer_df, train_infer_df

def collate_fn(batch, tokenizer, max_len):
    """Turn a list of ``(text, label)`` samples into tokenizer output plus a label tensor.

    Args:
        batch: Iterable of two-element ``(text, label)`` samples.
        tokenizer: Callable applied to the list of texts with padding and truncation enabled.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(encoded, labels)`` where ``encoded`` is whatever the tokenizer returns
        and ``labels`` is a ``torch.LongTensor``.
    """
    pairs = [(text, label) for text, label in batch]
    texts = [text for text, _ in pairs]
    targets = [label for _, label in pairs]

    encoded = tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_len,
    )
    return encoded, torch.LongTensor(targets)

print(f'using {global_CFG.device} . . .')

# Run each model over both splits; every call returns (validation logits, train logits).
validation_infer_df_xsmall, train_infer_df_xsmall = infer_single_model(deberta_xsmall_CFG, DebertaXSmallModel)
validation_infer_df_small, train_infer_df_small   = infer_single_model(deberta_small_CFG, DebertaSmallModel)

# Blend the two models' logits with the fixed weights from global_CFG.
# DataFrame arithmetic aligns on index and column labels, so both frames must share them.
validation_final_df = global_CFG.w_xsmall * validation_infer_df_xsmall + global_CFG.w_small * validation_infer_df_small  
train_final_df      = global_CFG.w_xsmall * train_infer_df_xsmall      + global_CFG.w_small * train_infer_df_small  

In [None]:
# Preview the first rows of the blended logits for each split.
display(validation_final_df.head())
display(train_final_df.head())

In [None]:
# Write the blended logits to CSV in the current working directory, without the index column.
validation_final_df.to_csv('./validation_final_df.csv', index=False)
train_final_df.to_csv('./train_final_df.csv', index=False)