In [None]:
# Mount the Colab `drive` at /content/drive so the dataset folder used by
# the following cells is reachable from the notebook's file system.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/dataset/feedback_effectiveness/

In [None]:
!pip install transformers
!pip install tokenizers
!pip install sentencepiece
!pip install protobuf

In [None]:
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, CosineAnnealingWarmRestarts

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoConfig, AutoModel

In [None]:
# CONFIG
class CFG:
    """Central run configuration: file locations, hyper-parameters, run flags."""

    # ---- input data -------------------------------------------------------
    path_to_train_csv = './train_stratified.csv'
    path_to_validation_csv = './validation_stratified.csv'

    # ---- backbone ---------------------------------------------------------
    model_path = 'microsoft/deberta-v3-xsmall'

    # ---- output locations ('/' in the model id is replaced with '-') ------
    save_file = './logs/' + model_path.replace("/", "-")
    checkpoint_file = './checkpoints/' + model_path.replace("/", "-")
    # Checkpoint that is loaded when `keep_training` is set.
    load_checkpoint_file = checkpoint_file + '_epoch_11_batchsize_5'

    # ---- optimisation -----------------------------------------------------
    learning_rate = 3e-6
    weight_decay = 1e-3
    eta_min = 1e-6        # lower bound handed to the cosine LR scheduler
    max_norm = 1          # gradient-clipping threshold
    epochs = 5
    train_batch_size = 4
    val_batch_size = 6

    # ---- model head / tokenisation ----------------------------------------
    dropout = 0.5
    layer_size = 512      # width of the hidden layer in the classification head
    max_len = 2500        # tokenizer truncation length

    # ---- runtime flags ----------------------------------------------------
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    keep_training = True              # resume from `load_checkpoint_file`
    freeze = False                    # freeze the backbone weights
    restart_opt_and_scheduler = True  # ignore optimizer/scheduler state on resume

In [None]:
# Read the train / validation tables from the paths configured in CFG.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in (CFG.path_to_train_csv, CFG.path_to_validation_csv)
)

def encode_target(df):
    """Replace the string labels in ``discourse_effectiveness`` with class ids.

    Mapping: ``'Ineffective' -> 0``, ``'Adequate' -> 1``, ``'Effective' -> 2``.
    The column is overwritten in place and the same frame is returned.
    Missing labels stay missing.

    Args:
        df: DataFrame with a ``discourse_effectiveness`` column.

    Returns:
        The same DataFrame object, with the column encoded.

    Raises:
        ValueError: if the column contains a non-null value that is not one
            of the three known labels (this also catches calling the function
            twice on the same frame, which would otherwise silently turn
            every label into NaN). The frame is left untouched in that case.
    """
    label_to_id = {'Ineffective': 0, 'Adequate': 1, 'Effective': 2}

    raw = df['discourse_effectiveness']
    encoded = raw.map(label_to_id)

    # A value that was present but did not map is a data error, not a
    # missing label; fail before modifying the frame.
    unmapped = raw[raw.notna() & encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            'Unexpected discourse_effectiveness value(s): '
            f'{unmapped.unique().tolist()}; expected one of {list(label_to_id)}'
        )

    df['discourse_effectiveness'] = encoded
    return df

# Encode the target column of both splits (train first, then validation).
df_train, df_val = encode_target(df_train), encode_target(df_val)

In [None]:
def add_whole_text(df, sep=''):
    """Add a ``whole_text`` column containing each row's complete essay.

    For every row, ``whole_text`` is the concatenation (in frame order) of the
    ``discourse_text`` of all rows that share the row's ``essay_id``.

    The essay text is built once per essay with a single group-by instead of
    filtering the whole frame once per row, which made the original
    quadratic in the number of rows.

    Args:
        df: DataFrame with ``essay_id`` and ``discourse_text`` columns.
        sep: string placed between consecutive discourse texts. Defaults to
            ``''`` (plain concatenation), matching the previous behaviour.

    Returns:
        The same DataFrame object, with the ``whole_text`` column added
        (the frame is modified in place).
    """
    df['whole_text'] = (
        df.groupby('essay_id', sort=False)['discourse_text']
          # One joined string per essay, broadcast back to each of its rows.
          .transform(lambda texts: sep.join(texts))
    )
    return df


# Attach the full essay text to every row of both splits.
df_train, df_val = add_whole_text(df_train), add_whole_text(df_val)

In [None]:
# Columns kept as model inputs, and the (already encoded) target column.
x_cols = [
    'discourse_id',
    'essay_id',
    'discourse_text',
    'discourse_type',
    'whole_text',
]
y_col = 'discourse_effectiveness'

# Feature frame / label series for each split.
X_train, y_train = df_train[x_cols], df_train[y_col]
X_val, y_val = df_val[x_cols], df_val[y_col]

In [None]:
# MODEL
class Model(nn.Module):
    """Pretrained transformer encoder with a mean-pooling classification head.

    The encoder's last hidden states are averaged over the real (non-padding)
    tokens, then passed through ``fc1`` + tanh, dropout and ``fc2`` to produce
    ``num_labels`` logits. Head width and dropout rate are read from
    ``CFG.layer_size`` and ``CFG.dropout``.

    Args:
        model_name: identifier or path handed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes.
        freeze: when True, every encoder parameter gets ``requires_grad=False``.
        reinit_n_layers: number of final encoder layers whose weights are
            re-initialised after loading (0 disables re-initialisation).
    """

    def __init__(self, model_name, num_labels=3, freeze=True, reinit_n_layers=0):
        super().__init__()
        self.model_name = model_name
        self.model = AutoModel.from_pretrained(model_name)
        # Use the config of the backbone that was actually loaded, so the head
        # always matches `model_name` (previously read from CFG.model_path,
        # which silently ignored the constructor argument).
        self.config = self.model.config

        if freeze:
            for param in self.model.parameters():
                param.requires_grad = False

        if reinit_n_layers > 0:
            self._reinit(reinit_n_layers)

        self.dropout = nn.Dropout(CFG.dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, CFG.layer_size)
        self.fc2 = nn.Linear(CFG.layer_size, num_labels)

    def _reinit(self, num_layers):
        """Re-initialise the last ``num_layers`` entries of ``encoder.layer``."""
        for i in range(1, num_layers + 1):
            self.model.encoder.layer[-i].apply(self._reinit_weights)

    def _reinit_weights(self, module):
        """Reset one sub-module: normal init for Linear, identity for LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute class logits for a tokenised batch.

        Args:
            input_ids: token ids; assumed (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding; same shape as
                ``input_ids``.
            token_type_ids: optional segment ids forwarded to the encoder.

        Returns:
            ``{'logits': tensor}`` with one row of ``num_labels`` scores per
            batch element.
        """
        # Only the last hidden state is needed, so the per-layer hidden
        # states are no longer requested (they were kept in memory unused).
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Pool in float32 so that summing a long sequence cannot overflow
        # when the encoder runs in half precision.
        last_hidden = outputs[0].float()

        # Masked mean: padding positions must not contribute, otherwise the
        # representation of a sample depends on how much padding its batch
        # happens to need.
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        pooled = (last_hidden * mask).sum(dim=1) / token_count

        out = torch.tanh(self.fc1(pooled))
        logits = self.fc2(self.dropout(out))

        return {'logits': logits}

# DATASET
class ArgumentsDataset(Dataset):
    """Pairs each discourse element with its essay text (and optional label).

    Every item is the string ``discourse_text + '[SEP]' + whole_text``; when
    labels were supplied the item is a ``(text, label)`` tuple instead.

    Args:
        X: mapping/DataFrame providing ``'discourse_text'`` and
            ``'whole_text'`` sequences of equal length.
        y: sequence of labels aligned with ``X`` by position, or ``None`` for
            unlabeled data.
    """

    def __init__(self, X, y):
        # Materialise plain lists so that `idx` is always positional. Indexing
        # a pandas Series with `[idx]` is label-based and raises KeyError (or
        # returns the wrong row) as soon as the frame's index is not 0..n-1,
        # e.g. after filtering or a random split.
        self.inputs = list(X['discourse_text'])
        self.whole_text = list(X['whole_text'])
        self.label = None if y is None else list(y)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_ = self.inputs[idx] + '[SEP]' + self.whole_text[idx]
        if self.label is None:
            return input_

        label = self.label[idx]
        return input_, label


def collate_fn(batch):
    """Turn a list of ``(text, label)`` pairs into model-ready tensors.

    The texts are tokenised together with the module-level ``tokenizer``
    (padded to the longest text in the batch, truncated to ``CFG.max_len``).

    Returns:
        ``(encoded_inputs, labels)`` where ``labels`` is a ``LongTensor``.
    """
    texts = [text for text, _ in batch]
    targets = [target for _, target in batch]

    encoded = tokenizer(
        texts,
        max_length=CFG.max_len,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    return encoded, torch.LongTensor(targets)

# --------- MAIN PROGRAM -----------
print(f'using {CFG.device} . . .')

tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

train_dataset = ArgumentsDataset(X_train, y_train)
validation_dataset = ArgumentsDataset(X_val, y_val)

# Shuffle the training data every epoch. Without shuffling the batches are
# always drawn in file order, so each batch tends to contain rows that are
# neighbours in the CSV (possibly from the same essay) and every epoch
# replays the identical batch sequence.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CFG.train_batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
# Validation order does not influence the summed loss; keep it deterministic.
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=CFG.val_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

# TRAINING
model = Model(CFG.model_path, freeze=CFG.freeze)
model.to(CFG.device)

optimizer = AdamW(
    model.parameters(),
    lr=CFG.learning_rate,
    weight_decay=CFG.weight_decay
)

# The scheduler is stepped once per batch, so the cosine half-period must be
# the total number of optimiser steps: epochs * batches-per-epoch.
# The previous expression `epochs * (n // bs + n % bs != 0)` parsed as
# `epochs * ((n // bs + n % bs) != 0)`, i.e. `epochs * True`, which made
# T_max equal to CFG.epochs and let the learning rate oscillate every few
# batches instead of decaying over the whole run.
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=CFG.epochs * len(train_dataloader),
    eta_min=CFG.eta_min
)

# Optionally resume: restore the model weights (and, unless
# CFG.restart_opt_and_scheduler is set, the optimizer/scheduler state) and
# continue the epoch numbering after the checkpoint's epoch.
if CFG.keep_training:
    print(f'Resuming training from {CFG.load_checkpoint_file} . . .')
    # map_location lets a checkpoint written on a GPU runtime be restored on
    # whatever device this session uses (e.g. a CPU-only runtime).
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(CFG.load_checkpoint_file, map_location=CFG.device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if not CFG.restart_opt_and_scheduler:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    checkpoint_epoch = checkpoint['epoch'] + 1
else:
    print(f'Training from scratch . . .')
    checkpoint_epoch = 0


# TRAINING LOOP
# Mixed precision is only meaningful on CUDA; on CPU the scaler and autocast
# are created disabled so the same loop runs unchanged (and without warnings).
use_amp = CFG.device == 'cuda'
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
loss_fn = nn.CrossEntropyLoss()                  # mean loss, used for backprop
loss_fn2 = nn.CrossEntropyLoss(reduction='sum')  # summed loss, used for logging


def _batch_to_device(batch):
    """Unpack a collated batch and move its tensors to CFG.device."""
    inputs, labels = batch
    return (
        inputs['input_ids'].to(CFG.device),
        inputs['attention_mask'].to(CFG.device),
        inputs['token_type_ids'].to(CFG.device),
        labels.to(CFG.device),
    )


for i in range(CFG.epochs):
    epoch = i + checkpoint_epoch  # global epoch number across resumed runs

    # TRAIN STEP
    model.train()
    log_loss_train = torch.tensor(0.0, requires_grad=False).to(CFG.device)
    for batch in tqdm(train_dataloader):
        input_ids, attention_mask, token_type_ids, labels = _batch_to_device(batch)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            # Call the module (not .forward) so registered hooks are honoured.
            logits = model(input_ids, attention_mask, token_type_ids)['logits']
            loss = loss_fn(logits, labels)
            # Per-sample sum for the epoch average; detached so that no
            # second autograd graph is kept alive.
            log_loss_train += loss_fn2(logits.detach(), labels)

        scaler.scale(loss).backward()

        # Gradients are still multiplied by the loss scale here; unscale
        # them first, otherwise max_norm is compared against scaled values
        # and the clipping threshold is effectively meaningless.
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(), max_norm=CFG.max_norm)

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # The optimizer step was skipped (inf/nan gradients) only when the
        # scale was reduced; a grown scale is a normal, successful step.
        skip_lr_sched = scale_before > scaler.get_scale()

        if not skip_lr_sched:
            scheduler.step()

        torch.cuda.empty_cache()
    print(f'Epoch {epoch}: train loss - {log_loss_train / len(train_dataset)}')

    # VALIDATION STEP
    log_loss_val = torch.tensor(0.0, requires_grad=False).to(CFG.device)
    model.eval()
    with torch.no_grad():
        for batch in tqdm(validation_dataloader):
            input_ids, attention_mask, token_type_ids, labels = _batch_to_device(batch)

            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(input_ids, attention_mask, token_type_ids)['logits']
                log_loss_val += loss_fn2(logits, labels)

            torch.cuda.empty_cache()
        print(f'Epoch {epoch}: val loss - {log_loss_val / len(validation_dataset)}')

    # One checkpoint per epoch: weights, optimizer/scheduler state and the
    # epoch-average losses.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_loss': log_loss_train / len(train_dataset),
        'val_loss': log_loss_val / len(validation_dataset)
    }, f'{CFG.checkpoint_file}_epoch_{epoch}_batchsize_{CFG.train_batch_size}')
    print(f'Saved checkpoint for epoch {epoch}.')

print(f'Training finished.')