# Book 3 - Modelling and Evaluation (DistilBert Model)

---


In [23]:
# # ! pip install accelerate nvidia-ml-py3
# ! pip install datasets==2.1.0
# ! pip install transformers==4.18.0
# ! pip install sentencepiece==0.1.96
# ! pip install pytorch-lightning==1.6.5
# ! pip install torchmetrics==0.9.2
# ! pip install wandb==0.12.21

In [24]:
import os
# NOTE: the Kaggle template loop that walked '/kaggle/input' was removed: it
# built each path with os.path.join and discarded the result, so it only cost
# a filesystem traversal without any effect.

# Disable tokenizer-level parallelism through the environment variable read by
# the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Kaggle kernel run type; empty string when the variable is not set.
run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

import pandas as pd
pd.set_option("max_colwidth", None)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, get_cosine_schedule_with_warmup, EarlyStoppingCallback, AutoModel
from datasets import Dataset, Value, ClassLabel, Features, load_metric
import math

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint # need to call when using gradient_checkpointing

from sklearn.metrics import log_loss, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split

from scipy.special import softmax

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar,StochasticWeightAveraging
from torchmetrics.functional import f1_score

from torch.utils.data import DataLoader
import torch.nn.functional as F

import spacy
from spacy import displacy

import wandb

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

In [25]:
# Where the competition data lives:
#   0 -> local './data/' folder
#   1 -> Kaggle dataset mount
#   2 -> Google Drive mounted inside Colab
input_location = 1

# Directory where trained weights and validation results are written.
# Defined for every location so later cells never hit a NameError.
TRAINED_DIR = './'

if input_location == 0:
    INPUT_DIR = './data/'
elif input_location == 1:
    INPUT_DIR = '/kaggle/input/feedback-prize-effectiveness/'
elif input_location == 2:
    # Imported lazily because the module only exists inside Colab.
    from google.colab import drive
    drive.mount('/content/drive')
    INPUT_DIR = '/content/drive/MyDrive/Colab Notebooks/Data/'
else:
    raise ValueError(f"Unsupported input_location: {input_location!r} (expected 0, 1 or 2)")

# NOTE: despite their names these hold the loaded DataFrames, not file paths.
data_path = pd.read_csv(INPUT_DIR + 'train.csv')
test_path = pd.read_csv(INPUT_DIR + 'test.csv')



# Check GPU availability

In [26]:
# Use the first CUDA device when one is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

# Log in to Weights & Biases

In [27]:
# Look up the Weights & Biases API key: first from a file on the mounted
# Google Drive (Colab), then from Kaggle's secret store. The key is handed to
# wandb.login() directly instead of being interpolated into a shell command,
# so it is not echoed through the shell.
wandb_key = None
try:
    with open('/content/drive/MyDrive/Colab Notebooks/WandBapi/wandbapi.txt', 'r') as key_file:
        wandb_key = key_file.read().strip()
except OSError:
    # Key file not reachable (e.g. not running on Colab): try Kaggle secrets.
    try:
        from kaggle_secrets import UserSecretsClient
        wandb_key = UserSecretsClient().get_secret("wand_api")
    except Exception as err:
        print(f"Could not read the wandb key from Kaggle secrets: {err}")

if wandb_key:
    try:
        wandb.login(key=wandb_key)
    except Exception as err:
        # Best effort: the notebook can still run with config['wandb'] = False.
        print(f"wandb failed to login... ({err})")
else:
    print("wandb failed to login...")
    


In [28]:
import time

# Timestamp (local time) used to tag runs and output files.
timestr = time.strftime("%Y%m%d-%H%M%S", time.localtime())
print(f'Current datetime: {timestr}')

# Configuration

In [29]:

# Target class names, listed in alphabetical order.
attributes = ["Adequate", "Effective", "Ineffective"]

# Experiment configuration. The key order of 'param' is kept stable because
# str(config['param']) is used when naming runs and saved weights.
distilbert_config = {
    'name': 'distilbert',
    # Local copy of the pretrained weights (hub id: 'distilbert-base-uncased').
    'model_name': '../input/distilbertbaseuncased/distilbert-base-uncased',
    'existing_tuned_model_name': '',  # e.g. 'kitkeat/distilbert-based-uncased-argumentativewriting'
    'newly_tuned_model_path': '../input/distilbert-trained-model-20220820/20220820-043647.pth',
    'wandb': False,  # log to Weights & Biases when True
    'param': {
        'n_labels': 3,
        'batch_size': 64,
        'lr': 8e-4,
        'warmup': 0,           # fraction of the scheduler steps used for warm-up
        'weight_decay': 0.01,
        'n_epochs': 5,
        'n_freeze': 5,         # number of leading transformer layers to freeze
        'p_dropout': 0,
        'scheduler': False,    # enable the cosine schedule with warm-up
        'precision': 16,       # passed to the Lightning Trainer
        'sample_mode': True,   # train on a small random sample of the data
        'sample_size': 100,
        'swa': False,          # enable stochastic weight averaging
        'swa_lrs': 1e-2,
    },
}

seed_everything(91, workers=True)

# Utility

In [30]:
# Freeze the hidden layer within the pretrained model
def freeze(module):
    """
    Disable gradient tracking for every parameter of ``module``.

    The parameters are modified in place; nothing is returned.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    List the names of the parameters of ``module`` that do not require gradients.

    Names are reported in the order produced by ``module.named_parameters()``.
    """
    return [
        param_name
        for param_name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

# # Remove unicode error (https://www.kaggle.com/competitions/feedback-prize-2021/discussion/313330)
# def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
#     return error.object[error.start : error.end].encode("utf-8"), error.end


# def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
#     return error.object[error.start : error.end].decode("cp1252"), error.end

# # Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
# codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

# def resolve_encodings_and_normalize(text: str) -> str:
#     """Resolve the encoding problems and normalize the abnormal characters."""
#     text = (
#         text.encode("raw_unicode_escape")
#         .decode("utf-8", errors="replace_decoding_with_cp1252")
#         .encode("cp1252", errors="replace_encoding_with_utf8")
#         .decode("utf-8", errors="replace_decoding_with_cp1252")
#     )
#     text = unidecode(text)
#     return text

# MODELLING

---

## Dataset

---

Used to convert raw text into tokenized data

In [31]:
class _Dataset(Dataset):
    def __init__(self,data_path,test_path, tokenizer,label_encoder,attributes,config, max_token_len: int = 512, is_train=True,is_test=False):
        self.data_path = data_path
        self.test_path = test_path
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.is_train = is_train
        self.is_test = is_test
        self.label_encoder = label_encoder
        self.config = config
        self._prepare_data()

    def _prepare_data(self):
        SEP = self.tokenizer.sep_token # different model uses different to text as seperator (e.g. [SEP], </s>)
        if self.is_test:
            df = self.test_path
            # df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
            df['text'] = df['discourse_type'] + SEP + df['discourse_text']
            try:
              # Validation use
              df = df.loc[:,['text','labels']]
            except:
              # Test use
              df = df.loc[:,['text']]
        else:
            df = self.data_path
            if self.config['param']['sample_mode']:
                df = df.sample(self.config['param']['sample_size'])
            y = df['discourse_effectiveness']

            train_df, val_df = train_test_split(df, test_size=0.2,stratify=y,random_state=91)

            if self.is_train:
                df = train_df.copy()  
            else:
                df = val_df.copy()

            # df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
            df['text'] = df['discourse_type'] + SEP + df['discourse_text']
            df = df.rename(columns={'discourse_effectiveness':'labels'})
            df = df.loc[:,['text','labels']]
      
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self,index):
        item = self.df.iloc[index]
        text = str(item.text)
        tokens = self.tokenizer.encode_plus(text,
                                  add_special_tokens= True,
                                  return_tensors='pt',
                                  truncation=True,
      #                                   padding='max_length',
                                  max_length=self.max_token_len,
                                  return_attention_mask = True)
        if self.is_test:
            return {'input_ids':tokens.input_ids.flatten(),'attention_mask': tokens.attention_mask.flatten()}
        else:
            # # Convert strings to numerics, follow alphabetical order
            attributes = item['labels'].split()
            self.label_encoder.fit(self.attributes)
            attributes = self.label_encoder.transform(attributes)
            attributes = torch.as_tensor(attributes)
            #         attributes = torch.FloatTensor(item[self.attributes])
            return {'input_ids':tokens.input_ids.flatten(),'attention_mask': tokens.attention_mask.flatten(), 'labels':attributes}


## Collate (Dynamic Padding)

---

Dynamically pad tokenized text to match the max length of each batch to reduce computational time

In [32]:
class Collate:
    """Collate function that pads each batch to its own longest sequence.

    Args:
        tokenizer: tokenizer providing ``padding_side`` and ``pad_token_id``.
        isTrain: when True the samples carry a 'labels' entry that is batched too.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def _pad(self, sequence, target_len, fill_value):
        """Return ``sequence`` as a list padded with ``fill_value`` up to ``target_len``."""
        values = sequence.tolist()
        filler = [fill_value] * (target_len - len(values))
        # Honour the tokenizer's padding side. The previous left-padding branch
        # called .tolist() on a list and added a FloatTensor to a list, so it
        # raised for every left-padding tokenizer.
        if self.tokenizer.padding_side == "right":
            return values + filler
        return filler + values

    def __call__(self, batch):
        """Pad and stack a list of samples into a dict of long tensors."""
        # Longest sequence of this batch decides the padded length.
        batch_max = max(len(sample["input_ids"]) for sample in batch)

        output = dict()
        output["input_ids"] = torch.tensor(
            [self._pad(sample["input_ids"], batch_max, self.tokenizer.pad_token_id) for sample in batch],
            dtype=torch.long)
        # Padded positions get attention mask 0.
        output["attention_mask"] = torch.tensor(
            [self._pad(sample["attention_mask"], batch_max, 0) for sample in batch],
            dtype=torch.long)
        if self.isTrain:
            output["labels"] = torch.tensor([sample["labels"] for sample in batch], dtype=torch.long)
        return output



## Data Module

---

Prepares the data by building the datasets and wrapping them in dataloaders, where the samples are collated and batched.

In [33]:
class _Data_Module(pl.LightningDataModule):
    """Lightning data module that builds the datasets and their dataloaders.

    Args:
        data_path: DataFrame with the labelled training data.
        test_path: DataFrame used for prediction.
        attributes: list of class names.
        label_encoder: encoder shared with the datasets.
        tokenizer: tokenizer shared by the datasets and the collate function.
        config: experiment configuration forwarded to the datasets.
        batch_size: batch size of every dataloader.
        max_token_length: truncation length forwarded to the datasets.
    """

    def __init__(self, data_path, test_path,attributes,label_encoder,tokenizer,config, batch_size: int = 8, max_token_length: int = 512):
        super().__init__()
        self.data_path = data_path
        self.test_path = test_path
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.config = config

    def _build_dataset(self, is_train, is_test=False):
        """Create a ``_Dataset`` sharing this module's data, tokenizer and settings."""
        return _Dataset(self.data_path,
                        self.test_path,
                        label_encoder=self.label_encoder,
                        attributes=self.attributes,
                        is_train=is_train,
                        is_test=is_test,
                        tokenizer=self.tokenizer,
                        config=self.config,
                        # Previously stored but never forwarded, so the dataset
                        # always fell back to its own default of 512.
                        max_token_len=self.max_token_length)

    def setup(self, stage = None):
        """Build train/val datasets for fitting and the test dataset for prediction."""
        if stage in (None, "fit"):
            self.train_dataset = self._build_dataset(is_train=True)
            self.val_dataset = self._build_dataset(is_train=False)
        if stage == 'predict':
            self.test_dataset = self._build_dataset(is_train=False, is_test=True)

    def _build_loader(self, dataset, shuffle, with_labels):
        """Wrap ``dataset`` in a DataLoader that pads dynamically per batch."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=2,
                          shuffle=shuffle,
                          collate_fn=Collate(self.tokenizer, isTrain=with_labels))

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._build_loader(self.train_dataset, shuffle=True, with_labels=True)

    def val_dataloader(self):
        """Ordered loader over the validation split (labels included)."""
        return self._build_loader(self.val_dataset, shuffle=False, with_labels=True)

    def predict_dataloader(self):
        """Ordered loader over the test data (no labels)."""
        return self._build_loader(self.test_dataset, shuffle=False, with_labels=False)


## Classifier

---



### DistilBert classifier

In [34]:
class DistilBert_Text_Classifier(pl.LightningModule):
    """Pretrained encoder with a linear classification head.

    The embeddings and the first ``config['param']['n_freeze']`` transformer
    layers are frozen; the remaining layers and the head are trained.

    Args:
        config: experiment configuration (reads 'model_name', 'wandb' and the
            'param' sub-dictionary).
        data_module: data module whose training dataloader length is used to
            size the learning-rate schedule.
    """

    def __init__(self, config: dict,data_module):
        super().__init__()
        self.config = config
        self.data_module=data_module
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict = True)
        freeze((self.pretrained_model).embeddings)
        freeze((self.pretrained_model).transformer.layer[:config['param']['n_freeze']])

        # Classification head on top of the pooled encoder output.
        self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.config['param']['n_labels'])

        # CrossEntropyLoss consumes raw logits, so no softmax layer is added.
        self.loss_func = nn.CrossEntropyLoss()

        self.dropout = nn.Dropout(config['param']['p_dropout'])

    def forward(self, input_ids, attention_mask, labels = None):
        """Return ``(loss, logits)``; ``loss`` is 0 when ``labels`` is None."""
        output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)
        # last_hidden_state is (batch_size, sequence_length, hidden_size);
        # averaging over dim 1 gives (batch_size, hidden_size).
        # NOTE(review): the mean also covers padded positions (attention_mask
        # is not applied to the pooling). Left unchanged because existing
        # checkpoints were trained with this pooling.
        pooled_output = torch.mean(output.last_hidden_state, 1)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = 0
        if labels is not None:
            loss = self.loss_func(logits,labels)
        return loss, logits

    def _batch_f1(self, logits, labels):
        """Compute the default-averaged and the weighted F1 score of one batch."""
        n_labels = self.config['param']['n_labels']
        f1 = f1_score(logits.argmax(dim=1),labels,num_classes=n_labels,multiclass=True)
        f1_weighted = f1_score(logits.softmax(dim=1),labels,num_classes=n_labels,multiclass=True,average='weighted')
        return f1, f1_weighted

    @staticmethod
    def _epoch_mean(outputs, key):
        """Average the per-batch values stored under ``key`` into a Python float."""
        return torch.stack([torch.as_tensor(out[key]).detach().float() for out in outputs]).mean().item()

    def training_step(self, batch, batch_index):
        """Run one training batch and log its loss and F1 scores."""
        loss, logits = self(**batch)
        f1, f1_weighted = self._batch_f1(logits, batch['labels'])
        self.log("f1", f1, on_step=True,on_epoch=True, prog_bar = True, logger=True)
        self.log("f1_weighted", f1_weighted, on_step=True,on_epoch=True, prog_bar = True, logger=True)
        self.log("loss", loss,on_step=True,on_epoch = True, prog_bar = True, logger=True)
        return {"loss":loss,"f1":f1,"f1_weighted":f1_weighted}

    def training_epoch_end(self, outputs):
        """Send the epoch-averaged training metrics to wandb when enabled."""
        if not outputs or not self.config['wandb']:
            return
        # Average over all batches; reporting outputs[0] alone would describe
        # only the first batch of the epoch.
        wandb.log({"Train Loss": self._epoch_mean(outputs, 'loss'),
                   'Train F1': self._epoch_mean(outputs, 'f1'),
                   'Train F1 Weighted': self._epoch_mean(outputs, 'f1_weighted')})

    def validation_step(self, batch, batch_index):
        """Run one validation batch and log its loss and F1 scores."""
        loss, logits = self(**batch)
        f1, f1_weighted = self._batch_f1(logits, batch['labels'])
        # Validation metrics use their own keys so they no longer share the
        # "f1" / "f1_weighted" names logged by training_step.
        self.log("val_f1", f1, on_step=True,on_epoch=True, prog_bar = True, logger=True)
        self.log("val_f1_weighted", f1_weighted, on_step=True,on_epoch=True, prog_bar = True, logger=True)
        self.log("val_loss", loss, on_step=True,on_epoch = True, prog_bar = True, logger=True)
        return {"val_loss": loss,"val_f1":f1,"val_f1_weighted":f1_weighted}

    def validation_epoch_end(self, outputs):
        """Send the epoch-averaged validation metrics to wandb when enabled."""
        if not outputs or not self.config['wandb']:
            return
        wandb.log({"Val Loss": self._epoch_mean(outputs, 'val_loss'),
                   'Val F1': self._epoch_mean(outputs, 'val_f1'),
                   'Val F1 Weighted': self._epoch_mean(outputs, 'val_f1_weighted')})

    def predict_step(self, batch, batch_index):
        """Return the logits of one prediction batch."""
        loss, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        """Create AdamW and, when enabled, a per-step cosine schedule with warm-up."""
        param = self.config['param']
        optimizer = torch.optim.AdamW(self.parameters(), lr=param['lr'], weight_decay=param['weight_decay'])
        if not param['scheduler']:
            return optimizer

        # len(dataloader) is already the number of batches per epoch, so the
        # schedule length is batches-per-epoch times the number of epochs
        # (dividing by batch_size again made the schedule far too short).
        batches_per_epoch = len(self.data_module.train_dataloader())
        total_steps = batches_per_epoch * param['n_epochs']
        warmup_steps = math.floor(total_steps * param['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        # The step counts above are optimizer steps, so the scheduler must be
        # advanced after every batch rather than once per epoch.
        return {"optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}

# TRAINING & VALIDATION

---

## Training Function

In [35]:
def train(config,Text_Classifier,project, notes,timestr):
    """Fine-tune ``Text_Classifier`` and save its weights.

    Uses the notebook-level ``data_path``, ``test_path`` and ``attributes``.

    Args:
        config: experiment configuration; ``config['newly_tuned_model_path']``
            is updated with the path of the saved weights.
        Text_Classifier: LightningModule class called as ``Text_Classifier(config, data_module)``.
        project: wandb project name (used only when ``config['wandb']`` is true).
        notes: free text appended to the wandb run notes.
        timestr: timestamp string used for the wandb tag and the file name.

    Returns:
        Tuple ``(model, config)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'], use_fast=True)
    le = LabelEncoder()

    data_module = _Data_Module(data_path,
                               test_path,
                               attributes,
                               le,
                               tokenizer,
                               batch_size=config['param']['batch_size'],
                               config = config
                               )
    data_module.setup()

    model = Text_Classifier(config,data_module)

    # One Trainer for both cases; SWA only adds a callback.
    callbacks = [EarlyStopping(monitor="val_loss", mode="min",patience = 3),
                 TQDMProgressBar(refresh_rate=30)]
    if config['param']['swa']:
        callbacks.append(StochasticWeightAveraging(swa_lrs=config['param']['swa_lrs']))

    trainer = pl.Trainer(max_epochs=config['param']['n_epochs'],
                         accelerator='auto',
                         callbacks=callbacks,
                         default_root_dir="./checkpoints",
                         deterministic=True, # To ensure reproducibility
                         precision = config['param']['precision'], # 16 enables mixed precision
                         )

    if config['wandb']:
        run = wandb.init(name = f"{str(config['param'])}",
                         notes = str(config) + notes,
                         project=project,
                         tags = [timestr])
        try:
            trainer.fit(model, data_module)
        finally:
            # Close the run even when training raises.
            run.finish()
    else:
        trainer.fit(model, data_module)

    # Fall back to the working directory when TRAINED_DIR was never defined.
    out_dir = globals().get('TRAINED_DIR', './')
    # The descriptive name can be rejected by the filesystem (it embeds the
    # whole parameter dict), so a short timestamp-only name is tried next.
    candidate_paths = [f"{out_dir}{str(config['param'])}_{timestr}.pth",
                       f"{out_dir}{timestr}.pth"]
    for tuned_model_path in candidate_paths:
        try:
            torch.save(model.state_dict(), tuned_model_path)
        except Exception as err:
            print(f"Could not save model to {tuned_model_path!r}: {err}")
            continue
        # Record the path only after the weights were actually written.
        config['newly_tuned_model_path'] = tuned_model_path
        break
    else:
        print("Failed to save model")

    return model,config


In [36]:
# attributes = ["Adequate" ,"Effective","Ineffective"]
# #     config['model_name'] = config['existing_tuned_model_name']
# tokenizer = AutoTokenizer.from_pretrained(distilbert_config['model_name'], use_fast=True)
# le = LabelEncoder()

#     # Initialize data module
# test_data_module = _Data_Module(data_path,
#                                     test_path,
#                                     attributes,
#                                     le,
#                                     tokenizer,
#                                     batch_size=distilbert_config['param']['batch_size'],
#                                     config=distilbert_config
#                                    )
# test_data_module.setup()

#     # Initialize Model
# model = DistilBert_Text_Classifier(distilbert_config,test_data_module)
# model.load_state_dict(torch.load(distilbert_config['newly_tuned_model_path']))

# # # state_dict = torch.load(distilbert_config['newly_tuned_model_path'])
# # model_hf = AutoModel.from_pretrained(distilbert_config['model_name'])
# # # model_hf.save_pretrained(f"{TRAINED_DIR}{distilbert_config['param']}_{timestr}",state_dict=state_dict)
# # model_hf.load_state_dict(model)
# # model_hf = AutoModel.from_pretrained("./{'n_labels': 3, 'batch_size': 64, 'lr': 0.0008, 'warmup': 0, 'weight_decay': 0.01, 'n_epochs': 5, 'n_freeze': 5, 'p_dropout': 0, 'scheduler': False, 'precision': 16, 'sample_mode': True, 'sample_size': 100, 'swa': False, 'swa_lrs': 0.01}_20220822-011207")

## Validation Function

In [37]:
def validate(_Text_Classifier,config,data_path,val_path,attributes,timestr):
    """Predict on ``val_path`` with the saved weights and write the results to CSV.

    Args:
        _Text_Classifier: LightningModule class called as ``_Text_Classifier(config, data_module)``.
        config: experiment configuration; weights are loaded from
            ``config['newly_tuned_model_path']``.
        data_path: DataFrame with the labelled training data.
        val_path: DataFrame to predict on (passed to the data module as test data).
        attributes: class names, used as column names for the logits.
        timestr: timestamp string used in the output file name.

    Returns:
        Tuple ``(output_df, y_pred)``: the input rows with a
        'pred_discourse_effectiveness' column, and the logits tensor.
    """
    # wandb logging is switched off while validating and restored afterwards,
    # also when an error occurs.
    previous_wandb_flag = config['wandb']
    config['wandb'] = False
    try:
        tokenizer = AutoTokenizer.from_pretrained(config['model_name'], use_fast=True)
        le = LabelEncoder()

        val_data_module = _Data_Module(data_path,
                                       val_path,
                                       attributes,
                                       le,
                                       tokenizer,
                                       batch_size=config['param']['batch_size'],
                                       config = config
                                       )
        val_data_module.setup()

        model = _Text_Classifier(config,val_data_module)
        # map_location lets weights saved on a GPU load on a CPU-only machine.
        state_dict = torch.load(config['newly_tuned_model_path'], map_location='cpu')
        model.load_state_dict(state_dict)

        trainer = pl.Trainer(accelerator='auto')

        # predict() returns one logits tensor per batch.
        batch_logits = trainer.predict(model, datamodule=val_data_module)
        y_pred = torch.cat(list(batch_logits)).cpu()

        argmax_output = y_pred.argmax(dim=1).numpy()

        val_df = val_path.copy()
        output_df = pd.concat([val_df.reset_index(drop=True), pd.DataFrame(argmax_output,columns=['pred_discourse_effectiveness'])], axis=1)
        output_df['pred_discourse_effectiveness'] = output_df['pred_discourse_effectiveness'].map({0:'Adequate',1:'Effective',2:'Ineffective'})
        df = pd.concat([output_df.reset_index(drop=True),pd.DataFrame(y_pred.tolist(),columns=attributes)],axis=1)

        # Fall back to the working directory when TRAINED_DIR was never defined.
        out_dir = globals().get('TRAINED_DIR', './')
        df.to_csv(f'{out_dir}distilbert_valresult_{timestr}.csv',index=False)
    finally:
        config['wandb'] = previous_wandb_flag
    return output_df,y_pred




In [38]:
## Validation Configuration

# Class names in alphabetical order.
attributes = ["Adequate" ,"Effective","Ineffective"]

# Build the validation frame with a stratified 80/20 split of the training data.
# The split arguments (test_size=0.2, stratified on the label, random_state=91)
# are the same ones _Dataset uses, so when sample_mode is off this presumably
# reproduces the model's own validation rows -- TODO confirm.
df = data_path.copy()
y = df['discourse_effectiveness']
train_df,val_df = train_test_split(df, test_size=0.2,stratify=y,random_state=91)

# Optional: run on a smaller validation set as an example. The actual validation
# was run in a Kaggle notebook and saved as a csv file.
# y = val_df['discourse_effectiveness']
# train_df,val_df = train_test_split(val_df, test_size=0.01,stratify=y,random_state=91)

# val_path refers to the same DataFrame object as val_df (no copy is made).
val_path = val_df
# Class distribution of the validation rows (displayed as the cell output).
val_path['discourse_effectiveness'].value_counts()

## Validation Utility

In [39]:
#Convert prediction in tensor format to argmax numpy format
def pred_to_argmax(y_pred,df,model_name):
    """Append the predicted class name of every row to ``df``.

    The highest-scoring column of ``y_pred`` is mapped to its class name and
    stored in a new column ``pred_effectiveness_<model_name>``. The index of
    ``df`` is reset so rows line up with the predictions by position.
    """
    column = f'pred_effectiveness_{model_name}'
    class_ids = y_pred.argmax(dim=1).numpy()
    predicted = pd.DataFrame(class_ids, columns=[column])
    merged = pd.concat([df.reset_index(drop=True), predicted], axis=1)
    merged[column] = merged[column].map({0:'Adequate',1:'Effective',2:'Ineffective'})
    return merged

#Plot confusion matrix
def do_conf_matrix(y_true, y_pred, ax, title=None):
    """Draw a confusion matrix, normalised by the total count, on ``ax`` and show it.

    Rows and columns follow the order of the notebook-level ``attributes`` list.
    """
    counts = confusion_matrix(y_true, y_pred, labels=attributes)
    # Each cell is shown as a share of all samples.
    shares = counts / np.sum(counts)
    sns.heatmap(shares, annot=True, fmt='.2%', ax=ax, cmap='Blues')

    # Axis labels, title and tick labels.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(f'Confusion Matrix - {title}')
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticklabels(attributes)

    plt.show()
# Calculate metrics
def calculate_score(y_pred,df):
    """Return ``(f1, f1_weighted, loss)`` for the logits ``y_pred``.

    The targets are read from the 'discourse_effectiveness_numeric' column of ``df``.
    """
    numeric_labels = df['discourse_effectiveness_numeric'].values

    # F1 with average=None, computed from the arg-max class.
    per_class_f1 = f1_score(y_pred.argmax(dim=1),
                            torch.tensor(numeric_labels),
                            num_classes=3,
                            average=None)
    # Weighted F1, computed from the softmax probabilities.
    weighted_f1 = f1_score(y_pred.softmax(dim=1),
                           torch.tensor(numeric_labels),
                           num_classes=3,
                           multiclass=True,
                           average='weighted')
    criterion = nn.CrossEntropyLoss()
    cross_entropy = criterion(y_pred, torch.tensor(numeric_labels))

    return per_class_f1, weighted_f1, cross_entropy

def validate_score(y_pred,val_path,model_name):
    """Plot the confusion matrix and compute validation metrics for one model.

    Args:
        y_pred: torch tensor of per-class scores with the classes along dim 1,
            one row per row of ``val_path`` and in the same order.
        val_path: Validation DataFrame with a ``discourse_effectiveness``
            column of string labels. It is copied, not modified.
        model_name: Model identifier. It names the prediction column
            (``pred_effectiveness_<model_name>``) and titles the plot.

    Returns:
        The ``(f1, f1_weighted, loss)`` tuple produced by ``calculate_score``.
    """
    df = val_path.copy()
    df['discourse_effectiveness_numeric'] = df['discourse_effectiveness'].map({'Adequate':0,'Effective':1,'Ineffective':2})
    # Capture the true labels before pred_to_argmax resets the index; row order is unchanged.
    y_true = df['discourse_effectiveness'].values

    _, ax = plt.subplots(1,1,figsize=(10, 10))
    df = pred_to_argmax(y_pred,df,model_name)

    # pred_to_argmax names the column after model_name, so look it up the same
    # way. The column name used to be hard-coded to "distilbert", which raised
    # KeyError for any other model_name.
    pred_col = f'pred_effectiveness_{model_name}'
    do_conf_matrix(y_true, df[pred_col].values, ax=ax, title=model_name)

    return calculate_score(y_pred,df)

## Execute Training & Validation

Reference: [PyTorch Lightning – Training tricks](https://pytorch-lightning.readthedocs.io/en/latest/advanced/training_tricks.html)

In [40]:
# One pass per hyper-parameter configuration; only configuration 0 is defined.
for iteration in range(1):
    # Run timestamp: passed to train()/validate() and used below to locate the
    # validation-result CSV of this run.
    # NOTE(review): `time` is not among the imports in the notebook's top import
    # cell — confirm it is imported before this cell runs.
    timestr = time.strftime("%Y%m%d-%H%M%S")

    if iteration == 0:
        distilbert_config['param']['lr'] = 5e-5
        distilbert_config['param']['sample_mode'] = False
        distilbert_config['param']['precision'] = 16
        distilbert_config['param']['n_freeze'] = 4
        distilbert_config['param']['n_epochs'] = 10
        distilbert_config['param']['weight_decay'] = 0
        distilbert_config['param']['p_dropout'] = 0.4
        distilbert_config['param']['scheduler'] = True
        distilbert_config['param']['warmup'] = 0.3
        distilbert_config['param']['swa'] = False
        distilbert_config['param']['swa_lrs'] = 0.1

    print(f'Iteration {iteration} : {str(distilbert_config)}')
    ## Train
    # `notes` is free text describing the model's forward pass for this run.
    distilbert_model, distilbert_config  = train(config = distilbert_config,
                                        Text_Classifier = DistilBert_Text_Classifier,
                                        project = 'DistilBert_Text_Classifier',
                                        timestr = timestr,
                                        notes = 
                                                  """
                                                output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask), 
                                                pooled_output = torch.mean(output.last_hidden_state, 1), 
                                                pooled_output = F.relu(pooled_output), 
                                                pooled_output = self.dropout(pooled_output)
                                                logits = self.classifier(pooled_output), 
                                                dynamic collate = True
                                                  """)

    ## Validate last run
    val_output_df_distilbert, y_pred_distilbert = validate(DistilBert_Text_Classifier,
                                                      distilbert_config,
                                                      data_path,
                                                      val_path,
                                                      attributes,
                                                      timestr)

    # Visualize scores: reload the per-class outputs saved for this run.
    val_result = pd.read_csv(f'{TRAINED_DIR}distilbert_valresult_{timestr}.csv')
    y_pred = torch.from_numpy(val_result.loc[:,['Adequate','Effective','Ineffective']].values)

    output = validate_score(y_pred,val_path,'distilbert')

    # Per-class F1, indexed by class label. set_index returns a new frame, so
    # the result must be assigned (the bare call was a no-op).
    f1_score_df = pd.DataFrame({'distilbert':output[0].tolist()})
    f1_score_df['label'] = attributes
    f1_score_df = f1_score_df.set_index('label')
    print(f1_score_df)

    # Single-row summary of the aggregate metrics.
    f1_weight_score = [output[1].item()]
    logloss_score = [output[2].item()]

    score_df = pd.DataFrame({'f1_weighted_score':f1_weight_score})
    score_df['log loss'] = logloss_score
    score_df['model'] = 'distilbert'
    print(score_df)


# PREDICTION AND KAGGLE SUBMISSION
---

Competition page: https://www.kaggle.com/competitions/feedback-prize-effectiveness


## Main function to execute prediction on test dataset

In [41]:
def predict(_Text_Classifier,config):
    """Run the fine-tuned classifier over the test set and build the submission frame.

    Relies on the module-level ``data_path``, ``test_path`` and ``_Data_Module``.

    Args:
        _Text_Classifier: Model class, instantiated as
            ``_Text_Classifier(config, data_module)``.
        config: Dict providing ``model_name`` (tokenizer to load),
            ``param.batch_size`` and ``newly_tuned_model_path`` (state-dict file).

    Returns:
        Tuple ``(output_df, y_pred)``. ``output_df`` has the columns
        ``discourse_id, Ineffective, Adequate, Effective`` holding the softmax
        of the model outputs. ``y_pred`` is the concatenated raw output of
        ``trainer.predict`` (presumably logits) in Adequate/Effective/Ineffective
        column order.
    """
    attributes = ["Adequate" ,"Effective","Ineffective"]
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'], use_fast=True)
    le = LabelEncoder()

    # Initialize data module
    test_data_module = _Data_Module(data_path,
                                    test_path,
                                    attributes,
                                    le,
                                    tokenizer,
                                    batch_size=config['param']['batch_size'],
                                    config=config
                                   )
    test_data_module.setup()

    # Initialize model and restore the fine-tuned weights. map_location='cpu'
    # lets a checkpoint saved on a GPU load on a CPU-only machine;
    # load_state_dict then copies the values into the model's own parameters.
    model = _Text_Classifier(config,test_data_module)
    model.load_state_dict(torch.load(config['newly_tuned_model_path'], map_location='cpu'))

    # Initialize Trainer
    trainer = pl.Trainer(accelerator='auto')

    # Run predictions: one tensor per batch, concatenated along the sample dim.
    predictions = trainer.predict(model, datamodule=test_data_module)
    y_pred = torch.cat(list(predictions))

    # Use torch's softmax so the result stays a tensor. The `softmax` imported
    # at the top of the notebook is scipy.special.softmax, which is documented
    # to return an ndarray, so calling `.numpy()` on its result is not reliable.
    softmax_outputs = torch.softmax(y_pred, dim=1)

    test_df = test_path.copy()
    probs_df = pd.DataFrame(softmax_outputs.detach().cpu().numpy(), columns=attributes)
    output_df = pd.concat([test_df[['discourse_id']].reset_index(drop=True), probs_df], axis=1)

    # Reorder the columns for the submission file.
    new_cols = ["discourse_id","Ineffective","Adequate","Effective"]
    output_df = output_df[new_cols]

    return output_df, y_pred

In [42]:

# Run inference on the test set: `df` is the submission-ready frame and
# `y_pred` the raw model outputs.
df, y_pred = predict(DistilBert_Text_Classifier,distilbert_config)

# torch.softmax keeps the result a tensor, which the `.numpy()` call in the
# submission cell requires. scipy.special.softmax, the `softmax` imported at
# the top of the notebook, is documented to return an ndarray instead.
softmax_outputs = torch.softmax(y_pred, dim=1)

In [43]:
# Create dataframe for submission: discourse ids next to the class probabilities.
# np.asarray accepts both a CPU torch tensor and an ndarray, so this works
# whichever of the two `softmax_outputs` happens to be.
# predict() already returns an equivalent frame (`df`); this rebuilds it from
# the probabilities for clarity.
output_df = pd.concat([test_path[['discourse_id']].reset_index(drop=True), pd.DataFrame(np.asarray(softmax_outputs), columns=attributes)], axis=1)

# Re-arrange columns
new_cols = ["discourse_id","Ineffective","Adequate","Effective"]
output_df = output_df[new_cols]

In [44]:
# Write the submission file, then read it back as the cell output so the
# exact content that was saved can be inspected.
submission_file = 'submission.csv'
output_df.to_csv(submission_file, index=False)
pd.read_csv(submission_file)