# PyTorch BERT Multi-Model Trainer + KFolds🎯

📌 KFolds Inference (Submission) Notebook : https://www.kaggle.com/heyytanay/inference-0-6-lb-vanilla-pytorch-bert-starter

📌 My EDA and Multi Linear Models Notebook: https://www.kaggle.com/heyytanay/commonlit-readability-eda-multi-models

In [1]:
!pip install transformers==2.5.1

Collecting transformers==2.5.1
  Downloading transformers-2.5.1-py3-none-any.whl (499 kB)
[K     |████████████████████████████████| 499 kB 4.1 MB/s eta 0:00:01
Collecting tokenizers==0.5.2
  Downloading tokenizers-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 22.6 MB/s eta 0:00:01
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.2
    Uninstalling tokenizers-0.10.2:
      Successfully uninstalled tokenizers-0.10.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.5.1
    Uninstalling transformers-4.5.1:
      Successfully uninstalled transformers-4.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.3.0 requires transformers<4.6,>=4.1, but you have transformers 2.5.1 wh

In [2]:
import os
import platform
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.simplefilter('ignore')

In [3]:
class Config:
    """Central place for the training run's settings and shared objects.

    Everything here is a class attribute, so the tokenizer and the gradient
    scaler are built once, when this class statement executes.
    """

    # --- data and checkpoint selection ---
    FILE_NAME = '../input/commonlitreadabilityprize/train.csv'
    MODEL_NAME = 'distilbert-base-uncased'

    # --- tokenisation / batching ---
    MAX_LEN = 185    # max_length handed to the tokenizer
    TRAIN_BS = 16    # training DataLoader batch size
    VALID_BS = 32    # validation DataLoader batch size

    # --- optimisation ---
    N_SPLITS = 5     # number of cross-validation folds
    NB_EPOCHS = 10   # epochs trained per fold
    LR = 1e-6        # learning rate given to the optimizer

    # --- shared heavyweight objects ---
    # Tokenizer matching MODEL_NAME; must be defined after MODEL_NAME.
    TOKENIZER = transformers.DistilBertTokenizer.from_pretrained(
        MODEL_NAME,
        do_lower_case=True,
    )
    # Mixed-precision loss scaler used by the training loop.
    scaler = GradScaler()

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [4]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        review: Indexable collection of texts (indexed with integer positions).
        target: Indexable collection of regression targets aligned with
            ``review``. Only read when ``is_test`` is False.
        is_test: When True, items carry no ``'targets'`` entry.

    The tokenizer and maximum length are taken from ``Config`` at
    construction time.
    """

    def __init__(self, review, target=None, is_test=False):
        self.review = review
        self.target = target
        self.is_test = is_test
        self.tokenizer = Config.TOKENIZER
        self.max_len = Config.MAX_LEN

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.review)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            dict with ``'ids'`` and ``'mask'`` (both ``torch.long`` tensors)
            and, unless ``is_test`` is set, ``'targets'`` (``torch.float``).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        text = ' '.join(str(self.review[idx]).split())

        # Keep the encoding local: the previous `global inputs` overwrote a
        # module-level name on every item fetch for no benefit.
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True
        )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.target[idx], dtype=torch.float)
        return item

In [5]:
class Trainer:
    """Runs training and validation epochs for one model.

    Args:
        model: Module called as ``model(ids=..., mask=...)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        train_dataloader: Iterable of batches with ``'ids'``, ``'mask'`` and
            ``'targets'`` entries; must support ``len()``.
        valid_dataloader: Same batch layout, used for validation.
        device: Device the batches are moved to.

    The mixed-precision gradient scaler is read from ``Config.scaler``.
    """

    def __init__(
        self,
        model,
        optimizer,
        scheduler,
        train_dataloader,
        valid_dataloader,
        device
    ):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_data = train_dataloader
        self.valid_data = valid_dataloader
        self.loss_fn = self.yield_loss
        self.device = device

    def yield_loss(self, outputs, targets):
        """
        This is the loss function for this task: root mean squared error.
        """
        return torch.sqrt(nn.MSELoss()(outputs, targets))

    def train_one_epoch(self):
        """
        This function trains the model for 1 epoch through all batches
        and returns the mean of the per-batch losses.
        """
        prog_bar = tqdm(enumerate(self.train_data), total=len(self.train_data))
        self.model.train()
        all_losses = []
        for idx, inputs in prog_bar:
            ids = inputs['ids'].to(self.device, dtype=torch.long)
            mask = inputs['mask'].to(self.device, dtype=torch.long)
            targets = inputs['targets'].to(self.device, dtype=torch.float)

            # Autocast only the forward pass and the loss. The backward pass
            # and the optimizer step are meant to run outside the autocast
            # region (see the PyTorch AMP usage notes).
            with autocast():
                outputs = self.model(ids=ids, mask=mask).view(-1)
                loss = self.loss_fn(outputs, targets)

            batch_loss = loss.item()
            prog_bar.set_description('loss: {:.2f}'.format(batch_loss))
            all_losses.append(batch_loss)

            # Scaled backward + step; gradients are cleared right after the
            # step so the next batch starts from zero.
            Config.scaler.scale(loss).backward()
            Config.scaler.step(self.optimizer)
            Config.scaler.update()
            self.optimizer.zero_grad()
            self.scheduler.step()

        train_loss = sum(all_losses) / len(all_losses)
        return train_loss

    def valid_one_epoch(self):
        """
        This function validates the model for one epoch through all batches of the valid dataset
        It also returns the validation Root mean squared error for assessing model performance.
        """
        prog_bar = tqdm(enumerate(self.valid_data), total=len(self.valid_data))
        self.model.eval()
        all_targets = []
        all_predictions = []
        with torch.no_grad():
            for idx, inputs in prog_bar:
                ids = inputs['ids'].to(self.device, dtype=torch.long)
                mask = inputs['mask'].to(self.device, dtype=torch.long)
                targets = inputs['targets'].to(self.device, dtype=torch.float)

                outputs = self.model(ids=ids, mask=mask).view(-1)
                # No graph is recorded under no_grad, so the tensors can be
                # converted to Python lists directly.
                all_targets.extend(targets.cpu().tolist())
                all_predictions.extend(outputs.cpu().tolist())

        # RMSE over the whole validation set (not a mean of batch RMSEs).
        val_rmse_loss = np.sqrt(mean_squared_error(all_targets, all_predictions))
        print('Validation RMSE: {:.2f}'.format(val_rmse_loss))

        return val_rmse_loss

    def get_model(self):
        """Return the model held by this trainer."""
        return self.model

In [6]:
# Model
class DBERT_BASE_UNCASED(nn.Module):
    """DistilBERT encoder followed by dropout and a single-output linear head.

    The encoder weights are loaded from ``Config.MODEL_NAME``.
    """

    def __init__(self):
        super().__init__()
        self.dbert = transformers.DistilBertModel.from_pretrained(Config.MODEL_NAME)
        self.drop = nn.Dropout(0.2)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one value per sequence, shaped ``(batch, 1)``.

        Args:
            ids: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder.
        """
        # First element of the encoder output: per-token hidden states.
        hidden_states = self.dbert(ids, attention_mask=mask)[0]
        # Use the representation at the first token position of each sequence.
        first_token = hidden_states[:, 0, :]
        return self.out(self.drop(first_token))

In [7]:
def yield_optimizer(model):
    """
    Returns an AdamW optimizer with weight decay disabled for biases and
    layer-norm parameters.

    Args:
        model: Module whose ``named_parameters()`` are optimized.

    Returns:
        ``transformers.AdamW`` over two parameter groups (decay 0.003 and
        decay 0.0), using ``Config.LR`` as the learning rate.
    """

    def _skips_decay(param_name):
        # Layer-norm modules are not named uniformly: DistilBERT's embeddings
        # use `LayerNorm`, while its transformer blocks use `sa_layer_norm`
        # and `output_layer_norm`. A case-sensitive "LayerNorm.weight" match
        # misses the latter, so those weights were being decayed.
        lowered = param_name.lower()
        return (
            "bias" in param_name
            or "layernorm" in lowered
            or "layer_norm" in lowered
        )

    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if _skips_decay(name):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_parameters = [
        {"params": decay_params, "weight_decay": 0.003},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    return transformers.AdamW(optimizer_parameters, lr=Config.LR)

In [None]:
# Training Code
if __name__ == '__main__':
    # Select the compute device once; it is reused for every fold.
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
        DEVICE = torch.device('cuda:0')
    else:
        print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
        DEVICE = torch.device('cpu')

    # Directory that receives the best checkpoint of each fold.
    os.makedirs("./states_list", exist_ok=True)

    # Load, shuffle, and keep only the text and target columns.
    data = pd.read_csv(Config.FILE_NAME)
    data = data.sample(frac=1).reset_index(drop=True)
    data = data[['excerpt', 'target']]

    # Do Kfolds training and cross validation.
    # The continuous target is binned (Sturges' rule for the bin count) so
    # StratifiedKFold can balance the target distribution across folds.
    kf = StratifiedKFold(n_splits=Config.N_SPLITS)
    nb_bins = int(np.floor(1 + np.log2(len(data))))
    data.loc[:, 'bins'] = pd.cut(data['target'], bins=nb_bins, labels=False)

    # One {epoch: loss} dict per fold.
    train_losses = [{} for _ in range(Config.N_SPLITS)]
    val_losses = [{} for _ in range(Config.N_SPLITS)]

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X=data, y=data['bins'].values)):
        print(f"\nFold: {fold}")
        print(f"{'-'*20}\n")

        train_data = data.loc[train_idx]
        valid_data = data.loc[valid_idx]

        train_set = BERTDataset(
            review=train_data['excerpt'].values,
            target=train_data['target'].values
        )

        valid_set = BERTDataset(
            review=valid_data['excerpt'].values,
            target=valid_data['target'].values
        )

        train = DataLoader(
            train_set,
            batch_size=Config.TRAIN_BS,
            shuffle=True,
            num_workers=8
        )

        valid = DataLoader(
            valid_set,
            batch_size=Config.VALID_BS,
            shuffle=False,
            num_workers=8
        )

        # A fresh model, optimizer and schedule for every fold.
        model = DBERT_BASE_UNCASED().to(DEVICE)
        # The scheduler is stepped once per batch, so the schedule length must
        # be the real number of batches. len(train) includes the final partial
        # batch; len(train_data) / TRAIN_BS undercounts it, which drove the
        # learning rate to zero before the last epoch had finished.
        nb_train_steps = len(train) * Config.NB_EPOCHS
        optimizer = yield_optimizer(model)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=nb_train_steps
        )

        trainer = Trainer(model, optimizer, scheduler, train, valid, DEVICE)

        # Start from +inf so the first epoch always produces a checkpoint,
        # whatever the scale of the loss.
        best_loss = float('inf')
        for epoch in range(1, Config.NB_EPOCHS+1):
            print(f"\n{'--'*5} EPOCH: {epoch} {'--'*5}\n")

            # Train for 1 epoch
            train_loss = trainer.train_one_epoch()
            train_losses[fold][epoch] = train_loss

            # Validate for 1 epoch
            current_loss = trainer.valid_one_epoch()
            val_losses[fold][epoch] = current_loss

            # Keep only the best-scoring weights of this fold on disk.
            if current_loss < best_loss:
                print(f"Saving best model in this fold: {current_loss:.4f}")
                torch.save(trainer.get_model().state_dict(), f"./states_list/{Config.MODEL_NAME}_fold_{fold}.pt")
                best_loss = current_loss

        print(f"Best RMSE in fold: {fold} was: {best_loss:.4f}")
        print(f"Final RMSE in fold: {fold} was: {current_loss:.4f}")

[INFO] Using GPU: Tesla P100-PCIE-16GB


Fold: 0
--------------------


---------- EPOCH: 1 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.95
Saving best model in this fold: 0.9514

---------- EPOCH: 2 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.79
Saving best model in this fold: 0.7945

---------- EPOCH: 3 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.70
Saving best model in this fold: 0.7020

---------- EPOCH: 4 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.67
Saving best model in this fold: 0.6699

---------- EPOCH: 5 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65
Saving best model in this fold: 0.6533

---------- EPOCH: 6 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65
Saving best model in this fold: 0.6507

---------- EPOCH: 7 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.64
Saving best model in this fold: 0.6434

---------- EPOCH: 8 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.63
Saving best model in this fold: 0.6334

---------- EPOCH: 9 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.63
Saving best model in this fold: 0.6284

---------- EPOCH: 10 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.63
Saving best model in this fold: 0.6277
Best RMSE in fold: 0 was: 0.6277
Final RMSE in fold: 0 was: 0.6277

Fold: 1
--------------------


---------- EPOCH: 1 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.88
Saving best model in this fold: 0.8839

---------- EPOCH: 2 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.72
Saving best model in this fold: 0.7213

---------- EPOCH: 3 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.67
Saving best model in this fold: 0.6715

---------- EPOCH: 4 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.67
Saving best model in this fold: 0.6677

---------- EPOCH: 5 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65
Saving best model in this fold: 0.6534

---------- EPOCH: 6 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65
Saving best model in this fold: 0.6460

---------- EPOCH: 7 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65

---------- EPOCH: 8 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.64
Saving best model in this fold: 0.6397

---------- EPOCH: 9 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.64
Saving best model in this fold: 0.6364

---------- EPOCH: 10 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.64
Saving best model in this fold: 0.6359
Best RMSE in fold: 1 was: 0.6359
Final RMSE in fold: 1 was: 0.6359

Fold: 2
--------------------


---------- EPOCH: 1 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 1.00
Saving best model in this fold: 1.0012

---------- EPOCH: 2 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.84
Saving best model in this fold: 0.8389

---------- EPOCH: 3 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.73
Saving best model in this fold: 0.7331

---------- EPOCH: 4 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.69
Saving best model in this fold: 0.6920

---------- EPOCH: 5 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.67
Saving best model in this fold: 0.6724

---------- EPOCH: 6 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66
Saving best model in this fold: 0.6582

---------- EPOCH: 7 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.65
Saving best model in this fold: 0.6490

---------- EPOCH: 8 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66

---------- EPOCH: 9 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66

---------- EPOCH: 10 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66
Best RMSE in fold: 2 was: 0.6490
Final RMSE in fold: 2 was: 0.6565

Fold: 3
--------------------


---------- EPOCH: 1 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.90
Saving best model in this fold: 0.8965

---------- EPOCH: 2 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.75
Saving best model in this fold: 0.7534

---------- EPOCH: 3 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.70
Saving best model in this fold: 0.6971

---------- EPOCH: 4 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.68
Saving best model in this fold: 0.6850

---------- EPOCH: 5 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66
Saving best model in this fold: 0.6617

---------- EPOCH: 6 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

Validation RMSE: 0.66
Saving best model in this fold: 0.6578

---------- EPOCH: 7 ----------



  0%|          | 0/142 [00:00<?, ?it/s]

In [None]:
# train_losses is a list with one {epoch: loss} dict per fold; plt.plot cannot
# convert dicts to numbers, so draw one epoch-vs-loss curve per fold instead.
for fold_idx, fold_losses in enumerate(train_losses):
    if not fold_losses:
        # Fold was never trained (e.g. the run was interrupted).
        continue
    epochs = sorted(fold_losses)
    plt.plot(epochs, [fold_losses[e] for e in epochs], label=f"Fold {fold_idx}")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.legend()
plt.title("Train")

In [None]:
# val_losses is a list with one {epoch: loss} dict per fold; plt.plot cannot
# convert dicts to numbers, so draw one epoch-vs-loss curve per fold instead.
for fold_idx, fold_losses in enumerate(val_losses):
    if not fold_losses:
        # Fold was never validated (e.g. the run was interrupted).
        continue
    epochs = sorted(fold_losses)
    plt.plot(epochs, [fold_losses[e] for e in epochs], label=f"Fold {fold_idx}")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.legend()
plt.title("Val")

In [None]:
class Config:
    """Settings for the inference half of the notebook.

    Defining this class rebinds the name ``Config``, so anything that reads
    ``Config`` afterwards sees these values rather than the training ones.
    """

    # --- input data and checkpoints ---
    FILE_NAME = '../input/commonlitreadabilityprize/test.csv'
    STATE_DIR = "./states_list"   # directory scanned for *.pt checkpoints
    MODEL_NAME = 'distilbert-base-uncased'

    # --- tokenisation / batching ---
    # NOTE(review): training above tokenized with MAX_LEN = 185; confirm the
    # longer length at inference time is intentional.
    MAX_LEN = 284
    TRAIN_BS = 12                 # batch size of the test DataLoader

    # --- shared objects ---
    # Tokenizer matching MODEL_NAME; must be defined after MODEL_NAME.
    TOKENIZER = transformers.DistilBertTokenizer.from_pretrained(
        MODEL_NAME,
        do_lower_case=True,
    )
    scaler = GradScaler()

In [None]:
@torch.no_grad()
def inference(model, states_list, test_dataloader, device=torch.device('cuda:0')):
    """
    Do inference for different model folds.

    Args:
        model: Module called as ``model(ids=..., mask=...)``; its weights are
            overwritten by each checkpoint in turn.
        states_list: Iterable of checkpoint paths (state dicts).
        test_dataloader: Iterable of batches with ``'ids'`` and ``'mask'``
            entries; must support ``len()``.
        device: Device used for the model, the batches and checkpoint loading.

    Returns:
        A list with one 1-D numpy array of predictions per checkpoint.
    """
    model.eval()
    all_preds = []
    for state in states_list:
        print(f"State: {state}")
        # map_location lets checkpoints saved on a GPU load on a CPU-only
        # host (and the other way round).
        state_dict = torch.load(state, map_location=device)
        model.load_state_dict(state_dict)
        model = model.to(device)

        # Clean
        del state_dict
        gc.collect()
        torch.cuda.empty_cache()

        preds = []
        prog = tqdm(test_dataloader, total=len(test_dataloader))
        for data in prog:
            # Use the `device` argument rather than a module-level DEVICE so
            # the batches always land on the same device as the model.
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask)
            # Drop the trailing size-1 dimension to get one value per row.
            preds.append(outputs.squeeze(-1).cpu().numpy())

        all_preds.append(np.concatenate(preds))

        # Clean
        gc.collect()
        torch.cuda.empty_cache()

    return all_preds

In [None]:
# Inference Code
if __name__ == '__main__':
    # Report and select the compute device.
    if not torch.cuda.is_available():
        print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
        DEVICE = torch.device('cpu')
    else:
        print("\n[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
        DEVICE = torch.device('cuda:0')

    # Test excerpts, served in file order so predictions align with the ids.
    test_file = pd.read_csv(Config.FILE_NAME)
    test_data = BERTDataset(test_file['excerpt'].values, is_test=True)
    test_data = DataLoader(test_data, batch_size=Config.TRAIN_BS, shuffle=False)

    # Every *.pt file in the state directory is treated as one fold's checkpoint.
    state_list = [
        os.path.join(Config.STATE_DIR, file_name)
        for file_name in os.listdir(Config.STATE_DIR)
        if file_name.endswith(".pt")
    ]
    model = DBERT_BASE_UNCASED()

    print("Doing Predictions for all folds")
    predictions = inference(model, state_list, test_data, device=DEVICE)

    # One column per fold after the transpose; average across folds per row.
    per_fold_frame = pd.DataFrame(predictions).T
    final_predictions = per_fold_frame.mean(axis=1).tolist()

In [None]:
# Form the sample submission
# Pair each test id with its fold-averaged prediction.
sub = pd.DataFrame({
    'id': test_file['id'],
    'target': final_predictions,
})

# Write the submission without the index column, then preview the first rows.
sub.to_csv("submission.csv", index=None)
sub.head()