In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import os

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class TensorBoardLogger:
    def __init__(self, log_dir):
        self.summary_writer = SummaryWriter(log_dir)

    def add_scalars(self, tag_step_value_dict):
        """
        :param parent_tag: str, e.g. "Training Loss"
        :param tag_step_value_dict: dict, e.g., {"key":(step, value), "q_grad":(10000, 1.11)}
        """
        for tag, (step, value) in tag_step_value_dict.items():
            self.summary_writer.add_scalar(tag, value, step)

# Loading Essay Score Dataset & perfom NLP Data Preprocessing

In [3]:
data_path = "data/train.csv"
df = pd.read_csv(data_path)

data_path = "data/test.csv"
df_test = pd.read_csv(data_path)

In [4]:
X = df.drop(columns=["score","essay_id"])
y = df["score"] - 1
test = df_test.drop(columns=["essay_id"])


## NLP Preprocessing

In [6]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)


cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> It had or It would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have",
    "you're": "you are",  "you've": "you have"
}
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    #x = re.sub("'\d+", '',x)
    #x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Remove \xa0
    x = x.replace(u'\xa0',' ')
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
#     x = re.sub(r'[^\w\s.,;:""''?!]', '', x)
  #replace \'s with 's
    #print(re.findall("\\'s", x))
    #x = re.sub(r"\[\]'s", "'s", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [7]:
X["full_text"] = X.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)
test["full_text"] = test.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)


In [7]:
len(X)

17307

## Create Train-Validation Pandas Dataset Split

In [8]:

seed = 10
generator = np.random.RandomState(seed)
df_size = len(X)
train_proportion = 0.8
validation_proportion = 0.2
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size 
arr_train_idxs = generator.choice(np.arange(1,len(X)),size=[train_size,],replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)
validation = X[~X.index.isin(arr_train_idxs)].reset_index(drop=True)
validation_labels = y.iloc[~y.index.isin(arr_train_idxs)].reset_index(drop=True)

# Fine Tuning DistilBERT

In [9]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc

from accelerate import Accelerator

In [10]:
config = {
    'model': 'distilbert/distilbert-base-uncased',
    'dropout': 0.2,
    'max_length': 2048,
    'batch_size': 10, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 10,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])



In [12]:
text = train["full_text"][0]

encoded =tokenizer.encode(train["full_text"][0])
len(encoded)

322

In [13]:
decoded = tokenizer.decode(encoded) 
decoded

'[CLS] the face on mars is a landformation. people would argue it isnt but keep in mind that isnt the case with this. with people argueing that it isnt they think aliens did it but that may not have happend. the face of mars has to be a landformation. the face on mars is a landformation because natural occurances happen. this could have happend fromm a meteor strike or the winds in mars. not to mention if the clouds in mars are constant then finding this twice on over a 20 year period should still be the same fro slow winds and mild weater. many people belive aliens were the cause of the face. well thats nearly impossible if you consider the aliens were made in mars. life forms take upon millions of years to grow and need about 60 degree weather on average and mars has below freezing average. plus if theree were life on mars, there should be green on mars because of plants. people also think its aliens because of the lattitude changed. well this is because the camera that had to take t

## Pytorch Datasets & Dataloaders

In [12]:
class EssayScoringDataset:
    def __init__(self, df,y, config, tokenizer=None, is_test=False):
        self.df = df
        self.y = y
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test
        
    def __getitem__(self,idx):
        essay = self.df.iloc[idx]["full_text"]
        if self.y is not None:
            score = self.y[idx]

        tokenized = tokenizer.encode_plus(essay,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }
        
        if self.is_test == True:
            return inputs
        
        targets = {
            "labels": torch.tensor(score).type(torch.LongTensor),
        }
        
        return inputs, targets
    
    def __len__(self):
        return len(self.df)

In [13]:
train_pytorch_dataset = EssayScoringDataset(train,train_labels,config,tokenizer)
validation_pytorch_dataset = EssayScoringDataset(validation,validation_labels,config,tokenizer)
test_pytorch_dataset = EssayScoringDataset(validation,None,config,tokenizer,True)


train_loader = torch.utils.data.DataLoader(train_pytorch_dataset, 
                                           batch_size=config["batch_size"])
valid_loader = torch.utils.data.DataLoader(validation_pytorch_dataset,batch_size=config["batch_size"])
test_loader = torch.utils.data.DataLoader(test_pytorch_dataset,
                                          batch_size=config["batch_size"])

dataloaders = {"train" : [train_loader,len(train_pytorch_dataset)],
               "valid" : [valid_loader,len(validation_pytorch_dataset)],
               "test" : [test_loader,len(test_pytorch_dataset)]

}

In [14]:
#Test functionality of Dataloader
demo_loader  = torch.utils.data.DataLoader(train_pytorch_dataset, batch_size=10)
batch = next(iter(demo_loader))
inputs, targets = batch

In [14]:
inputs["input_ids"].shape

torch.Size([10, 2048])

In [15]:
targets["labels"].shape

torch.Size([10, 1])

In [16]:
train_pytorch_dataset[0][0]["input_ids"][320:330]

tensor([1012,  102,    0,    0,    0,    0,    0,    0,    0,    0])

In [65]:
train_pytorch_dataset[0][0]["attention_mask"][320:330]

tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

## Model

In [15]:
class MeanPooling(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class EssayModel(nn.Module):
    def __init__(self,config,num_classes=6):
        super().__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']
        
        self.encoder = AutoModel.from_pretrained(self.model_name)

        if self.freeze:
            for param in self.encoder.parameters():
                param.requires_grad = False
                
        #self.encoder.config.sinusoidal_pos_embds = True
        #update word embeggins
        self.encoder.resize_position_embeddings(config["max_length"])
        
                
        self.pooler = MeanPooling()
        self.classifier = nn.Sequential(
            nn.Dropout(config['dropout']),
            nn.Linear(self.encoder.config.hidden_size,num_classes)
        )
        
        
    def forward(self,inputs):
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask'])
        outputs = self.classifier(outputs)
        return outputs

In [16]:
mean_pool = MeanPooling()
model = EssayModel(config)

debug initial weights embeddings : Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)


In [21]:
model.encoder.embeddings

Embeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(2048, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [27]:
model.encoder.config

DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 2048,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.40.2",
  "vocab_size": 30522
}

In [17]:
demo_loader  = torch.utils.data.DataLoader(train_pytorch_dataset, batch_size=10)
batch = next(iter(demo_loader))
inputs, targets = batch



with torch.no_grad():
    inputs = {k : v.to(device) for k,v in inputs.items()}
    targets = {k : v.to(device) for k,v in targets.items()}


    model = model.to(device)
    model.eval()
    outputs = model(inputs)
outputs

tensor([[-0.4534,  0.1980, -0.4353, -0.3093,  0.3490, -0.2494],
        [-0.3071,  0.3186, -0.4425, -0.2849,  0.3202, -0.2637],
        [-0.2170,  0.2765, -0.5082, -0.2564,  0.2611, -0.2908],
        [-0.2299,  0.2738, -0.4787, -0.2624,  0.2848, -0.3053],
        [-0.2243,  0.2747, -0.4809, -0.2583,  0.2825, -0.3033],
        [-0.4028,  0.2675, -0.4281, -0.3060,  0.3106, -0.2215],
        [-0.3590,  0.2798, -0.4298, -0.2614,  0.2992, -0.2684],
        [-0.4054,  0.2576, -0.4156, -0.2985,  0.3222, -0.2269],
        [-0.4530,  0.2027, -0.4426, -0.2947,  0.3536, -0.2563],
        [-0.1992,  0.2072, -0.3722, -0.1664,  0.3606, -0.2773]],
       device='cuda:0')

In [25]:
outputs.shape

torch.Size([10, 6])

## Training Model

In [18]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [19]:
class Trainer:
    def __init__(self, model,model_dir, loaders, config, accelerator,logger : TensorBoardLogger,debug = False):
        self.model_dir = model_dir
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.accelerator = accelerator
        self.debug = debug
        
        self.optim = self._get_optim()
        
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.logger = logger
        
        self.train_losses = []
        self.val_losses = []

        self.loss = nn.CrossEntropyLoss()
        
    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model, 
            self.optim, 
            self.train_loader, 
            self.val_loader, 
            self.scheduler
        )
        
    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(self.model.classifier.parameters(), lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer

        
    def loss_fn(self):
        pass
    
    
    def train_one_epoch(self,epoch,log_vars):

        # statistics
        running_corrects = 0.        
        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))
        
        for idx,(inputs,targets) in enumerate(progress):
            with self.accelerator.accumulate(self.model):
            
                outputs = self.model(inputs)
                _,preds = torch.max(torch.softmax(outputs,-1),-1)

                loss = self.loss(outputs, targets['labels']) 

                running_loss += loss.item()


                self.accelerator.backward(loss)
                
                self.optim.step()                
                
                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                # statistics
                running_corrects += torch.sum(preds == targets["labels"].data)
                    
                self.optim.zero_grad()

                del inputs, targets, outputs, loss

        train_loss = running_loss/len(self.train_loader)  
        epoch_acc = running_corrects.double() / dataloaders["train"][1]

        if self.debug:
            print(f'"train Loss: {train_loss:.4f} Acc: {epoch_acc:.4f}\n')

        
        log_vars.update(
            {f"Train/Loss" : (epoch,train_loss),
                                      f"Train/Accuracy" : (epoch,epoch_acc.cpu().numpy())
                    }
                )
                
        self.logger.add_scalars(log_vars)
        
        self.train_losses.append(train_loss)
        
    @torch.no_grad()
    def valid_one_epoch(self,epoch,best_acc,log_vars):
        
        running_loss = 0.
        running_corrects = 0.
        progress = tqdm(self.val_loader, total=len(self.val_loader))
        
        for (inputs, targets) in progress:
            
            outputs = self.model(inputs)
            _,preds = torch.max(torch.softmax(outputs,-1),-1)
            
            loss = self.loss(outputs, targets['labels'])
            running_loss += loss.item()

            # statistics
            running_corrects += torch.sum(preds == targets["labels"].data)
            
            del inputs, targets, outputs, loss

        val_loss = running_loss/len(self.val_loader)
        epoch_acc = running_corrects.double() / dataloaders["valid"][1]

        # deep copy the model
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(self.model.state_dict(), self.best_model_params_path)

        if self.debug:
            print(f'"Valid Loss: {val_loss:.4f} Acc: {epoch_acc:.4f}\n')

        
        log_vars.update(
            {f"Valid/Loss" : (epoch,val_loss),
                                      f"Valid/Accuracy" : (epoch,epoch_acc.cpu().numpy())
                    }
                )
                
        self.logger.add_scalars(log_vars)
            
        
        
        self.val_losses.append(val_loss)
        
    
    def test(self, test_loader):
        
        preds = []
        for (inputs) in test_loader:
            
            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())
            
        preds = torch.concat(preds)
        return preds
    
    def fit(self):

        os.makedirs(self.model_dir, exist_ok=True)
        self.best_model_params_path = os.path.join(self.model_dir, 'best_model_params.pt')
        torch.save(model.state_dict(), self.best_model_params_path)

        log_vars = {}
        
        self.prepare()
        
        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            desc="Training..."
        )

        best_acc = 0.0
        
        for epoch in fit_progress:
            if self.debug:
                print(f"Epoch : {epoch}")
            log_vars.update({
                "learning_rate" : (epoch,self.scheduler.get_last_lr()[0] if self.scheduler  else self.optim.param_groups[0]['lr'])
            }
            )
            
            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch,log_vars)
            self.clear()
            
            self.model.eval()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch,best_acc,log_vars)
            self.clear()

            #print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            #print(f"train loss: {self.train_losses[-1]}")
            #print(f"valid loss: {self.val_losses[-1]}\n\n")
            
    
    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

In [20]:
dir = os.path.abspath(os.getcwd()) + "/model/DeBERTaV3-base-FE2/"
log_dir = os.path.abspath(os.getcwd()) + "/runs/DeBERTaV3-base-FE2/"
os.makedirs(log_dir, exist_ok=True)
logger = TensorBoardLogger(log_dir)

In [21]:
trainer = Trainer(model,dir, (train_loader, valid_loader), config, accelerator,logger,True)

In [22]:
trainer.fit()

EPOCH 1 / 30 | training...:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch : 1


100%|██████████| 1385/1385 [19:04<00:00,  1.21it/s]
EPOCH 1 / 30 | validating...:   0%|          | 0/30 [19:04<?, ?it/s]

"train Loss: 1.1214 Acc: 5.4195



100%|██████████| 347/347 [04:03<00:00,  1.42it/s]
EPOCH 2 / 30 | training...:   3%|▎         | 1/30 [23:09<11:11:32, 1389.39s/it]  

"Valid Loss: 1.0830 Acc: 5.4697

Epoch : 2


100%|██████████| 1385/1385 [19:09<00:00,  1.21it/s]
EPOCH 2 / 30 | validating...:   3%|▎         | 1/30 [42:18<11:11:32, 1389.39s/it]

"train Loss: 1.0602 Acc: 5.5545



100%|██████████| 347/347 [04:07<00:00,  1.40it/s]
EPOCH 3 / 30 | training...:   7%|▋         | 2/30 [46:28<10:50:56, 1394.86s/it]  

"Valid Loss: 1.0765 Acc: 5.4899

Epoch : 3


 28%|██▊       | 392/1385 [05:36<14:13,  1.16it/s]
EPOCH 3 / 30 | training...:   7%|▋         | 2/30 [52:04<12:09:09, 1562.48s/it]


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
