In [5]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import os

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
class TensorBoardLogger:
    def __init__(self, log_dir):
        self.summary_writer = SummaryWriter(log_dir)

    def add_scalars(self, tag_step_value_dict):
        """
        :param parent_tag: str, e.g. "Training Loss"
        :param tag_step_value_dict: dict, e.g., {"key":(step, value), "q_grad":(10000, 1.11)}
        """
        for tag, (step, value) in tag_step_value_dict.items():
            self.summary_writer.add_scalar(tag, value, step)

# Loading Essay Score Dataset & perfom NLP Data Preprocessing

In [34]:
data_path = "data/train.csv"
df = pd.read_csv(data_path)

data_path = "data/test.csv"
df_test = pd.read_csv(data_path)

In [35]:
X = df.drop(columns=["score","essay_id"])
y = df["score"]
test = df_test.drop(columns=["essay_id"])


## NLP Preprocessing

In [36]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)


cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> It had or It would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have",
    "you're": "you are",  "you've": "you have"
}
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    #x = re.sub("'\d+", '',x)
    #x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Remove \xa0
    x = x.replace(u'\xa0',' ')
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
#     x = re.sub(r'[^\w\s.,;:""''?!]', '', x)
  #replace \'s with 's
    #print(re.findall("\\'s", x))
    #x = re.sub(r"\[\]'s", "'s", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [37]:
X["full_text"] = X.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)
test["full_text"] = test.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)


In [38]:
len(X)

17307

## Create Train-Validation Pandas Dataset Split

In [39]:

seed = 10
generator = np.random.RandomState(seed)
df_size = len(X)
train_proportion = 0.8
validation_proportion = 0.2
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size 
arr_train_idxs = generator.choice(np.arange(1,len(X)),size=[train_size,],replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)
validation = X[~X.index.isin(arr_train_idxs)].reset_index(drop=True)
validation_labels = y.iloc[~y.index.isin(arr_train_idxs)].reset_index(drop=True)

# Fine Tuning DEBERTAV3

In [40]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import gc

from accelerate import Accelerator

In [112]:
config = {
    'model': 'microsoft/deberta-v3-base',
    'dropout': 0.2,
    'max_length': 2048,
    'batch_size': 8, # anything more results in CUDA OOM [for unfreezed encoder] on Kaggle GPU
    'epochs': 7,
    'lr': 3e-4,
    'enable_scheduler': True,
    'scheduler': 'CosineAnnealingWarmRestarts',
    'gradient_accumulation_steps': 2,
    'adam_eps': 1e-6, # 1e-8 default
    'freeze_encoder': True
}

In [113]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [114]:
tokenizer.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [115]:
tokenizer.bos_token_id

50256

In [116]:
len(tokenizer.encode(train["full_text"][0]))

309

## Pytorch Datasets & Dataloaders

In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [117]:
class EssayScoringDataset:
    def __init__(self, df,y, config, tokenizer=None, is_test=False):
        self.df = df
        self.y = y
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test
        
    def __getitem__(self,idx):
        essay = self.df.iloc[idx]["full_text"]
        if self.y is not None:
            score = self.y[idx]

        tokenized = tokenizer.encode_plus(essay,
                                          None,
                                          add_special_tokens=True,
                                          max_length=self.max_len,
                                          truncation=True,
                                          padding='max_length'
                                         )
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }
        
        if self.is_test == True:
            return inputs
        
        targets = {
            "labels": torch.tensor([score], dtype=torch.float32),
        }
        
        return inputs, targets
    
    def __len__(self):
        return len(self.df)

In [54]:
test.head()

Unnamed: 0,full_text
0,many people have car where they live. the thin...
1,i am a scientist at nasa that is discussing th...
2,people always wish they had the same technolog...


In [55]:
train

Unnamed: 0,full_text
0,the face on mars is a landformation. people wo...
1,"the author of ""the challenge of exploring venu..."
2,do you like seeing different places and differ...
3,"dear senator, we should abolish the electoral ..."
4,"over the past few decades, talk regarding car ..."
...,...
13840,i say that the auther did do really good suppo...
13841,the face on mars as believed by many is not a ...
13842,when the author talks about venus he talks abo...
13843,in the story the author presented both sides f...


In [118]:
train_pytorch_dataset = EssayScoringDataset(train,train_labels,config,tokenizer)
validation_pytorch_dataset = EssayScoringDataset(validation,validation_labels,config,tokenizer)
test_pytorch_dataset = EssayScoringDataset(validation,None,config,tokenizer,True)


train_loader = torch.utils.data.DataLoader(train_pytorch_dataset, 
                                           batch_size=100)
valid_loader = torch.utils.data.DataLoader(validation_pytorch_dataset,batch_size=100)
test_loader = torch.utils.data.DataLoader(test_pytorch_dataset,
                                          batch_size=100)

dataloaders = {"train" : [train_loader,len(train_pytorch_dataset)],
               "valid" : [valid_loader,len(validation_pytorch_dataset)],
               "test" : [test_loader,len(test_pytorch_dataset)]

}

In [119]:
#Test functionality of Dataloader
demo_loader  = torch.utils.data.DataLoader(train_pytorch_dataset, batch_size=10)
batch = next(iter(demo_loader))
inputs, targets = batch

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [58]:
targets["labels"].shape

torch.Size([10, 1])

In [59]:
train_pytorch_dataset[0][0]["input_ids"][300:320]

tensor([  267,   312,  2221,   269,   266,  1311, 45887,   260,     2,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [60]:
train_pytorch_dataset[0][0]["attention_mask"][300:320]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Model

In [74]:
class MeanPooling(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class EssayModel(nn.Module):
    def __init__(self,config,num_classes=6):
        super().__init__()
        self.model_name = config['model']
        self.freeze = config['freeze_encoder']
        
        self.encoder = AutoModel.from_pretrained(self.model_name)
        if self.freeze:
            for param in self.encoder.base_model.parameters():
                param.requires_grad = False
                
        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64)
        self.fc2 = nn.Linear(64,num_classes)
        
        
    def forward(self,inputs):
        outputs = self.encoder(**inputs,return_dict=True)
        outputs = self.pooler(outputs['last_hidden_state'], inputs['attention_mask'])
        outputs = self.fc1(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [73]:
mean_pool = MeanPooling()
model = EssayModel(config)



In [84]:
demo_loader  = torch.utils.data.DataLoader(train_pytorch_dataset, batch_size=10)
batch = next(iter(demo_loader))
inputs, targets = batch



with torch.no_grad():
    inputs = {k : v.to(device) for k,v in inputs.items()}
    targets = {k : v.to(device) for k,v in targets.items()}


    model = model.to(device)
    model.eval()
    outputs = model(inputs)
outputs

tensor([[-0.3067,  0.4017, -0.2736,  0.0248, -0.6324,  0.0109],
        [-0.3765,  0.3870, -0.0940,  0.0620, -0.4668, -0.1100],
        [-0.3492,  0.4197, -0.0614, -0.0244, -0.5931, -0.1408],
        [-0.2148,  0.3868, -0.0897,  0.0594, -0.4785, -0.0222],
        [-0.2806,  0.4046, -0.0783,  0.1286, -0.4465, -0.1848],
        [-0.2197,  0.4880, -0.2339,  0.1133, -0.5067,  0.0590],
        [-0.2491,  0.3928,  0.1088,  0.0404, -0.4130, -0.1674],
        [-0.2633,  0.3346, -0.0624, -0.0514, -0.5125, -0.0701],
        [-0.2374,  0.4108, -0.0937,  0.0385, -0.5673, -0.0453],
        [-0.2724,  0.4246, -0.2430,  0.1170, -0.5886, -0.0533]],
       device='cuda:0')

In [85]:
outputs.shape

torch.Size([10, 6])

## Training Model

In [86]:
import time

In [93]:
model

EssayModel(
  (encoder): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dr

In [90]:
accelerator = Accelerator(gradient_accumulation_steps=config['gradient_accumulation_steps'])

In [109]:
class Trainer:
    def __init__(self, model,model_dir, loaders, config, accelerator,logger : TensorBoardLogger,debug = False):
        self.model_dir = model_dir
        self.model = model
        self.train_loader, self.val_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','token_type_ids','attention_mask']
        self.accelerator = accelerator
        self.debug = debug
        
        self.optim = self._get_optim()
        
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7)

        self.logger = logger
        
        self.train_losses = []
        self.val_losses = []
        
    def prepare(self):
        self.model, self.optim, self.train_loader, self.val_loader, self.scheduler = self.accelerator.prepare(
            self.model, 
            self.optim, 
            self.train_loader, 
            self.val_loader, 
            self.scheduler
        )
        
    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.config['lr'], eps=self.config['adam_eps'])
        return optimizer

        
    def loss_fn(self, outputs, targets):
        return nn.CrossEntropyLoss(outputs,targets)
    
    
    def train_one_epoch(self,epoch,log_vars):

        # statistics
        running_corrects = 0.        
        running_loss = 0.
        #progress = tqdm(self.train_loader, total=len(self.train_loader))
        
        for idx,(inputs,targets) in enumerate(self.train_loader):
            with self.accelerator.accumulate(self.model):
            
                outputs = self.model(inputs)
                preds = torch.max(torch.softmax(outputs,-1))

                loss = self.loss_fn(outputs, targets['labels'])
                running_loss += loss.item()

                self.accelerator.backward(loss)
                
                self.optim.step()                
                
                if self.config['enable_scheduler']:
                    self.scheduler.step(epoch - 1 + idx / len(self.train_loader))

                # statistics
                running_corrects += torch.sum(preds == targets["labels"].data)
                    
                self.optim.zero_grad()

                del inputs, targets, outputs, loss

        train_loss = running_loss/len(self.train_loader)  
        epoch_acc = running_corrects.double() / len(self.train_loader)

        if self.debug:
            print(f'"train Loss: {train_loss:.4f} Acc: {epoch_acc:.4f}\n')

        
        log_vars.update(
            {f"Train/Loss" : (epoch,train_loss),
                                      f"Train/Accuracy" : (epoch,epoch_acc.cpu().numpy())
                    }
                )
                
        self.logger.add_scalars(log_vars)
        
        self.train_losses.append(train_loss)
        
    @torch.no_grad()
    def valid_one_epoch(self,epoch,best_acc,log_vars):
        
        running_loss = 0.
        #progress = tqdm(self.val_loader, total=len(self.val_loader))
        
        for (inputs, targets) in self.val_loader:
            
            outputs = self.model(inputs)
            preds = torch.max(torch.softmax(outputs,-1))
            
            loss = self.loss_fn(outputs, targets['labels'])
            running_loss += loss.item()

            # statistics
            running_corrects += torch.sum(preds == targets["labels"].data)
            
            del inputs, targets, outputs, loss

        val_loss = running_loss/len(self.val_loader)
        epoch_acc = running_corrects.double() / len(self.val_loader)

        # deep copy the model
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(self.model.state_dict(), self.best_model_params_path)

        if self.debug:
            print(f'"Valid Loss: {val_loss:.4f} Acc: {epoch_acc:.4f}\n')

        
        log_vars.update(
            {f"Valid/Loss" : (epoch,val_loss),
                                      f"Valid/Accuracy" : (epoch,epoch_acc.cpu().numpy())
                    }
                )
                
        self.logger.add_scalars(log_vars)
            
        
        
        self.val_losses.append(val_loss)
        
    
    def test(self, test_loader):
        
        preds = []
        for (inputs) in test_loader:
            
            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())
            
        preds = torch.concat(preds)
        return preds
    
    def fit(self):

        os.makedirs(self.model_dir, exist_ok=True)
        self.best_model_params_path = os.path.join(self.model_dir, 'best_model_params.pt')
        torch.save(model.state_dict(), self.best_model_params_path)

        log_vars = {}
        
        self.prepare()
        
        #fit_progress = tqdm(
        #    range(1, self.config['epochs']+1),
        #    desc="Training..."
        #)

        best_acc = 0.0
        
        for epoch in range(1, self.config['epochs']+1):
            if self.debug:
                print(f"Epoch : {epoch}")
            log_vars.update({
                "learning_rate" : (epoch,self.scheduler.get_last_lr()[0] if self.scheduler  else self.optim.param_groups[0]['lr'])
            }
            )
            
            self.model.train()
            #fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch,log_vars)
            self.clear()
            
            self.model.eval()
            #fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | validating...")
            self.valid_one_epoch(epoch,best_acc,log_vars)
            self.clear()

            #print(f"{'➖️'*10} EPOCH {epoch} / {self.config['epochs']} {'➖️'*10}")
            #print(f"train loss: {self.train_losses[-1]}")
            #print(f"valid loss: {self.val_losses[-1]}\n\n")
            
    
    def clear(self):
        gc.collect()
        torch.cuda.empty_cache()

In [94]:
dir = os.path.abspath(os.getcwd()) + "/model/DeBERTaV3-base-FE/"
log_dir = os.path.abspath(os.getcwd()) + "/runs/DeBERTaV3-base-FE/"
os.makedirs(log_dir, exist_ok=True)
logger = TensorBoardLogger(log_dir)

In [110]:
trainer = Trainer(model,dir, (train_loader, valid_loader), config, accelerator,logger,True)

In [111]:
trainer.fit()

Epoch : 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.75 GiB. GPU 

In [None]:
def train_model(model, criterion, optimizer,dataloaders,weight_dir,logger : TensorBoardLogger,
        scheduler = None, num_epochs=25,debug = False):
        since = time.time()

   
        os.makedirs(weight_dir, exist_ok=True)
        best_model_params_path = os.path.join(weight_dir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0
        log_vars = {}
        

        for epoch in range(num_epochs):

            log_vars.update({
                "learning_rate" : (epoch,scheduler.get_last_lr()[0] if scheduler else optimizer.param_groups[0]['lr'])
            }
            )


            if debug:
                print(f'Epoch {epoch}/{num_epochs - 1}')
                print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'valid']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase][0]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)

                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    
                if phase == 'train' and scheduler:
                    scheduler.step()

                epoch_loss = running_loss / dataloaders[phase][1]
                epoch_acc = running_corrects.double() / dataloaders[phase][1]

                if debug:
                    print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}\n')

                # deep copy the model
                if phase == 'valid' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

                log_vars.update(
                    {f"{phase}/Loss" : (epoch,epoch_loss),
                                      f"{phase}/Accuracy" : (epoch,epoch_acc.cpu().numpy())
                    }
                )
                
                logger.add_scalars(log_vars)
        

        time_elapsed = time.time() - since
        if debug:
            print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
            print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
        return model

In [87]:
def test_model(model,logger : TensorBoardLogger):
    model.eval()

    log_vars = {}

    tot_accurate_pred = 0


    for i,(image,labels) in enumerate(dataloaders["test"][0]):
        image,labels = image.to(device),labels.to(device)

        #predictions...
        with torch.no_grad():
            predictions = model(image)
            _,predictions = torch.max(predictions,dim=-1)
            accurate_pred = torch.sum(predictions == labels.data)


        log_vars.update(
            {
                "Test/Accuracy_Batch" : (i, accurate_pred.double().cpu().numpy() / image.shape[0])
            }
        )

        logger.add_scalars(log_vars)

        tot_accurate_pred += accurate_pred.double().cpu().numpy()

    logger.add_scalars(
            {
                "Test/Accuracy_Tot" : (0, tot_accurate_pred / dataloaders["test"][1])
            }
        )

# Adapt Essay Scoring DataFrame to be compatible with Pytorch

In [None]:
%pip install transformers

In [None]:
from dataclasses import dataclass
from torch.utils.data import DataLoader, Dataset
from typing import Optional, Union, Any
from transformers import DataCollatorWithPadding


from transformers import AutoTokenizer

def define_tokenizer(cfg):
    """
    Let's use basic AutoTokenizer
    """

    tokenizer = AutoTokenizer.from_pretrained(cfg.architecture["backbone"], trust_remote_code=True)

    # Make sure that we have a pad token and and eos token that will be used for pooling
    if tokenizer.pad_token is None:
        print("Setting new pad token")
        # pad token is missig        
        tokenizer.pad_token="<|reserved_special_token_0|>"
        
    if tokenizer.eos_token is None:
        print("Setting new eos_token token")
        # eos_token token is missig
        tokenizer.eos_token="<|reserved_special_token_1|>"
    
    # Make sure that padding is always "right"
    if tokenizer.padding_side != "right":
        print(f"Changing padding side from {tokenizer.padding_side} to 'right'")
        tokenizer.padding_side = "right"
    return tokenizer