# Train model

In [None]:
from azureml.core import Run, Workspace, Datastore, Experiment
from azureml.core import Dataset as DSET

import numpy as np
import pandas as pd
from transformers import  AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from torch import nn
from pathlib import Path
import gc
import argparse
import time
import os

# Seed every random number generator used below (NumPy, PyTorch CPU and all
# CUDA devices) with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Connect to the workspace described by the local config file and open an
# interactive run under the "oneshot_feedback" experiment.
ws = Workspace.from_config()
experiment = Experiment(ws, "oneshot_feedback")
run = experiment.start_logging()

# Read the training data through an Azure ML dataset, creating whatever is
# missing: first the registered "train" dataset, then the "fdbacktrain"
# datastore that backs it.
try:
    dset = DSET.get_by_name(ws, "train")
except Exception:  # the "train" dataset is not registered yet
    try:
        # Look the store up as a Datastore (not as a Dataset) so that an
        # already-registered datastore is reused instead of re-registered.
        dstore = Datastore.get(ws, "fdbacktrain")
        created_msg = "dataset created!"
    except Exception:  # the datastore is missing too: register the container
        # SECURITY: the storage account key must never live in source control.
        # It is read from the AZURE_STORAGE_KEY environment variable (a
        # KeyError here means the variable is not set). Any key that was
        # previously committed to this file should be rotated.
        dstore = Datastore.register_azure_blob_container(
            ws,
            "fdbacktrain",
            container_name="blobdata",
            account_name="fdback3",
            account_key=os.environ["AZURE_STORAGE_KEY"],
        )
        created_msg = "datastore created!\ndataset created!"
    # support_multi_line=True: field values in train.csv may contain newlines.
    dset = DSET.Tabular.from_delimited_files(
        path=(dstore, "train.csv"), support_multi_line=True
    )
    dset.register(ws, "train")
    print(created_msg)

df = dset.to_pandas_dataframe()


#********************************************
#                Configuration
#********************************************
class CONF:
    """Static configuration shared by the tokenizer, model and training code."""

    # Candidate Hugging Face checkpoints; the code below uses index 0.
    model_name = [
        "microsoft/deberta-v3-base",
        "microsoft/deberta-v3-large",
    ]
    # Run on the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # Number of regression targets produced by the output layer.
    n_classes = 6
    # Candidate maximum token lengths; the tokenizer below uses index 0.
    max_len = (512, 1024)
    # Candidate widths for the intermediate dense layer; the model uses index 0.
    inter_size = [512, 264, 128, 64]


# Tokenize every essay once, up front, and keep the result in the dataframe.
print("tokenization step...")
tokenizer = AutoTokenizer.from_pretrained(CONF.model_name[0])


def tokenize_func(text):
    """Tokenize one text, padded/truncated to CONF.max_len[0] tokens.

    Returns the tokenizer output with PyTorch tensors and without
    token-type ids.
    """
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=CONF.max_len[0],
        return_tensors="pt",
        return_token_type_ids=False,
    )


# Replace each double newline with a single "|" before tokenizing.
df["full_text"] = df["full_text"].apply(lambda essay: essay.replace("\n\n", "|"))
df["tokens"] = df["full_text"].apply(tokenize_func)

    

#********************************************
#                dataset
#********************************************
class custom_dataset(Dataset):
    """Map-style dataset over a dataframe holding pre-tokenized texts.

    Args:
        data: dataframe with a ``tokens`` column and, when ``istrain`` is
            true, the label columns spanning ``cohesion`` .. ``conventions``.
        istrain: when true, items are ``(tokens, labels)`` pairs; otherwise
            only the tokens are returned.
    """

    def __init__(self, data, istrain):
        self.x = data.tokens.values
        # Only touch the label columns when they are needed: unlabelled
        # (inference) frames do not have them and the slice would fail.
        self.y = data.loc[:, "cohesion":"conventions"].values if istrain else None
        self.istrain = istrain

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return sample ``idx``, with its labels when in training mode."""
        if self.istrain:
            return self.x[idx], self.y[idx]
        return self.x[idx]
    

#********************************************
#                dataloader
#********************************************
# Only the first 10 rows are used here, split 6/2/2.
# For the whole dataset the split would be [3500, 211, 200]; fractional
# lengths are only accepted by random_split from torch v1.13 on.
dtset = custom_dataset(df.iloc[:10], True)
train, test, val = random_split(dtset, [6, 2, 2])

# One sample per batch, loaded in the main process (no worker processes).
_loader_kwargs = {"batch_size": 1, "num_workers": 0}
trainloader = DataLoader(train, shuffle=True, **_loader_kwargs)
testloader = DataLoader(test, shuffle=False, **_loader_kwargs)
valloader = DataLoader(val, shuffle=True, **_loader_kwargs)


#********************************************
#                Model
#********************************************
# Pretrained encoder (first checkpoint listed in CONF.model_name) and the
# width of its hidden states, which sizes the pooling/regression head.
base_model = AutoModel.from_pretrained(CONF.model_name[0])
encoder_config = base_model.config
hidden_size = encoder_config.hidden_size


class MeanMaxPool(nn.Module):
    """Concatenate masked mean-pool, masked max-pool and the first-token vector.

    Args:
        size_in: width of the last dimension of the input embeddings.
    """

    def __init__(self, size_in):
        super().__init__()
        self.size_in = size_in

    def forward(self, x, mask):
        """Pool a batch of token embeddings.

        Args:
            x: embeddings indexed as (batch, position, feature); position 0 is
                treated as the CLS vector and the rest as word tokens.
            mask: attention mask indexed as (batch, position); non-zero marks
                a real token, zero marks padding. Integer and float masks are
                both accepted.

        Returns:
            Tensor of shape (batch, 3 * feature): mean-pool, max-pool and CLS
            vector concatenated along dim 1.
        """
        cls_vec = x[:, 0, :]   # first vector of each sequence (CLS)
        tokens = x[:, 1:, :]   # word positions only
        # Boolean validity mask, broadcast over the feature dimension. Using
        # booleans avoids writing the float sentinel 1e-4 into an integer
        # mask, where it truncates to 0 and disables the max-pool masking.
        valid = mask[:, 1:].unsqueeze(2).bool()

        # Mean over real tokens only; clamp so an all-padding row divides by 1
        # rather than 0.
        n_valid = valid.sum(dim=1).clamp(min=1).to(x.dtype)
        meanpool = tokens.masked_fill(~valid, 0.0).sum(dim=1) / n_valid

        # Padding gets a huge negative value so it is never selected as max.
        maxpool = tokens.masked_fill(~valid, -1e9).max(dim=1).values

        return torch.cat((meanpool, maxpool, cls_vec), dim=1)


class custom_model(nn.Module):
    """Pretrained encoder followed by mean/max/CLS pooling and a dense head.

    Uses the module-level ``base_model`` as encoder and emits
    ``CONF.n_classes`` values per input sequence.
    """

    def __init__(self):
        super().__init__()
        pooled_width = hidden_size * 3   # mean-pool + max-pool + CLS
        head_width = CONF.inter_size[0]
        self.base = base_model
        self.mean_max_pool = MeanMaxPool(hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(pooled_width, head_width)
        self.out = nn.Linear(head_width, CONF.n_classes)

    def forward(self, ids, mask):
        """Return one prediction vector per sequence.

        Args:
            ids: token ids passed to the encoder as first argument.
            mask: attention mask passed to the encoder and to the pooling layer.
        """
        # First element of the encoder output; presumably the per-token
        # hidden states (TODO confirm against the encoder's return type).
        token_states = self.base(ids, mask)[0]
        pooled = self.mean_max_pool(token_states, mask)
        hidden = self.dropout(self.fc(pooled))
        return self.out(hidden)


#********************************************
#                Training func
#********************************************

#loss functions
def MCRMSE(yhat, ytrue):
    """Mean column-wise RMSE.

    Squared errors are averaged over dim 0, square-rooted per column, and the
    resulting per-column RMSEs are averaged into a single scalar.
    """
    squared_error = (yhat - ytrue).pow(2)
    per_column_rmse = torch.sqrt(squared_error.mean(dim=0))
    return per_column_rmse.mean()


# Plain mean-squared error, tracked alongside MCRMSE.
mse = nn.MSELoss()

def train_one_epoch():
    """Run one optimisation pass over ``trainloader``.

    The model is optimised on the MCRMSE loss; the MSE is only tracked.

    Returns:
        (mean MSE, mean MCRMSE) averaged over the batches of the epoch.
    """
    mse_sum, mcr_sum = 0, 0
    for xbatch, ybatch in trainloader:
        model.train()
        # Drop the singleton dim 1 of the stored token tensors before use.
        input_ids = xbatch.input_ids.squeeze(1).to(CONF.device)
        attention = xbatch.attention_mask.squeeze(1).to(CONF.device)
        ybatch = ybatch.to(CONF.device)
        y_hat = model(input_ids, attention)
        opt.zero_grad()

        pred, target = y_hat.float(), ybatch.float()

        mse_sum += mse(pred, target).item()

        mcr_loss = MCRMSE(pred, target)
        mcr_sum += mcr_loss.item()

        mcr_loss.backward()   # the gradient step minimises MCRMSE
        opt.step()

    n_batches = len(trainloader)
    return mse_sum / n_batches, mcr_sum / n_batches

@torch.no_grad()
def val_one_epoch():
    """Evaluate the model on ``valloader`` without tracking gradients.

    Returns:
        (mean MSE, mean MCRMSE) averaged over the validation batches.
    """
    mse_sum, mcr_sum = 0, 0
    for xbatch, ybatch in valloader:
        model.eval()
        # Drop the singleton dim 1 of the stored token tensors before use.
        input_ids = xbatch.input_ids.squeeze(1).to(CONF.device)
        attention = xbatch.attention_mask.squeeze(1).to(CONF.device)
        ybatch = ybatch.to(CONF.device)
        y_hat = model(input_ids, attention)

        pred, target = y_hat.float(), ybatch.float()
        mse_sum += mse(pred, target).item()
        mcr_sum += MCRMSE(pred, target).item()

    n_batches = len(valloader)
    return mse_sum / n_batches, mcr_sum / n_batches


def Early_stopping(arr, patience):
    """Tell whether the metric rose on each of the last ``patience`` steps.

    Args:
        arr: 1-D NumPy array with the metric history, oldest value first.
        patience: number of consecutive increases that triggers a stop.

    Returns:
        True when the last ``patience`` step-to-step changes were all strict
        increases, False otherwise (including when the history is too short).
    """
    got_worse = arr[1:] > arr[:-1]
    recent_increases = int(got_worse[-patience:].sum())
    return recent_increases == patience


#********************************************
#                Training
#********************************************
# Build the network on the configured device and wrap it in DataParallel.
model = nn.DataParallel(custom_model().to(CONF.device))
opt = torch.optim.Adam(model.parameters(), lr=1e-5)
epochs = 2

# Per-epoch metric histories.
train_mcr = []
train_mse = []
val_mcr = []
val_mse = []

# Best checkpoint seen so far (lowest validation MCRMSE).
best_wt = None
best_mcr = float("inf")
best_ep = 0

os.makedirs("models", exist_ok=True)
print("Training...")
start = time.perf_counter()
for epoch in range(epochs):
    tmse, tmcr = train_one_epoch()
    vmse, vmcr = val_one_epoch()
    run.log("train mse",tmse)
    run.log("val mse",vmse)
    run.log("train mcr",tmcr)
    run.log("val mcr",vmcr)

    train_mcr.append(tmcr)
    val_mcr.append(vmcr)

    train_mse.append(tmse)
    val_mse.append(vmse)

    print(f'epoch {epoch+1}/{epochs}:  train mcrmse: {tmcr: .3f}  ===================== val mcrmse: {vmcr: .3f}')
    print(f'epoch {epoch+1}/{epochs}:  train mse:    {tmse: .3f}  ===================== val mse:    {vmse: .3f}\n')

    # "<=" keeps the latest epoch on ties.
    if vmcr <= best_mcr:
        # state_dict() returns references to the live parameters, which later
        # epochs keep updating in place; clone them to freeze a real snapshot.
        best_wt = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        best_mcr = vmcr
        best_ep = epoch + 1

    # Stop once the validation MCRMSE has risen 3 epochs in a row.
    early_stop = Early_stopping(np.array(val_mcr), 3)
    if early_stop:
        break

finish = time.perf_counter()

if best_wt is None:
    # No epoch qualified as "best" (e.g. the validation loss was NaN or no
    # epoch ran): fall back to the final weights instead of failing.
    print("no best epoch recorded; saving the final weights instead")
    best_wt = model.state_dict()
    best_mcr = val_mcr[-1] if val_mcr else float("nan")
    best_ep = len(val_mcr)

torch.save(best_wt, f'models/model_wght_{best_mcr: .3f}_ep_{best_ep}.pt')
print(f"the task took {np.round(finish-start, 0)} s")


In [None]:

# register_model() resolves model_path against the files stored with the run,
# not against the local disk, so upload the local "models" folder first and
# then reference it by its artifact name (no "./" prefix).
run.upload_folder(name="models", path="models")
run.register_model(model_name="model1", model_path="models")
 

In [None]:
# Close the interactive run that was opened with experiment.start_logging().
run.complete()