# In [None]:
#Credits to Abishek

# In [None]:
import torch
import transformers
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn import model_selection
from transformers import AdamW, get_linear_schedule_with_warmup
#for TPU
import torch_xla.core.xla_model as xm
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a linear multi-target head.

    The head returns raw logits (no sigmoid is applied here); the loss used
    in this file, ``nn.BCEWithLogitsLoss``, applies the sigmoid itself.

    Args:
        bert_path: Path or identifier passed to
            ``transformers.BertModel.from_pretrained``.
        num_targets: Number of output logits per example. Defaults to 30,
            the value that was previously hard-coded.
    """

    def __init__(self, bert_path, num_targets=30):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded config rather than a literal 768 so a
        # checkpoint with a different hidden width also works.
        self.out = nn.Linear(self.bert.config.hidden_size, num_targets)

    def forward(self, ids, mask, token_type_ids):
        """Return the logits for one batch.

        Args:
            ids: Token ids, forwarded to the encoder as its first argument.
            mask: Forwarded to the encoder as ``attention_mask``.
            token_type_ids: Forwarded to the encoder as ``token_type_ids``.

        Returns:
            Output of the linear head applied to the (dropout-regularised)
            second element of the encoder output.
        """
        # Calling the module dispatches to transformers.BertModel.forward.
        bert_outputs = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids
        )
        # Index instead of tuple-unpacking: a dict-like ModelOutput iterates
        # over its *keys*, so `_, o2 = ...` would bind a string. Integer
        # indexing works for both plain tuples and ModelOutput objects.
        pooled = bert_outputs[1]
        return self.out(self.bert_drop(pooled))

class BERTDatasetTraining:
    """Map-style dataset that tokenizes (question, answer) pairs on demand.

    Args:
        qtitle: Indexable collection of question titles.
        qbody: Indexable collection of question bodies.
        answer: Indexable collection of answers; its length is the dataset
            length.
        targets: Array indexed as ``targets[item, :]`` (a numpy array
            according to the original author's note).
        tokenizer: Object exposing ``encode_plus``; its result must provide
            "input_ids", "token_type_ids" and "attention_mask" lists.
        max_len: Passed to the tokenizer as ``max_length`` and used as the
            padded sequence length.
    """

    def __init__(self, qtitle, qbody, answer, targets, tokenizer, max_len):
        self.qtitle, self.qbody, self.answer = qtitle, qbody, answer
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # All text columns are expected to be the same length; `answer` is
        # simply the one that gets measured.
        return len(self.answer)

    def __getitem__(self, item):
        # str() guards against non-string cells in the text columns.
        title = str(self.qtitle[item])
        body = str(self.qbody[item])
        reply = str(self.answer[item])

        # First segment: title and body joined by a space; second: the answer.
        encoded = self.tokenizer.encode_plus(
            f"{title} {body}",
            reply,
            add_special_tokens=True,
            max_length=self.max_len,
        )

        # Right-pad every sequence with zeros; the pad width is derived from
        # the number of token ids.
        filler = [0] * (self.max_len - len(encoded["input_ids"]))

        def as_padded_tensor(key):
            return torch.tensor(encoded[key] + filler, dtype=torch.long)

        return {
            "ids": as_padded_tensor("input_ids"),
            "mask": as_padded_tensor("attention_mask"),
            "token_type_ids": as_padded_tensor("token_type_ids"),
            "targets": torch.tensor(self.targets[item, :], dtype=torch.float),
        }
# Training loss: binary cross-entropy computed directly on raw logits.
def loss_fn(outputs, targets):
    """Return ``nn.BCEWithLogitsLoss`` (default settings) of outputs vs. targets."""
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one pass over ``data_loader``, updating ``model`` after each batch.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with the
            keys "ids", "mask", "token_type_ids" and "targets".
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``
            keyword arguments.
        optimizer: Optimizer stepped through ``xm.optimizer_step``.
        device: Device the batch tensors are moved to.
        scheduler: Optional scheduler, stepped once per batch when given.
    """
    model.train()
    for batch_idx, batch in enumerate(data_loader):
        # Move the batch to the target device with the dtypes the model and
        # the loss expect.
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        predictions = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(predictions, targets)
        loss.backward()

        # TPU path. Per the original author's note, barrier=True is for
        # running on a single TPU node; on GPU this would be optimizer.step().
        xm.optimizer_step(optimizer, barrier=True)

        if scheduler is not None:
            scheduler.step()

        # Report progress every tenth batch.
        if batch_idx % 10 == 0:
            print(f"bi={batch_idx}, loss={loss}")
            
def eval_loop_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and collect predictions and targets.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with the
            keys "ids", "mask", "token_type_ids" and "targets".
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``
            keyword arguments.
        device: Device the model inputs are moved to.

    Returns:
        Tuple ``(outputs, targets)`` of numpy arrays, each the per-batch
        arrays stacked along the first axis with ``np.vstack``.

    Raises:
        ValueError: Raised by ``np.vstack`` when ``data_loader`` yields no
            batches.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    # Inference only: disabling autograd avoids building a graph for every
    # batch, which would otherwise cost memory for no benefit.
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            # Targets are only collected, never fed to the model, so they are
            # not sent to the device.
            targets = d["targets"].to(dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    # Each batch contributes a 2-D array; vstack concatenates them row-wise.
    return np.vstack(fin_outputs), np.vstack(fin_targets)

# The model's last layer is linear (no sigmoid). That is acceptable for
# scoring because Spearman correlation depends only on ranks, which a
# monotonic sigmoid would not change.
def _mean_spearman(outputs, targets):
    """Return the mean per-column Spearman correlation of two 2-D arrays."""
    coefs = []
    for col in range(targets.shape[1]):
        # nan_to_num turns an undefined (NaN) correlation into 0 so a single
        # column cannot make the mean NaN.
        coef, _ = np.nan_to_num(
            stats.spearmanr(list(targets[:, col]), list(outputs[:, col]))
        )
        coefs.append(coef)
    return np.mean(coefs)


def run():
    """Fine-tune the BERT model on the Google QUEST data and save its weights.

    Reads ``train.csv`` and ``sample_submission.csv`` from
    ``input/google-quest-challenge/``, holds out 10% of the rows for
    validation, trains for ``EPOCHS`` epochs on the XLA device, prints the
    mean column-wise Spearman correlation on the held-out rows after every
    epoch, and writes the current weights to ``model.bin`` after every epoch
    (each write replaces the previous one).
    """
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 4
    VALID_BATCH_SIZE = 4
    EPOCHS = 20

    dfx = pd.read_csv("input/google-quest-challenge/train.csv").fillna("none")
    df_train, df_valid = model_selection.train_test_split(
        dfx, random_state=42, test_size=0.1
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # The target columns are every sample-submission column except the id.
    sample = pd.read_csv("input/google-quest-challenge/sample_submission.csv")
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained("input/bert_base_uncased/")

    train_dataset = BERTDatasetTraining(
        qtitle=df_train.question_title.values,
        qbody=df_train.question_body.values,
        answer=df_train.answer.values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
    )

    valid_dataset = BERTDatasetTraining(
        qtitle=df_valid.question_title.values,
        qbody=df_valid.question_body.values,
        answer=df_valid.answer.values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,  # order does not affect the metric
    )

    device = xm.xla_device()  # TPU; use "cuda" for GPU
    lr = 3e-5
    # Count the batches the loader actually yields. The final partial batch
    # is kept, so len(dataset) / batch_size undercounts the optimizer steps
    # and the linear schedule would reach zero before training finishes.
    num_train_steps = len(train_data_loader) * EPOCHS
    model = BERTBaseUncased("input/bert_base_uncased").to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    for epoch in range(EPOCHS):
        train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
        # o: stacked model outputs, t: stacked targets.
        o, t = eval_loop_fn(valid_data_loader, model, device)

        spear = _mean_spearman(o, t)
        print(f"epoch = {epoch},spearman={spear}")
        # xm.save is the TPU-side save call.
        xm.save(model.state_dict(), "model.bin")
        
# Script entry point: start training only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()
    