In [None]:
!pip install torch

In [None]:
!pip install import_ipynb

In [None]:
!pip install pytorch_pretrained_bert 

In [None]:
import os
import sys
import pandas as pd
import csv
import collections
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import import_ipynb
from cvd_utils import *
from PreTrainingModel import *
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
import time
from sklearn import preprocessing
import math
import sklearn.metrics as skm


In [None]:
# Per-patient sequence tables: codes plus the matching demographic/age columns.
# Every CSV is parsed with its first row as the header.
src, sex, ethni, race, age = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("src_df.csv", "sex_df.csv", "ethni_df.csv", "race_df.csv", "age_df.csv")
)
# Lookup tables for the code tokens and the age tokens.
vocab, age_vocab = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("vocab_df.csv", "age_vocab_df.csv")
)

In [None]:
# Drop the index column that was written into each CSV ("Unnamed: 0") and
# replace missing cells with the -125 sentinel, which the later cells strip
# out again row by row.
# errors="ignore" keeps this cell re-runnable: once the column has been
# removed, a second execution is a no-op instead of raising KeyError.
src, age, sex, ethni, race = (
    frame.drop(columns=["Unnamed: 0"], errors="ignore").fillna(-125)
    for frame in (src, age, sex, ethni, race)
)

In [None]:
# The "Unnamed: 0" column of the vocabulary frame maps row index -> entry;
# invert it so that each entry looks up its row index.
vocab = {entry: row_idx for row_idx, entry in vocab.to_dict()['Unnamed: 0'].items()}

In [None]:
# 70 % of the rows go to training and 10 % to validation (both truncated to
# whole rows); whatever is left over forms the test split.
train_l, val_l = int(len(src)*0.70), int(len(src)*0.1)
test_l = len(src) - (train_l + val_l)

In [None]:
# Display the number of rows left for the test split (total minus train and validation).
test_l

In [None]:
# Output locations / names for the run.
# NOTE(review): none of these entries is read in the visible notebook cells
# (the checkpoint and log file names are hard-coded where they are written).
file_config = {
    'model_path': 'model/', # where to save model
    'model_name': 'PreTrain_CVDTransformer', # model name
    'file_name': 'log.txt',  # log path
}
# Folder creation is disabled; 'model_path' is therefore never created here.
#create_folder(file_config['model_path'])

# Data-level settings shared across the notebook.
global_params = {
    # Number of columns of the code table, i.e. the padded sequence width.
    'max_seq_len': src.shape[1],
    # NOTE(review): age_vocab is a DataFrame loaded with pd.read_csv, so
    # .keys() yields its column labels and max() picks the largest label,
    # not an age value -- confirm this is what is intended.
    'max_age': max(age_vocab.keys()),
    'month': 1,
    'age_symbol': None,
    'min_visit': 3,
    # Divisor applied to each batch loss in run_epoch/eval when it is > 1.
    'gradient_accumulation_steps': 1
}

# Optimizer hyper-parameters, passed as `config` to adam(...).
optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

# Training-loop settings.
train_params = {
    'batch_size': 32,
    # NOTE(review): not read in the visible cells; the device is selected by
    # the 'device' entry below.
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    # First CUDA device when one is available, otherwise the CPU.
    'device': "cuda:0" if torch.cuda.is_available() else "cpu",
    'data_len' : len(src),
    # Split sizes; train() uses them to turn summed losses into per-batch means.
    'train_data_len' : train_l,
    'val_data_len' : val_l,
    'test_data_len' : test_l,
    'epochs' : 50,
    # One of 'train', 'resume' or 'eval'; selects the branch of the run cell.
    'action' : 'train'
}

# Architecture settings handed to BertConfig.
model_config = {
    'vocab_size': len(vocab), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': age_vocab.shape[0], # number of vocab for age embedding (row count of age_vocab)
    'gender_vocab_size': 2,
    'ethnicity_vocab_size': 4,
    'race_vocab_size': 5,
    'max_position_embedding': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02 # parameter weight initializer range
}

In [None]:
# Cut every table at the same two row boundaries so that row i of each split
# still describes the same patient across codes, ages and demographics:
# [0, train_l) -> train, [train_l, train_l + val_l) -> validation, rest -> test.
train_code, val_code, test_code = np.split(src.values, [train_l, train_l + val_l])
train_age, val_age, test_age = np.split(age.values, [train_l, train_l + val_l])
train_gender, val_gender, test_gender = np.split(sex.values, [train_l, train_l + val_l])
train_ethni, val_ethni, test_ethni = np.split(ethni.values, [train_l, train_l + val_l])
train_race, val_race, test_race = np.split(race.values, [train_l, train_l + val_l])

# Bundle the per-split arrays under the field names used by the dataset class.
train_data = {
    "code": train_code,
    "age": train_age,
    "gender": train_gender,
    "ethnicity": train_ethni,
    "race": train_race,
}
val_data = {
    "code": val_code,
    "age": val_age,
    "gender": val_gender,
    "ethnicity": val_ethni,
    "race": val_race,
}
test_data = {
    "code": test_code,
    "age": test_age,
    "gender": test_gender,
    "ethnicity": test_ethni,
    "race": test_race,
}

In [None]:
# Remove the -125 fill value from every training row, leaving one
# variable-length array per row for each field.
for field, rows in train_data.items():
    train_data[field] = [row[row != -125] for row in rows]

In [None]:
# Remove the -125 fill value from every validation row, leaving one
# variable-length array per row for each field.
for field, rows in val_data.items():
    val_data[field] = [row[row != -125] for row in rows]

In [None]:
# Remove the -125 fill value from every test row, leaving one
# variable-length array per row for each field.
for field, rows in test_data.items():
    test_data[field] = [row[row != -125] for row in rows]

In [None]:
# Build the masked-LM model from the architecture settings and place it on
# the configured device.
conf = BertConfig(model_config)
model = BertForMaskedLM(conf).to(train_params['device'])
# The optimizer factory receives the (name, parameter) pairs of the model.
optim = adam(config=optim_param, params=list(model.named_parameters()))

In [None]:
def run_epoch(e, trainload):
    """Run one training pass over ``trainload`` and checkpoint the model.

    Args:
        e: Epoch index (kept for the caller's convenience; not used here).
        trainload: Iterable of batches; each batch must unpack into the nine
            tensors (age, code, gender, ethnicity, race, position, segment,
            attention mask, masked labels), in that order.

    Returns:
        (tr_loss, cost): the sum of the per-batch losses (each divided by the
        gradient-accumulation factor when that factor is > 1) and the elapsed
        wall-clock time in seconds, including the checkpoint save.
    """
    # eval() switches the model to evaluation mode and never switches back,
    # so training mode (dropout etc.) is re-enabled at the start of each pass.
    model.train()
    device = train_params['device']
    accum_steps = max(int(global_params['gradient_accumulation_steps']), 1)

    tr_loss = 0
    start = time.time()
    optim.zero_grad()
    grads_pending = False
    for step, batch in enumerate(trainload):
        # The model lives on `device`; feeding it tensors from another device
        # fails, so the whole batch is moved first.
        # Assumes every batch element is a tensor -- TODO confirm against CVDLoader.
        batch = tuple(t.to(device) for t in batch)
        age_ids, input_ids, gender_ids, ethni_ids, race_ids, posi_ids, segment_ids, attMask, masked_label = batch
        loss, pred, label = model(input_ids, age_ids, gender_ids, ethni_ids, race_ids, segment_ids, posi_ids,attention_mask=attMask, masked_lm_labels=masked_label)
        if accum_steps > 1:
            loss = loss / accum_steps
        loss.backward()
        tr_loss += loss.item()
        if step % 50 == 0:
            # Periodic progress read-out on the masked positions of this batch.
            print(cal_acc(label, pred))
        # Only update the weights once `accum_steps` batches have contributed
        # gradients; with the factor at 1 this is one update per batch.
        if (step + 1) % accum_steps == 0:
            optim.step()
            optim.zero_grad()
            grads_pending = False
        else:
            grads_pending = True
    if grads_pending:
        # Flush the gradients of a trailing, incomplete accumulation group.
        optim.step()
        optim.zero_grad()

    print("** ** * Saving fine - tuned model ** ** * ")
    # Unwrap a DataParallel-style wrapper so the saved keys carry no prefix.
    model_to_save = model.module if hasattr(model, 'module') else model
    save_model(model_to_save.state_dict())
    cost = time.time() - start
    return tr_loss, cost

In [None]:
def train(trainload, valload):
    """Train for ``train_params['epochs']`` epochs, validating after each one.

    After every epoch the mean train and validation losses are appended to
    ``log_fine_tune.txt`` and the log is copied to the workspace bucket.

    Args:
        trainload: Batches used for optimisation (passed to ``run_epoch``).
        valload: Batches used for validation (passed to ``eval``).

    Returns:
        (train_loss, val_loss): the per-batch mean losses of the last epoch,
        or ``(0, 0)`` when no epoch was run.
    """
    # Start every run with an empty log file.
    with open("log_fine_tune.txt", 'w') as f:
        f.write('')

    # Batch counts used to turn the summed losses into per-batch means.
    n_train_batches = math.ceil(train_params["train_data_len"] / train_params['batch_size'])
    n_val_batches = math.ceil(train_params["val_data_len"] / train_params['batch_size'])
    my_bucket = os.getenv('WORKSPACE_BUCKET')

    train_loss = val_loss = 0
    for e in range(train_params["epochs"]):
        # eval() leaves the model in evaluation mode; without this call every
        # epoch after the first would train with dropout disabled.
        model.train()
        print("Epoch n" + str(e))
        train_loss, train_time_cost = run_epoch(e, trainload)
        val_loss, val_time_cost, _, _ = eval(valload)
        train_loss = train_loss / n_train_batches
        val_loss = val_loss / n_val_batches
        print('TRAIN {}\t{} secs\n'.format(train_loss, train_time_cost))
        with open("log_fine_tune.txt", 'a') as f:
            f.write("Epoch n" + str(e) + '\n TRAIN {}\t{} secs\n'.format(train_loss, train_time_cost) + '\n\n\n')
            f.write('EVAL {}\t{} secs\n'.format(val_loss, val_time_cost) + '\n\n\n')
        # Mirror the log to the bucket after each epoch so progress survives
        # a lost session.
        os.system(f"gsutil cp './log_fine_tune.txt' '{my_bucket}/data/'")
    # Report the real validation loss instead of a constant placeholder.
    return train_loss, val_loss

In [None]:
def eval(_valload):
    """Compute the summed loss of the model over ``_valload``.

    Note: this function shadows the built-in ``eval`` within the notebook.
    The model is switched to evaluation mode and is left in that mode.

    Args:
        _valload: Iterable of batches with the same nine-tensor layout that
            ``run_epoch`` consumes.

    Returns:
        (tr_loss, cost, pred, label): the sum of the per-batch losses (each
        divided by the gradient-accumulation factor when it is > 1, matching
        the scaling of the training loss), the elapsed seconds, and the
        predictions and labels of the LAST batch only (``None`` for an empty
        loader).
    """
    device = train_params['device']
    accum_steps = global_params['gradient_accumulation_steps']

    tr_loss = 0
    # Defined up front so an empty loader returns None instead of failing.
    pred = label = None
    start = time.time()
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch in _valload:
            # Inputs must sit on the same device as the model.
            # Assumes every batch element is a tensor -- TODO confirm against CVDLoader.
            batch = tuple(t.to(device) for t in batch)
            age_ids, input_ids, gender_ids, ethni_ids, race_ids, posi_ids, segment_ids, attMask, masked_label = batch
            loss, pred, label = model(input_ids, age_ids, gender_ids, ethni_ids, race_ids, segment_ids, posi_ids,attention_mask=attMask, masked_lm_labels=masked_label)
            if accum_steps > 1:
                loss = loss / accum_steps
            tr_loss += loss.item()

    cost = time.time() - start
    return tr_loss, cost, pred, label


In [None]:
def save_model(_model_dict):
    """Save a state dict locally and copy it to the workspace bucket.

    The checkpoint is written to ``./PreTrain_CVDTransformer`` and then copied
    with ``gsutil`` to ``<WORKSPACE_BUCKET>/data/``.

    Args:
        _model_dict: Object to serialise with ``torch.save`` (the callers pass
            a model ``state_dict()``).
    """
    torch.save(_model_dict, "PreTrain_CVDTransformer")
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    status = os.system(f"gsutil cp './PreTrain_CVDTransformer' '{my_bucket}/data/'")
    # Only claim success when the copy command actually succeeded; a failed
    # upload (e.g. WORKSPACE_BUCKET unset) must not go unnoticed.
    if status == 0:
        print('[INFO] model_dict is successfully uploaded in your bucket.')
    else:
        print(f'[WARN] uploading model_dict failed (exit status {status}); the checkpoint was only saved locally.')

In [None]:
def cal_acc(label, pred):
    """Return the accuracy of ``pred`` on the positions that carry a label.

    Positions whose label is ``-1`` are ignored. For the remaining rows the
    predicted class is the arg-max of the scores. The result is the fraction
    of correct predictions, which is exactly what micro-averaged precision
    yields for single-label multi-class predictions.

    The arg-max is taken on the raw scores: log-softmax is monotonic within a
    row, so applying it first cannot change which class wins.

    Args:
        label: 1-D tensor of target class ids, ``-1`` marking ignored slots.
        pred: Tensor of per-class scores with one row per entry of ``label``
            (expected shape ``(len(label), n_classes)``).

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when no position is labelled.
    """
    label_np = label.detach().cpu().numpy()
    keep = np.where(label_np != -1)[0]
    if keep.size == 0:
        # Nothing to score in this batch.
        return 0.0
    scores = pred.detach().cpu().numpy()[keep]
    predicted = scores.argmax(axis=-1)
    return float(np.mean(predicted == label_np[keep]))

In [None]:
# 'resume' and 'eval' both start from the checkpoint stored in the bucket.
if train_params['action'] in ('resume', 'eval'):
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    name_of_file_in_bucket = "PreTrain_CVDTransformer"
    if os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .") != 0:
        # Keep going (a local copy may exist) but make the failure visible.
        print(f'[WARN] could not download {name_of_file_in_bucket} from the bucket; falling back to a local copy if present.')
    model.load_state_dict(torch.load("PreTrain_CVDTransformer", map_location=train_params['device']))

if train_params['action'] in ('train', 'resume'):
    TrainDset = CVDLoader(train_data, vocab, max_len=train_params['max_len_seq'], code='code')
    trainload = torch.utils.data.DataLoader(dataset=TrainDset, batch_size=train_params['batch_size'], shuffle=True, num_workers=3)
    ValDset = CVDLoader(val_data, vocab, max_len=train_params['max_len_seq'], code='code')
    # Validation needs no shuffling; a fixed order keeps the run reproducible.
    valload = torch.utils.data.DataLoader(dataset=ValDset, batch_size=train_params['batch_size'], shuffle=False, num_workers=1)
    train_loss, val_loss = train(trainload, valload)
elif train_params['action'] == 'eval':
    TestDset = CVDLoader(test_data, vocab, max_len=train_params['max_len_seq'], code='code')
    # eval() returns the predictions of the last batch only, so the order must
    # be fixed for that result to be reproducible; the batch size follows the
    # shared setting instead of a separate literal.
    testload = torch.utils.data.DataLoader(dataset=TestDset, batch_size=train_params['batch_size'], shuffle=False, num_workers=3)
    loss, cost, pred, label = eval(testload)

In [None]:
# Optional (disabled): fetch the checkpoint from the bucket and load it
# before evaluating.
#my_bucket = os.getenv('WORKSPACE_BUCKET')
#name_of_file_in_bucket = "PreTrain_CVDTransformer"
#os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
#model.load_state_dict(torch.load("model_dict", map_location=train_params['device']))

# Evaluate the current model on the held-out test split, in a fixed order.
TestDset = CVDLoader(test_data, vocab, max_len=train_params['max_len_seq'], code='code')
testload = torch.utils.data.DataLoader(
    dataset=TestDset,
    batch_size=32,
    shuffle=False,
    num_workers=3,
)
loss, cost, pred, label = eval(testload)