In [6]:
import pandas as pd
import utils.get_datasets as get_datasets
import utils.config as config
import models.arch as arch
import models.get_loss_optim as get_loss_optim
from sklearn.model_selection import train_test_split, KFold
import torch
from torch import nn, optim 
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import tqdm


In [2]:
# how to get files
# icustays = get_icu_files('icustays.csv.gz')
d_icd_diagnoses = get_datasets.get_hosp_files('d_icd_diagnoses.csv.gz')
icd_diagnoses = get_datasets.get_hosp_files('diagnoses_icd.csv.gz')

print(d_icd_diagnoses, icd_diagnoses)

       icd_code  icd_version  \
0          0010            9   
1          0011            9   
2          0019            9   
3          0020            9   
4          0021            9   
...         ...          ...   
109770     Z992           10   
109771     Z993           10   
109772     Z998           10   
109773    Z9981           10   
109774    Z9989           10   

                                               long_title  
0                          Cholera due to vibrio cholerae  
1                   Cholera due to vibrio cholerae el tor  
2                                    Cholera, unspecified  
3                                           Typhoid fever  
4                                     Paratyphoid fever A  
...                                                   ...  
109770                       Dependence on renal dialysis  
109771                           Dependence on wheelchair  
109772  Dependence on other enabling machines and devices  
109773         

In [3]:
# how to join dataframe
# example - inner join with icd_code (hosp_data)
new_df = pd.merge(left=icd_diagnoses, right=d_icd_diagnoses, how='inner', on='icd_code')
print(new_df)

         subject_id   hadm_id  seq_num icd_code  icd_version_x  icd_version_y  \
0          10000032  22595853        1     5723              9              9   
1          10000826  20032235        4     5723              9              9   
2          10000826  28289260        1     5723              9              9   
3          10005866  26158160        4     5723              9              9   
4          10008924  23676183        7     5723              9              9   
...             ...       ...      ...      ...            ...            ...   
4860275    19990427  29695607       24  T24011A             10             10   
4860276    19996016  28015466        4   O30093             10             10   
4860277    19996783  25894657       20  H353131             10             10   
4860278    19997062  20096107        1    K8036             10             10   
4860279    19999043  23037011       10  O359XX2             10             10   

                           

In [2]:
# get arguments ( utils / config.py)
args = config.get_arguments()
print(args)
# gpu setup
device = config.setup(args)
print(device)


{'device': 0, 'seed': 42, 'model': 'SVM', 'batch_size': 128, 'num_workers': 2, 'epoch': 200, 'num_cls': 100, 'lr': 0.01, 'momentum': 0.9, 'weight_decay': 0.0001, 'nesterov': True, 'data_path': './', 'save_path': './', 'print_freq': 10}
mps


In [None]:
class TensorData(Dataset):

    def __init__(self, x_data, y_data):
        self.x_data = torch.FloatTensor(x_data)
        self.y_data = torch.FloatTensor(y_data)
        self.len = self.y_data.shape[0]

    def __getitem__(self, index):

        return self.x_data[index], self.y_data[index] 

    def __len__(self):
        return self.len

# 데이터 전처리 이후 학습 Feature와 target Feature 설정해야함.
data = new_df['subject_id']
target = new_df['icd_code']

# train : test = 8 : 2  -> split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, shuffle=True, stratify=target, random_state=args.seed)

# if use torch models
'''
trainset = TensorData(x_train, y_train)
testset = TensorData(x_test, y_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False)
'''

In [None]:
# xgbregressor example

import xgboost as XGBRegressor

# use xgboost
model = XGBRegressor()
models = []

kfold = KFold(n_splits=args.folds, shuffle=True)

for fold, (train_idx, val_idx) in enumerate(kfold.split(x_train)):
    # use xgboost
    x_t = x_train.iloc[train_idx]
    y_t = y_train.iloc[train_idx]
    x_val = x_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    models.append(model.fit(x_t, y_t),
                  eval_set = [(x_val, y_val)],
                  early_stopping_rounds = 100,
                  verbose=100)

    # use sampler if use torchmodels
    '''
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
    
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, sampler=train_subsampler) # 해당하는 index 추출
    valloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, sampler=val_subsampler)
    '''

    # get model to models/arch.py
    '''
    model = arch.get_model(args)
    optimizer, criterion = get_loss_optim.get_optimizer(args, model)
    model.to(device)

    for epoch in range(args.epoch):
        valloss = 0
        trainloss = 0

        model.train()
        for data in tqdm.tqdm(trainloader):
            inputs, values = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, values)
            trainloss += loss
            loss.backward()
            optimizer.step() 

        model.eval()
        with torch.no_grad():
            for data in tqdm.tqdm(valloader):
                inputs, values = data
                outputs = model(inputs)
                valloss += criterion(outputs, values)

    print("k-fold", fold," Train Loss: %.4f, Validation Loss: %.4f" %(trainloss/len(trainloader), valloss/len(valloader))) 
    '''

# xgboost prediction
preds = []
for model in models:
    preds.append(model.predict(x_test))