In [1]:
import numpy as np
import pandas as pd
import os
import shutil

from tqdm.auto import tqdm
import itertools
import random
import pickle

from fastNLP import DataSet
from missingprocessor import Processor

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def cohort_selection(df):
    """
    (1) CANCER_PT_BSNF : BSPT_FRST_DIAG_YMD
    (2) CANCER_PT_BSNF : (BSPT_FRST_OPRT_YMD, BSPT_FRST_TRTM_STRT_YMD)
    0 <= Diff = MIN((2)) - (1) <= 3 months
    """
    selected_cols = ['PT_SBST_NO', 'BSPT_FRST_DIAG_YMD', 'BSPT_FRST_OPRT_YMD', 'BSPT_FRST_ANCN_TRTM_STRT_YMD','BSPT_FRST_RDT_STRT_YMD']
    df = df[selected_cols].copy()
    df['BSPT_FRST_MIN_YMD'] = df.loc[:, selected_cols[2:4]].min(axis=1)
    #df = df.replace(r'\N', np.nan)
    df[df.columns[1:]] = df[df.columns[1:]].apply(lambda x : pd.to_datetime(x, format='%Y%m%d'))

    df['BSPT_FRST_DIFF'] = (df['BSPT_FRST_MIN_YMD'] - df['BSPT_FRST_DIAG_YMD']).dt.days
    df = df[(df['BSPT_FRST_DIFF']>= 0) & (df['BSPT_FRST_DIFF'] <= 90)]

    pt_key_id = sorted(df['PT_SBST_NO'].unique())

    return pt_key_id

In [5]:
patient_basicinfo_path = '/home/tohone06/synthetic_cancer_patients/data/CLRC/clrc_pt_bsnf.csv'
df_pt_bsnf = pd.read_csv(patient_basicinfo_path, na_values='\\N')

pt_key_id = cohort_selection(df_pt_bsnf)

In [6]:
selected_cols = ['PT_SBST_NO', 'BSPT_IDGN_AGE', 'BSPT_SEX_CD', 'BSPT_FRST_DIAG_CD', 'BSPT_FRST_DIAG_YMD','BSPT_DEAD_YMD']
df_pt_bsnf = df_pt_bsnf[
    df_pt_bsnf['PT_SBST_NO'].isin(pt_key_id)][selected_cols]

df_pt_bsnf['BSPT_SEX_CD'] = df_pt_bsnf['BSPT_SEX_CD'].replace({'F': 0, 'M': 1})

In [7]:
diag_cd = sorted(df_pt_bsnf['BSPT_FRST_DIAG_CD'].unique())
diag_cd = {cd: i for i, cd in enumerate(diag_cd)}
df_pt_bsnf['BSPT_FRST_DIAG_CD'] = df_pt_bsnf['BSPT_FRST_DIAG_CD'].replace(diag_cd)

df_pt_bsnf['BSPT_DEAD'] = df_pt_bsnf['BSPT_DEAD_YMD'].notnull().astype(np.int32)

In [8]:
df_ex_diag1_raw_path = '/home/tohone06/synthetic_cancer_patients/data/CLRC/clrc_ex_diag1.csv'
df_ex_diag1_raw = pd.read_csv(df_ex_diag1_raw_path, encoding='cp949').replace(r'\N', np.nan)

df_ex_diag2_raw_path = '/home/tohone06/synthetic_cancer_patients/data/CLRC/clrc_ex_diag2.csv'
df_ex_diag2_raw = pd.read_csv(df_ex_diag2_raw_path, encoding='cp949').replace(r'\N', np.nan)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
df_ex_diag1 = df_ex_diag1_raw[df_ex_diag1_raw['PT_SBST_NO'].isin(pt_key_id)]
df_ex_diag1 = df_ex_diag1[['PT_SBST_NO', 'CEXM_YMD', 'CEXM_NM', 'CEXM_RSLT_CONT', 'CEXM_RSLT_UNIT_CONT']]
df_ex_diag2 = df_ex_diag2_raw[df_ex_diag2_raw['PT_SBST_NO'].isin(pt_key_id)]
df_ex_diag2 = df_ex_diag2[['PT_SBST_NO', 'CEXM_YMD', 'CEXM_NM', 'CEXM_RSLT_CONT', 'CEXM_RSLT_UNIT_CONT']]
df_ex_diag = pd.concat([df_ex_diag1, df_ex_diag2], axis=0, ignore_index=True).sort_values(
    by=['PT_SBST_NO', 'CEXM_YMD', 'CEXM_NM']).reset_index(drop=True)

In [10]:
var_list = [
    'ALP',
    'ALT',
    'AST',
    'Albumin',
    'BUN',
    'Bilirubin, Total',
    'CA 19-9',
    'CEA',
    'CRP, Quantitative (High Sensitivity)',
    'ESR (Erythrocyte Sedimentation Rate)',
    'Protein, Total',
]

In [11]:
exclusion = ['Anti-HBs Antibody', 'Anti-HCV Antibody', 'Anti-HIV combo', 'HBsAg']
df_ex_diag = df_ex_diag[~df_ex_diag['CEXM_NM'].isin(exclusion)]

df_ex_diag = pd.merge(df_ex_diag, df_pt_bsnf[['PT_SBST_NO', 'BSPT_FRST_DIAG_YMD']],
                      how='left', on='PT_SBST_NO')
df_ex_diag[['CEXM_YMD', 'BSPT_FRST_DIAG_YMD']] = df_ex_diag[['CEXM_YMD', 'BSPT_FRST_DIAG_YMD']].apply(
    lambda x: pd.to_datetime(x, format='%Y%m%d'))

df_ex_diag['TIMESTAMP'] = (df_ex_diag['CEXM_YMD'] - df_ex_diag['BSPT_FRST_DIAG_YMD']).dt.days
df_ex_diag = df_ex_diag[(df_ex_diag['TIMESTAMP']/365 >= 0) & (df_ex_diag['TIMESTAMP']/365 <= 5)]
#df_pt_bsnf = df_pt_bsnf[df_pt_bsnf['PT_SBST_NO'].isin(df_ex_diag['PT_SBST_NO'].unique())]
df_ex_diag['CEXM_RSLT_CONT'] = df_ex_diag['CEXM_RSLT_CONT'].astype(np.float32)
cols_ex_diag = ['PT_SBST_NO', 'CEXM_NM', 'CEXM_RSLT_CONT', 'TIMESTAMP']
df_ex_diag = df_ex_diag[cols_ex_diag]

In [12]:
os.makedirs('./data', exist_ok = True)

data_collector = []
label_collector = []
#var_collector = []

for k, g in tqdm(df_ex_diag.groupby('PT_SBST_NO')):
    to_physionet_style = []
    
    g = g.pivot_table(index='TIMESTAMP', 
                      columns='CEXM_NM', 
                      values='CEXM_RSLT_CONT', 
                      aggfunc='mean').reset_index(drop=False)

    data_collector.append(g)
    
    age_sex_diag = df_pt_bsnf[df_pt_bsnf['PT_SBST_NO'] == k][['BSPT_IDGN_AGE', 'BSPT_SEX_CD', 'BSPT_FRST_DIAG_CD']].to_numpy()[0]
    label = df_pt_bsnf.loc[df_pt_bsnf['PT_SBST_NO']==k, 'BSPT_DEAD'].values.item()
    label_collector.append([age_sex_diag[0], age_sex_diag[1], age_sex_diag[2], label])   

100%|██████████| 15827/15827 [01:27<00:00, 180.67it/s]


In [13]:
random.seed(42) ## seed works only in same cell
test_idx = sorted(random.sample(range(len(label_collector)), int(len(label_collector)*0.2)))

In [14]:
train_data = [data_collector[idx] for idx in range(0, len(label_collector)) if idx not in test_idx]
test_data = [data_collector[idx] for idx in test_idx]

In [27]:
len(train_data)

12662

In [25]:
test_data

[CEXM_NM  TIMESTAMP   ALP   ALT   AST  Albumin   BUN  Bilirubin, Total  \
 0                6  75.0  14.0  17.0      4.4  12.8               0.6   
 1              147  63.0  16.0  22.0      3.7  11.5               0.2   
 2              161  61.0  15.0  23.0      3.8   9.7               0.2   
 3              174  54.0  12.0  21.0      3.7   8.1               0.2   
 4              189  59.0  13.0  24.0      4.0   8.4               0.2   
 5              203  56.0  11.0  21.0      3.8   7.2               0.2   
 6              217  58.0  12.0  25.0      3.9   8.5               0.2   
 7              234  68.0  17.0  29.0      4.1   9.4               0.3   
 8              251  65.0  16.0  24.0      3.9  10.3               0.2   
 9              269  59.0  12.0  21.0      3.9  11.2               0.2   
 10             291  59.0  16.0  21.0      3.7  13.1               0.3   
 11             305  63.0  14.0  20.0      3.8  10.0               0.3   
 12             319  62.0  14.0  19.0 

In [15]:
sta = pd.DataFrame(label_collector, columns=['age', 'sex', 'diag_code', 'result'])
sta["seq_len"] = np.array([len(x) for x in data_collector])

train_sta = sta.iloc[~sta.index.isin(test_idx)].reset_index(drop=True)
test_sta = sta.iloc[test_idx].reset_index(drop=True)

In [16]:
dyn_train = pd.concat(train_data)

In [17]:
dyn_types = ['continuos'] * len(dyn_train.columns)
sta_types = ['int', 'binary', 'categorical','binary', 'int']

In [18]:
d_P = Processor(dyn_types, use_pri='TIMESTAMP')
s_P = Processor(sta_types)
d_P.fit(dyn_train)
s_P.fit(train_sta)

TIMESTAMP 1 None continuos None
ALP 1 0.10157173128081907 continuos None
ALT 1 0.08861369472370931 continuos None
AST 1 0.08864339985081425 continuos None
Albumin 1 0.09716877132993154 continuos None
BUN 1 0.08447147977740958 continuos None
Bilirubin, Total 1 0.09487157483381632 continuos None
CA 19-9 1 0.6403072170256586 continuos None
CEA 1 0.5755830456336763 continuos None
CRP, Quantitative (High Sensitivity) 1 0.737393474113632 continuos None
ESR (Erythrocyte Sedimentation Rate) 1 0.8494313118444243 continuos None
Protein, Total 1 0.10958551445979577 continuos None
age 1 None int None
[95.] [15.]
sex 1 None binary None
diag_code 3 None categorical None
[0 1 2]
result 1 None binary None
seq_len 1 None int None
[192.] [1.]


In [None]:
#space : int =1, binary = 1 (0,1), categorical = len(diag_code) = 12, binary = 1, int = 1
#total 16 = s_p.shape[1] (4946, 16)

In [19]:
def build_dataset(sta, dyn, seq_len):
    d_lis=[d_P.transform(ds) for ds in dyn] #dataframe to array len=6
    d = [x[0].tolist() for x in d_lis]
    lag = [x[1].tolist() for x in d_lis]
    mask = [x[2].tolist() for x in d_lis]
    times = [x[-1].tolist() for x in d_lis] 
    priv = [x[3].tolist() for x in d_lis]
    nex = [x[4].tolist() for x in d_lis]
    
    s = s_P.transform(sta)
    label = [float(x[-2]) for x in s] #-1=seq_len, -2=result(death=1)
    
    dataset = DataSet({"seq_len": seq_len, 
                       "dyn": d, "lag":lag, "mask": mask,
                       "sta": s, "times":times, "priv":priv, "nex":nex, "label": label
                      })
    return dataset

In [20]:
train_set = build_dataset(train_sta, train_data, train_sta['seq_len'].tolist())
test_set = build_dataset(test_sta, test_data, test_sta['seq_len'].tolist())

In [23]:
train_set

+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| seq_len | dyn          | lag          | mask         | sta          | times        | priv         | nex          | label |
+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| 32      | [[0.0, 0.... | [[0.00657... | [[0.0, 0.... | [0.625   ... | [[0.00657... | [[0.0, 0.... | [[0.0, 0.... | 1.0   |
| 11      | [[0.01640... | [[0.00602... | [[1.0, 1.... | [0.575   ... | [[0.00602... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 20      | [[0.01051... | [[0.00657... | [[1.0, 1.... | [0.575   ... | [[0.00657... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 14      | [[0.00716... | [[0.00931... | [[1.0, 1.... | [0.4125  ... | [[0.00931... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 16      | [[0.01051... | [[0.01260... | [[1.0, 1.... | [0.4875  ... | [[0.01260... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |


In [None]:
finaldic = {
    "train_set": train_set,
    'raw_set': (train_sta, train_data),
    'test_set': (test_sta, test_data),
    'val_set': test_set,
    "dynamic_processor": d_P,
    "static_processor":s_P
}

with open("./connect_clrc.pkl", "wb") as f:
    pickle.dump(finaldic, f)

In [None]:
dataset = pickle.load(open('./connect_clrc.pkl', "rb"))

In [None]:
dataset