In [86]:
import pandas as pd
import numpy as np

In [87]:
# Directory holding the input CSV files. It is joined to the file names below
# by plain string concatenation, so the trailing slash is required.
DATA_PATH = "../data/"
# Prescription records, read by process_med().
MED_FILE = "PRESCRIPTIONS.csv"
# Diagnosis codes per admission, read by process_diag().
DIAGS_FILE = "DIAGNOSES_ICD.csv"
# NDC -> ATC class lookup table, read by process_ndc2atc4().
# NOTE(review): "ADC" in this name looks like a typo for "ATC" -- confirm before renaming.
NDC_TO_ADC_FILE = "package_NDC_ATC4_classes.csv"
# Procedure codes per admission, read by process_pro().
PROCEDURE_FILE = "PROCEDURES_ICD.csv"

In [88]:
def process_med():
    """Load the prescriptions file and reduce it to distinct (SUBJECT_ID, HADM_ID, NDC) rows.

    Steps:
      1. Read SUBJECT_ID, HADM_ID, ICUSTAY_ID, STARTDATE and NDC from
         DATA_PATH + MED_FILE.
      2. Drop rows whose NDC is the literal string '0'.
      3. Forward-fill missing values from the preceding row, then drop any
         row that still contains a missing value.
      4. For every (SUBJECT_ID, HADM_ID, ICUSTAY_ID) group keep only the rows
         whose STARTDATE equals the earliest STARTDATE of that group.
      5. Drop STARTDATE and ICUSTAY_ID and de-duplicate.

    Returns:
        pandas.DataFrame with columns SUBJECT_ID, HADM_ID and NDC and a fresh
        0..n-1 index.
    """
    columns = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "STARTDATE", "NDC"]
    column_types = {
        "SUBJECT_ID": 'Int64',
        "HADM_ID": 'Int64',
        "ICUSTAY_ID": 'Int64',
        "NDC": 'str',
    }
    med_df = pd.read_csv(DATA_PATH+MED_FILE, usecols=columns, dtype=column_types, parse_dates=["STARTDATE"])

    med_df = med_df[med_df['NDC'] != '0']
    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method='pad'); assigning the result also avoids mutating a
    # filtered slice in place.
    med_df = med_df.ffill()
    med_df = med_df.dropna()
    med_df = med_df.drop_duplicates()
    med_df = med_df.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTDATE'])
    med_df = med_df.reset_index(drop=True)

    def keep_first_startdate(sorted_med_df):
        """Keep the rows that share the earliest STARTDATE of their ICU stay."""
        stay_keys = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID']
        # The frame is sorted by STARTDATE within each stay, so head(1) yields
        # the earliest start date of every stay.
        first_start = (
            sorted_med_df
            .drop(columns=['NDC'])
            .groupby(by=stay_keys)
            .head(1)
            .reset_index(drop=True)
        )
        # Joining back on the keys plus STARTDATE recovers every drug that was
        # started on that first date.
        first_day = pd.merge(first_start, sorted_med_df, on=stay_keys + ['STARTDATE'])
        return first_day.drop(columns=['STARTDATE'])

    med_df = keep_first_startdate(med_df)
    med_df = med_df.drop(columns=['ICUSTAY_ID'])
    med_df = med_df.drop_duplicates()

    return med_df.reset_index(drop=True)

In [89]:
def process_ndc2atc4():
    """Load the NDC -> ATC class table and normalise NDC to an 11-digit code.

    The hyphen-separated NDC is split into three segments, which are
    left-padded with zeros to widths 5, 4 and 2 and concatenated without
    separators. Reference for the 10 -> 11 digit conversion:
        https://www.michigan.gov/-/media/Project/Websites/lara/healthsystemslicensing/Folder4/lara_MAPS_NDC_Guidelines.pdf?rev=4cb0e4c8d98946659f47dce703dabc22

    Returns:
        pandas.DataFrame with the two columns ['NDC', 'ATC_class'].
    """
    wanted_columns = ["NDC", "ATC_class"]
    ndc2atc_df = pd.read_csv(
        DATA_PATH+NDC_TO_ADC_FILE,
        usecols=wanted_columns,
        dtype={"NDC": 'str', "ATC_class": 'str'},
    )

    # Target width of each hyphen-separated NDC segment, in order.
    segment_widths = {'Seg1': 5, 'Seg2': 4, 'Seg3': 2}
    segment_names = list(segment_widths)

    ndc2atc_df[segment_names] = ndc2atc_df['NDC'].str.split('-', expand=True)
    for segment, width in segment_widths.items():
        ndc2atc_df[segment] = ndc2atc_df[segment].str.pad(width=width, side="left", fillchar='0')

    ndc2atc_df['NDC'] = ndc2atc_df[segment_names].agg(''.join, axis=1)
    ndc2atc_df = ndc2atc_df.drop_duplicates()
    return ndc2atc_df.filter(items=wanted_columns)

In [90]:
def ndc2atc4(med_pd, ndc2rxnorm_path='../data/ndc2rxnorm_mapping.txt',
             rxnorm2atc_path='../data/ndc2atc_level4.csv'):
    """Translate the NDC column of a prescriptions frame into ATC classes.

    The translation goes NDC -> RXCUI (via the mapping file) -> ATC4 (via the
    CSV table). Rows whose NDC has no mapping, or maps to an empty string, are
    dropped, as are rows whose RXCUI is absent from the CSV table (inner join).

    Args:
        med_pd: DataFrame containing an 'NDC' column. It is not modified; the
            function works on a copy.
        ndc2rxnorm_path: text file holding the NDC -> RXCUI mapping.
            Presumably a Python dict literal -- it is parsed with
            ast.literal_eval, so only literals are accepted.
        rxnorm2atc_path: CSV with at least the columns YEAR, MONTH, NDC,
            RXCUI and ATC4.

    Returns:
        DataFrame with the columns of ``med_pd`` minus 'NDC', plus
        'ATC_class' (the first five characters of ATC4), de-duplicated and
        with a fresh 0..n-1 index.
    """
    import ast

    with open(ndc2rxnorm_path, 'r') as f:
        # literal_eval only accepts Python literals, so a tampered mapping
        # file cannot execute arbitrary code the way eval() would.
        ndc2rxnorm = ast.literal_eval(f.read())

    # Work on a copy so the caller's frame is left untouched.
    med_pd = med_pd.copy()
    med_pd['RXCUI'] = med_pd['NDC'].map(ndc2rxnorm)
    med_pd = med_pd.dropna()
    # Boolean mask instead of drop-by-index-label: correct even when the
    # incoming index contains duplicate labels.
    med_pd = med_pd[~med_pd['RXCUI'].isin([''])]
    med_pd['RXCUI'] = med_pd['RXCUI'].astype('int64')
    med_pd = med_pd.reset_index(drop=True)

    rxnorm2atc = pd.read_csv(rxnorm2atc_path)
    rxnorm2atc = rxnorm2atc.drop(columns=['YEAR', 'MONTH', 'NDC'])
    # Keep a single ATC4 entry per RXCUI (the first one in file order).
    rxnorm2atc = rxnorm2atc.drop_duplicates(subset=['RXCUI'])

    med_pd = med_pd.merge(rxnorm2atc, on=['RXCUI'])
    med_pd = med_pd.drop(columns=['NDC', 'RXCUI'])

    med_pd['ATC4'] = med_pd['ATC4'].str[:5]
    med_pd = med_pd.drop_duplicates().reset_index(drop=True)
    return med_pd.rename(columns={'ATC4': 'ATC_class'})

In [91]:
# Build the cleaned prescriptions table and the NDC -> ATC class lookup that
# the exploratory cells below inspect.
med_df = process_med()
ndc2atc_df = process_ndc2atc4()


  med_df.fillna(method='pad', inplace=True)


In [92]:
# Size of the NDC -> ATC lookup as (rows, columns).
ndc2atc_df.shape

(337186, 2)

In [93]:
# Export the distinct NDC codes, one per line, to DATA_PATH/input.txt.
# NOTE(review): presumably this file is the input for building the NDC mapping
# ("create map") -- confirm which tool consumes it.
# The earlier bare `med_df['NDC'].unique()` statement was removed: it was not
# the last expression of the cell, so its result was computed and discarded.
np.savetxt(str(DATA_PATH)+"input.txt", med_df['NDC'].unique(), fmt='%s')


In [94]:
# Column dtypes of the cleaned prescriptions table.
med_df.dtypes

SUBJECT_ID     Int64
HADM_ID        Int64
NDC           object
dtype: object

In [95]:
# Distinct NDC codes that remain after cleaning.
med_df['NDC'].unique()

array(['63323017302', '63323038810', '00088222033', ..., '00904125061',
       '11980002205', '00075800180'], dtype=object)

In [96]:
# Left-join the ATC lookup onto the prescriptions by NDC, remove exact
# duplicate rows and show the resulting (rows, columns).
test = (
    med_df
    .merge(ndc2atc_df, how='left', left_on='NDC', right_on='NDC')
    .drop_duplicates()
)
test.shape

(1898277, 4)

In [97]:
def process_diag():
    """Load the diagnoses file as de-duplicated rows sorted by patient and admission.

    Rows containing any missing value are dropped first; SEQ_NUM and ROW_ID
    are then removed and exact duplicate rows collapsed.

    Returns:
        pandas.DataFrame sorted by SUBJECT_ID, HADM_ID with a fresh 0..n-1
        index. ICD9_CODE is always a string column.
    """
    # ICD9 codes are identifiers, not numbers. Without an explicit dtype,
    # read_csv may parse all-digit codes as integers, which strips leading
    # zeros and can leave a mixed int/str column.
    diag_df = pd.read_csv(DATA_PATH+DIAGS_FILE, dtype={'ICD9_CODE': str})
    diag_df = diag_df.dropna()
    diag_df = diag_df.drop(columns=['SEQ_NUM','ROW_ID'])
    diag_df = diag_df.drop_duplicates()
    return diag_df.sort_values(by=['SUBJECT_ID', 'HADM_ID']).reset_index(drop=True)

In [98]:
# num defaults to 129 rather than 128 so that the resulting shapes match the
# output of their code; our filtering method differs from theirs.

def filter_diag(diag_df, num=129):
    """Keep only the rows whose ICD9_CODE is among the ``num`` most frequent codes.

    Args:
        diag_df: DataFrame with an 'ICD9_CODE' column.
        num: how many of the most frequent codes to retain.

    Returns:
        The matching rows of ``diag_df`` with a fresh 0..n-1 index.
    """
    code_frequency = diag_df['ICD9_CODE'].value_counts()
    # value_counts() is ordered from most to least frequent, so the first
    # ``num`` index entries are the codes to keep.
    retained_codes = code_frequency.head(num).index
    row_is_retained = diag_df['ICD9_CODE'].isin(retained_codes)
    return diag_df[row_is_retained].reset_index(drop=True)

In [99]:
# Load the diagnoses table for inspection.
diag_df = process_diag()
# print(diag_df.shape)

# Keep only the rows whose ICD9 code is among the most frequent ones
# (filter_diag's default ``num``).
diag_df = filter_diag(diag_df)
# print(diag_df.shape)

In [100]:
def filter_by_visit(med_df,single_visit):
    """Keep the rows of patients with exactly one admission, or with several.

    Args:
        med_df: DataFrame with 'SUBJECT_ID' and 'HADM_ID' columns.
        single_visit: when truthy keep patients that have exactly one distinct
            HADM_ID; otherwise keep patients that have more than one.

    Returns:
        The rows of ``med_df`` belonging to the selected patients, with a
        fresh 0..n-1 index.
    """
    # Number of distinct admissions recorded for each patient.
    admissions_per_subject = (
        med_df[['SUBJECT_ID', 'HADM_ID']]
        .groupby(['SUBJECT_ID'])['HADM_ID']
        .unique()
        .apply(len)
    )

    if single_visit:
        selected = admissions_per_subject[admissions_per_subject == 1]
    else:
        selected = admissions_per_subject[admissions_per_subject > 1]

    belongs_to_selected = med_df['SUBJECT_ID'].isin(selected.index.unique())
    return med_df[belongs_to_selected].reset_index(drop=True)

def filter_patients(df):
    """Drop every patient that has at least one sparse visit.

    A visit (row) is sparse when it lists fewer than two ICD9 codes AND fewer
    than two ATC classes. If any row of a patient is sparse, all rows of that
    patient are removed.

    Args:
        df: DataFrame with 'SUBJECT_ID' plus the list-valued columns
            'ICD9_CODE' and 'ATC_class'.

    Returns:
        The remaining rows with a fresh 0..n-1 index.
    """
    def count_items(values):
        return len(list(values))

    # One vectorised pass replaces the per-subject re-filtering + iterrows
    # loop, which rescanned the whole frame once for every patient.
    sparse_visit = (df['ICD9_CODE'].map(count_items) < 2) & (df['ATC_class'].map(count_items) < 2)
    # dropna() keeps rows with a missing SUBJECT_ID out of the drop list; the
    # per-subject equality test of the loop-based version never matched them.
    drop_subjects = df.loc[sparse_visit, 'SUBJECT_ID'].dropna().unique()
    return df[~df['SUBJECT_ID'].isin(drop_subjects)].reset_index(drop=True)
    

In [101]:
def process_pro():
    """Load the procedures file as de-duplicated rows sorted by patient and admission.

    ICD9_CODE is read as a categorical column. ROW_ID is dropped, duplicates
    removed, the rows sorted by SUBJECT_ID and HADM_ID, and then SEQ_NUM is
    dropped followed by a second de-duplication.

    Returns:
        pandas.DataFrame without ROW_ID and SEQ_NUM, with a fresh 0..n-1 index.
    """
    procedures = pd.read_csv(DATA_PATH+PROCEDURE_FILE, dtype={'ICD9_CODE': 'category'})

    ordered = (
        procedures
        .drop(columns=['ROW_ID'])
        .drop_duplicates()
        .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
        .reset_index(drop=True)
    )

    # Without SEQ_NUM, rows that differed only in sequence number collapse
    # into one.
    without_sequence = ordered.drop(columns=['SEQ_NUM'])
    return without_sequence.drop_duplicates().reset_index(drop=True)


def _collect_codes_per_visit(frame, common_df, visit_keys, code_column, output_column):
    """Restrict ``frame`` to the shared visits and gather each visit's distinct codes into a list."""
    restricted = frame.merge(common_df, on=visit_keys, how='inner')
    per_visit = restricted.groupby(visit_keys)[code_column].unique().reset_index()
    if output_column != code_column:
        per_visit = per_visit.rename(columns={code_column: output_column})
    per_visit[output_column] = per_visit[output_column].apply(lambda codes: list(codes))
    return per_visit


def process_all(single_visit):
    """Assemble one row per (SUBJECT_ID, HADM_ID) with the code lists of that visit.

    Medications are loaded with process_med, mapped to ATC classes with
    ndc2atc4 and restricted with filter_by_visit. Diagnoses are loaded with
    process_diag and limited to the 2000 most frequent codes. For the
    multi-visit case procedures are loaded as well. Only visits present in
    every loaded source are kept.

    Args:
        single_visit: truthy -> patients with exactly one admission, columns
            SUBJECT_ID, HADM_ID, ATC_class, ICD9_CODE. Falsy -> patients with
            several admissions, with an additional PRO_CODE column.

    Returns:
        pandas.DataFrame whose code columns hold Python lists.
    """
    visit_keys = ['SUBJECT_ID', 'HADM_ID']

    med_df = process_med()
    med_df = ndc2atc4(med_df)
    med_df = filter_by_visit(med_df, single_visit)
    diag_df = filter_diag(process_diag(), num=2000)

    # (frame, column holding the codes, column name in the result)
    sources = [
        (med_df, 'ATC_class', 'ATC_class'),
        (diag_df, 'ICD9_CODE', 'ICD9_CODE'),
    ]
    if not single_visit:
        # Procedures reuse the ICD9_CODE column name, hence the rename.
        sources.append((process_pro(), 'ICD9_CODE', 'PRO_CODE'))

    # Visits that appear in every source.
    key_sets = [
        set(map(tuple, frame[visit_keys].drop_duplicates().values))
        for frame, _, _ in sources
    ]
    common_keys = set.intersection(*key_sets)
    common_df = pd.DataFrame(list(common_keys), columns=visit_keys)

    df = None
    for frame, code_column, output_column in sources:
        per_visit = _collect_codes_per_visit(frame, common_df, visit_keys, code_column, output_column)
        df = per_visit if df is None else df.merge(per_visit, on=visit_keys, how='inner')

    return df

def run(single_visit):
    """Build the filtered visit table and collect the distinct codes it contains.

    Args:
        single_visit: forwarded to process_all.

    Returns:
        A tuple ``(df, diag_codes, med_codes)``:
          * df -- the output of process_all after filter_patients;
          * diag_codes -- set of every value found in the ICD9_CODE lists;
          * med_codes -- set of every value found in the ATC_class lists.
        Mind the order: the diagnosis codes come second and the medication
        codes third. The order is kept unchanged for existing callers; only
        the local names were corrected, since they previously labelled the
        ICD9 codes "med" and the ATC classes "diag".
    """
    df = filter_patients(process_all(single_visit))

    diag_codes = {code for codes in df['ICD9_CODE'].values for code in list(codes)}
    med_codes = {code for codes in df['ATC_class'].values for code in list(codes)}

    return df, diag_codes, med_codes

In [102]:
# run() returns (frame, ICD9 diagnosis codes, ATC medication codes) in that
# order. Unpacking into correctly named variables fixes the multi-visit files
# below, which previously received each other's contents.
data_single_visit, diag1, med1 = run(single_visit=True)
data_multi_visit, diag2, med2 = run(single_visit=False)

unique_diag = set.union(diag1, diag2)
unique_med = set.union(med1, med2)


def _write_codes(path, codes):
    """Write one code per line, ordered by string form so reruns produce identical files."""
    with open(path, 'w') as f:
        f.writelines(str(code) + '\n' for code in sorted(codes, key=str))


_write_codes('../model_data/unique_diags.txt', unique_diag)
_write_codes('../model_data/unique_meds.txt', unique_med)
_write_codes('../model_data/multi-visit-diags.txt', diag2)
_write_codes('../model_data/multi-visit-meds.txt', med2)

  med_df.fillna(method='pad', inplace=True)
  med_df.fillna(method='pad', inplace=True)


In [103]:
# print(data_single_visit.shape, data_multi_visit.shape)
# print(data_single_visit.columns)
# print(data_multi_visit.columns)

In [104]:
from sklearn.model_selection import train_test_split

# Split the multi-visit patients into train / eval / test id lists: one third
# is held out first and then halved, both splits using random_state=1203.
subject_ids = data_multi_visit['SUBJECT_ID'].unique()

train_ids, temp_ids = train_test_split(subject_ids, test_size=1/3, random_state=1203)
eval_ids, test_ids = train_test_split(temp_ids, test_size=1/2, random_state=1203)

# Each id list goes to its own file, one id per line.
for split_path, split_ids in (
    ('../model_data/train-id.txt', train_ids),
    ('../model_data/eval-id.txt', eval_ids),
    ('../model_data/test-id.txt', test_ids),
):
    with open(split_path, 'w') as f:
        for item in split_ids:
            f.write(str(item) + '\n')