# Import the Data

In [None]:
import numpy as np
import pandas as pd
import os

# Project root: every relative path used below ("Raw Data/...", "Processed data/...",
# "Code/...") is resolved against this directory. Set the RPDR_ROOT environment
# variable to run the notebook from another machine; when it is unset the original
# author's Dropbox location is used, so existing behaviour is unchanged.
os.chdir(os.environ.get("RPDR_ROOT", "/Users/dexinli/Dropbox (MIT)/MGH Prostate Research Group/RPDR/"))

In [None]:
# Load the first of the four diagnosis extracts. The files are pipe-delimited
# and the first row holds the column names. The other three extracts are read
# in a later cell.
diagnosis = pd.read_csv(
    'Raw Data/First/txt/KS185_20200918_114153_Dia.txt',
    sep="|",
    header=0,
    low_memory=False,
)

In [None]:
# Cohort of patients with rp (one row set read from the processed CSV)
singlerp_df = pd.read_csv("Processed data/singlerp.csv")

# Sanity check: show the distinct values of "totalrp".
# The original author's note records that the only value is 1.
pd.unique(singlerp_df["totalrp"])

In [None]:
# Cohort of patients who have undergone a biopsy
biopsy_df = pd.read_csv("Processed data/df_pathology_biopsy_final.csv")

# Row counts of the two cohorts before they are combined
print(len(singlerp_df))
print(len(biopsy_df))

# Outer join on EMPI: keeps every patient that appears in either cohort,
# i.e. all EMPIs we want to find comorbidities for
df_empi = pd.merge(singlerp_df, biopsy_df, on="EMPI", how="outer")
print(len(df_empi))

In [None]:
# Additional EMPIs for which comorbidities are wanted as well
need_empi_df = pd.read_csv("Code/Diagnoses/empis_need_comorbidity.csv")

# "Unnamed: 0" is the index column written out by an earlier to_csv call
need_empi_df = need_empi_df.drop(columns=["Unnamed: 0"])

print(len(need_empi_df.index))

# Kept as the last expression of the cell so the preview is actually rendered
# (a bare .head() in the middle of a cell is evaluated and thrown away)
need_empi_df.head()

In [None]:
# Read the remaining three diagnosis extracts with the same parser settings
# as the first one (pipe-delimited, header in the first row).
_extra_diagnosis_paths = (
    'Raw Data/Second/txt/KS185_20200918_114153_Dia.txt',
    'Raw Data/Third/txt/KS185_20200918_114153_Dia.txt',
    'Raw Data/Fourth/txt/KS185_20200918_114153_Dia.txt',
)
diagnosis_second, diagnosis_third, diagnosis_fourth = [
    pd.read_csv(extract_path, sep="|", header=0, low_memory=False)
    for extract_path in _extra_diagnosis_paths
]

In [None]:
# Stack the four diagnosis extracts into one frame.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# sort=True keeps the original behaviour of sorting the columns when the
# extracts do not share an identical column order. ignore_index=True gives the
# result a fresh 0..n-1 index, so rows from different extracts no longer share
# the same index label.
diamerged = pd.concat(
    [diagnosis, diagnosis_second, diagnosis_third, diagnosis_fourth],
    sort=True,
    ignore_index=True,
)

# Cleaning the Data

In [None]:
# Parse the raw 'Date' strings as MM/DD/YYYY into a separate column; values
# that do not match the format become NaT instead of raising.
# The source has to be diamerged itself. Parsing diagnosis['Date'] (first
# extract only) and assigning it here would be aligned on index labels, which
# attaches first-extract dates to rows that came from the other extracts.
diamerged['diadate_parsed'] = pd.to_datetime(diamerged['Date'], errors='coerce', format='%m/%d/%Y')

# Convert 'Date' itself to a pandas datetime column (format inferred by pandas)
diamerged['Date'] = pd.to_datetime(diamerged['Date'])

In [None]:
# Keep only the diagnosis rows whose EMPI is present in singlerp_df
# (inner join on EMPI; the columns of singlerp_df are carried along).
# NOTE(review): only singlerp_df is used as the filter here; the combined
# singlerp/biopsy list (df_empi) and need_empi_df are not applied. Confirm this
# is intended for this run.
diamerged2 = pd.merge(diamerged, singlerp_df, on="EMPI", how="inner")

# Row counts before and after the filter
print(len(diamerged))
print(len(diamerged2))

In [None]:
# LMR -> ICD lookup table (mapping provided by Madhur). It lets some LMR codes
# be translated to ICD codes that the R package used for the comorbidity
# scores can read.
#   * the "index" column is renamed to Diagnosis_Name so it can serve as the join key
#   * the ICD10 column is dropped: only the ICD9 mapping is used
df_mapping = (
    pd.read_csv("Raw Data/lmr_icd9map.csv")
    .rename(columns={"index": "Diagnosis_Name"})
    .drop(columns=['ICD10'])
)

In [None]:
# Attach the mapped ICD9 code to every diagnosis row whose Diagnosis_Name is in
# the lookup table; rows without a match keep a null ICD9 (left join).
diamerged3 = pd.merge(diamerged2, df_mapping, on='Diagnosis_Name', how='left')

# Rows that were recorded as ICD9 in the first place take their own Code as the ICD9 value
is_icd9 = diamerged3["Code_Type"] == "ICD9"
diamerged3.loc[is_icd9, 'ICD9'] = diamerged3.loc[is_icd9, "Code"]

print(len(diamerged3))

In [None]:
# Replace missing ICD9 values with the sentinel -1, which is easier to handle in R
missing_icd9 = diamerged3["ICD9"].isnull()
diamerged3.loc[missing_icd9, 'ICD9'] = -1

# Bookkeeping counts, printed in this order:
#   remaining nulls in ICD9, rows set to -1, rows coded as ICD9,
#   rows coded as anything else, total rows
print(int(diamerged3['ICD9'].isnull().sum()))
print(int((diamerged3['ICD9'] == -1).sum()))
print(int((diamerged3["Code_Type"] == "ICD9").sum()))
print(int((diamerged3["Code_Type"] != "ICD9").sum()))
print(len(diamerged3))

In [None]:
 # print to csv to use in R to calculate the comorbidity scores using R package
r_input_path = "Code/Diagnoses/diamerged_singlerp.csv"
diamerged3.to_csv(r_input_path)

# The comorbidity scoring itself is done outside this notebook, in R;
# its result is read back in by the cells that follow.

# Reimport Comorbidities From R to Find Comorbidity per Patient Day, Combining both ICD9 and ICD10 codes

In [None]:
# Read back the per-patient-day comorbidity table produced in R and discard
# its first column ("Unnamed: 0").
charlson_singlerp = (
    pd.read_csv('Code/Diagnoses/charlson_singlerp.csv')
    .drop(columns=['Unnamed: 0'])
)

print(len(charlson_singlerp))
charlson_singlerp.head()

In [None]:
# Combine the ICD9 and ICD10 comorbidity rows under each EMPI_day.

# Group by EMPI_day and sum, so that the ICD9 and ICD10 rows for the same
# patient-day collapse into a single row.
comorb_df = charlson_singlerp.groupby(['EMPI_day']).sum()

# After summing, a value can exceed 1; cap everything at 1 so each diagnosis
# column is 0/1 again. (score and wscore are capped too, but both are
# recomputed immediately below.)
comorb_df[comorb_df > 1] = 1

# Unweighted score: sum of every column other than the two score columns.
# NOTE(review): this assumes all remaining columns are diagnosis flags - confirm.
comorb_df["score"] = comorb_df.drop(columns=['wscore', 'score']).sum(axis=1)

# Weighted score: weighted sum of the diagnosis columns.
# The weights are the same ones used for wscore_agg further down in this
# notebook. The previous formula in this cell left out chf and gave diabwc a
# weight of 1, which made wscore disagree with wscore_agg.
charlson_weights = {
    "ami": 1, "chf": 1, "pvd": 1, "cevd": 1, "dementia": 1, "copd": 1,
    "rheumd": 1, "pud": 1, "mld": 1, "diab": 1,
    "diabwc": 2, "hp": 2, "rend": 2, "canc": 2,
    "msld": 3,
    "metacanc": 6, "aids": 6,
}
comorb_df["wscore"] = sum(
    weight * comorb_df[condition] for condition, weight in charlson_weights.items()
)

# Turn EMPI_day back into a regular column instead of the index
comorb_df = comorb_df.reset_index()

comorb_df.head(50)

# Aggregate Comorbidity Scores Across Time 

In [None]:
# EMPI_day has the form "<EMPI> <Date>": split on the space and store the two
# parts as separate columns. Both new columns hold strings.
empi_and_date = comorb_df["EMPI_day"].str.split(" ", expand=True)
comorb_df[['EMPI', 'Date']] = empi_and_date
comorb_df.head(50)

In [None]:
# Order the rows by patient (EMPI) and then by Date.
# The index is rebuilt afterwards so that label i is also position i. The
# aggregation step further down addresses rows as i and i-1, which is only
# correct when the labels follow the sorted order.
# NOTE(review): EMPI and Date are strings at this point, so the sort is
# lexicographic; that is chronological only if Date is written year-first
# (e.g. YYYY-MM-DD) - confirm against the R output.
comorb_df = comorb_df.sort_values(by=['EMPI', 'Date']).reset_index(drop=True)

print(len(comorb_df.index))
print(comorb_df.columns)
comorb_df.head(50)

In [None]:
# Preparation for aggregating each patient's comorbidities through time:
# every tracked column gets an "<name>_agg" companion that starts out as a
# copy of the per-day value. Later cells turn these copies into running
# totals per patient and recompute the aggregated scores from them.
tracked_cols = [
    "ami", "chf", "pvd", "cevd", "dementia", "copd",
    "rheumd", "pud", "mld", "diab", "diabwc", "hp", "rend",
    "canc", "msld", "metacanc", "aids", "score", "wscore",
]
comorb_df = comorb_df.assign(
    **{tracked + "_agg": comorb_df[tracked] for tracked in tracked_cols}
)

comorb_df.head()

In [None]:
# Running total of each condition within a patient: for every row, the
# "<name>_agg" column becomes the sum of that condition over this row and all
# earlier rows of the same EMPI (rows are expected to be ordered by EMPI, then
# Date, by the sort cell above).
#
# This replaces a row-by-row loop that did agg[i] = agg[i-1] + value[i] whenever
# row i had the same EMPI as row i-1. A grouped cumulative sum yields the same
# numbers for EMPI-sorted data, does not depend on the index labels being
# 0..n-1, and runs vectorised instead of making one .loc call per cell.
condition_cols = [
    "ami", "chf", "pvd", "cevd", "dementia", "copd",
    "rheumd", "pud", "mld", "diab", "diabwc", "hp", "rend",
    "canc", "msld", "metacanc", "aids",
]

print(len(comorb_df.index))

# sort=False: group order is irrelevant for cumsum, rows stay where they are
running_totals = comorb_df.groupby("EMPI", sort=False)[condition_cols].cumsum()
for col in condition_cols:
    comorb_df[col + "_agg"] = running_totals[col]
    


In [None]:
# Checkpoint: write the table with the running per-patient totals to disk
# (before the totals are capped at 1 and the aggregated scores are recomputed).
checkpoint_path = "Processed Data/dia_comorb.csv"
comorb_df.to_csv(checkpoint_path)

comorb_df.head(50)

In [None]:
# Turn the running totals into "ever diagnosed so far" indicators and derive
# the aggregated unweighted score from them.
condition_cols = [
    "ami", "chf", "pvd", "cevd", "dementia", "copd",
    "rheumd", "pud", "mld", "diab", "diabwc", "hp", "rend",
    "canc", "msld", "metacanc", "aids",
]
agg_cols = [condition + "_agg" for condition in condition_cols]

# Any running total above 1 is capped at 1
for agg_name in agg_cols:
    comorb_df[agg_name] = comorb_df[agg_name].clip(upper=1)

# Aggregated score: plain sum of the 17 capped indicators
comorb_df["score_agg"] = sum(comorb_df[agg_name] for agg_name in agg_cols)

In [None]:
# Aggregated weighted score: each capped "<name>_agg" indicator multiplied by
# its weight, then summed.
agg_weights = {
    "ami_agg": 1, "chf_agg": 1, "pvd_agg": 1, "cevd_agg": 1, "dementia_agg": 1,
    "copd_agg": 1, "rheumd_agg": 1, "pud_agg": 1, "mld_agg": 1,
    "diab_agg": 1, "diabwc_agg": 2, "hp_agg": 2, "rend_agg": 2,
    "canc_agg": 2, "msld_agg": 3, "metacanc_agg": 6, "aids_agg": 6,
}
comorb_df["wscore_agg"] = sum(
    weight * comorb_df[agg_name] for agg_name, weight in agg_weights.items()
)

In [None]:
# Write the final table (per-day values, running indicators and both
# aggregated scores) to disk.
# An alternative target left disabled by the original author:
#   "Processed Data/dia_comorb_new_empi.csv"
final_path = "Processed Data/dia_comorbDL.csv"
comorb_df.to_csv(final_path)