In [2]:
import pandas as pd
import numpy as np
import random

### Pancreatic Cancer codes

In [4]:
# Read the ICD code dictionary (one row per icd_code / icd_version with its
# long_title) and preview the first rows.
icd_dictionary_path = "../data/raw/d_icd_diagnoses.csv"
df = pd.read_csv(icd_dictionary_path)
df.head()

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A


In [10]:
# Collect every ICD code whose long title describes a malignant neoplasm of
# the pancreas, split by ICD version.
#
# The substring matches are case-sensitive. `na=False` makes rows with a
# missing title count as "no match" instead of raising on a non-string value.
titles = df["long_title"]
mentions_pancreas = (
    titles.str.contains("pancreas", regex=False, na=False)
    | titles.str.contains("pancreati", regex=False, na=False)
)
is_malignant = titles.str.contains("Malignant neoplasm", regex=False, na=False)
pancan_rows = df[mentions_pancreas & is_malignant]

# Version 9 rows go to the ICD-9 list; every other version is treated as ICD-10.
is_icd9 = pancan_rows["icd_version"] == 9
icd9_codes = pancan_rows.loc[is_icd9, "icd_code"].tolist()
icd10_codes = pancan_rows.loc[~is_icd9, "icd_code"].tolist()

# Echo each matched row (code, version, title) in file order, then both lists.
for code, version, title in zip(
    pancan_rows["icd_code"], pancan_rows["icd_version"], pancan_rows["long_title"]
):
    print(code, version, title)
print()
print(icd9_codes)
print(icd10_codes)

1570 9 Malignant neoplasm of head of pancreas
1571 9 Malignant neoplasm of body of pancreas
1572 9 Malignant neoplasm of tail of pancreas
1573 9 Malignant neoplasm of pancreatic duct
1578 9 Malignant neoplasm of other specified sites of pancreas
1579 9 Malignant neoplasm of pancreas, part unspecified
C25 10 Malignant neoplasm of pancreas
C250 10 Malignant neoplasm of head of pancreas
C251 10 Malignant neoplasm of body of pancreas
C252 10 Malignant neoplasm of tail of pancreas
C253 10 Malignant neoplasm of pancreatic duct
C254 10 Malignant neoplasm of endocrine pancreas
C257 10 Malignant neoplasm of other parts of pancreas
C258 10 Malignant neoplasm of overlapping sites of pancreas
C259 10 Malignant neoplasm of pancreas, unspecified

['1570', '1571', '1572', '1573', '1578', '1579']
['C25', 'C250', 'C251', 'C252', 'C253', 'C254', 'C257', 'C258', 'C259']


### Finding number of patients with Pancreatic Cancer

In [11]:
# Read the per-admission diagnosis table (subject_id, hadm_id, seq_num,
# icd_code, icd_version) and preview the first rows.
diagnoses_path = "../data/raw/diagnoses_icd.csv"
df_diag = pd.read_csv(diagnoses_path)
df_diag.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


In [24]:
# Find the distinct subjects that have at least one pancreatic cancer
# diagnosis, separately for ICD-9 and ICD-10 coded rows.
#
# A row counts only when both its code is in the matching code list and its
# icd_version agrees with that list. The masks are computed column-wise
# instead of looping over every diagnosis row in Python.
is_icd9_pancan = (df_diag['icd_version'] == 9) & df_diag['icd_code'].isin(icd9_codes)
is_icd10_pancan = (df_diag['icd_version'] == 10) & df_diag['icd_code'].isin(icd10_codes)

# De-duplicate: a subject may have several matching diagnosis rows.
patients_icd9 = df_diag.loc[is_icd9_pancan, 'subject_id'].unique().tolist()
patients_icd10 = df_diag.loc[is_icd10_pancan, 'subject_id'].unique().tolist()

print("Patients with icd9 PanCan:", len(patients_icd9))
print("Patients with icd10 PanCan:", len(patients_icd10))

Patients with icd9 PanCan: 835
Patients with icd10 PanCan: 709


### Checking Overlap between ICD9 and ICD10

In [25]:
# Count the subjects that appear in both the ICD-9 and the ICD-10 patient lists.
overlap_ids = set(patients_icd9).intersection(patients_icd10)
print(len(overlap_ids))

75


### Create subset of data: all pancreatic cancer patients plus a ~1% random sample of the remaining patients

In [28]:
# Build the two cohorts: every subject with a pancreatic cancer code, and a
# random ~1% sample of all other subjects as controls.
SAFE_SAMPLE_SEED = 42      # fixed seed so a fresh re-run selects the same controls
SAFE_SAMPLE_ONE_IN = 100   # keep roughly 1 in 100 non-PanCan subjects

# Sorted unique ids: pairs each RNG draw with the same subject on every run.
all_subj_ids = sorted(df_diag['subject_id'].unique().tolist())

# Set membership is O(1); the patient lists are only used for lookups here.
pancan_ids = set(patients_icd9) | set(patients_icd10)

# Local generator: seeding it does not disturb the global `random` state.
rng = random.Random(SAFE_SAMPLE_SEED)

pancan_subj = []
safe_subj = []

for subj_id in all_subj_ids:
    if subj_id in pancan_ids:
        pancan_subj.append(subj_id)
    elif rng.randint(1, SAFE_SAMPLE_ONE_IN) == 1:
        # randint is inclusive on both ends, so this hits with probability
        # 1 / SAFE_SAMPLE_ONE_IN.
        safe_subj.append(subj_id)

print("Yes PanCan:", len(pancan_subj))
print("No PanCan:", len(safe_subj))
print("Total:", len(all_subj_ids))

Yes PanCan: 1469
No PanCan: 1728
Total: 180640


In [29]:
# Convert both cohorts to numpy arrays and write them as .npy files in the
# current working directory.
pancan_subj_array = np.array(pancan_subj)
safe_subj_array = np.array(safe_subj)

for npy_name, id_array in (
    ('pancan_subj.npy', pancan_subj_array),
    ('safe_subj.npy', safe_subj_array),
):
    np.save(npy_name, id_array)