In [None]:
%load_ext google.cloud.bigquery
from datetime import date
import time
from datetime import datetime as dt
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
from IPython.display import display, HTML
import os
import subprocess

In [None]:
# Resolve the BigQuery location of the workspace CDR from the environment.
# Later cells build table paths as `prefix.CDR_version.<table>`, so the value
# must contain at least two dot-separated parts.
dataset = os.getenv("WORKSPACE_CDR")
if not dataset or "." not in dataset:
    # Fail with an actionable message instead of an AttributeError/IndexError
    # raised from the split below.
    raise RuntimeError(
        "WORKSPACE_CDR must be set to '<prefix>.<version>'; got " + repr(dataset)
    )
CDR_split = dataset.split(".")
prefix = CDR_split[0]
CDR_version = CDR_split[1]

In [None]:
# Demographics from the CDR `person` table: distinct person_id together with
# gender, race and year of birth.
query = (
    "SELECT DISTINCT p.person_id\n"
    "    ,gender_concept_id,race_concept_id,year_of_birth\n"
    "    FROM \n"
    f"    `{prefix}.{CDR_version}.person` p "
)

demo_patients = pd.read_gbq(query, dialect="standard")

In [None]:
# Approximate current age: this calendar year minus the recorded birth year
# (only the year of birth is available, so this is year-granular).
current_year = datetime.today().year
demo_patients["age_today"] = current_year - demo_patients["year_of_birth"]

In [None]:
# Enumerate every K51 sub-code with two decimals (K51.00 .. K51.99) and with
# three decimals (K51.000 .. K51.999) so they can be matched by exact string.
icd_K51_3num = np.linspace(51.000, 51.999, 1000)
icd_K51_2num = np.linspace(51.00, 51.99, 100)

# Three-decimal variants.
decimal_digits = 3
icd_K51_3 = ["K{:.{}f}".format(value, decimal_digits) for value in icd_K51_3num]

# Two-decimal variants.
decimal_digits = 2
icd_K51_2 = ["K{:.{}f}".format(value, decimal_digits) for value in icd_K51_2num]

# Two-decimal codes first, then three-decimal codes.
icd_K51_all = [*icd_K51_2, *icd_K51_3]

print(icd_K51_all)

In [None]:
# Enumerate the three-decimal sub-codes K50.111 .. K50.119 and
# K50.811 .. K50.819 as strings.
decimal_digits = 3

icd_K50_11 = [
    "K" + format(value, f".{decimal_digits}f")
    for value in np.linspace(50.111, 50.119, 9)
]

icd_K50_81 = [
    "K" + format(value, f".{decimal_digits}f")
    for value in np.linspace(50.811, 50.819, 9)
]

# K50.81x codes come first, followed by the K50.11x codes.
icd_K50_all = [*icd_K50_81, *icd_K50_11]

print(icd_K50_all)

In [None]:
def distinct_source_values(code_prefix):
    """Return the distinct `condition_source_value`s that start with `code_prefix`.

    The lookup runs against the workspace CDR (`prefix`.`CDR_version`), the
    same dataset every other query in this notebook uses, rather than a
    hard-coded CDR release.

    Parameters
    ----------
    code_prefix : str
        Leading characters of the source code, e.g. "K50"; a trailing SQL
        wildcard (%) is appended automatically.

    Returns
    -------
    pandas.DataFrame
        The result of `pd.read_gbq` with one column, `condition_source_value`.
    """
    source_value_query = f"""
SELECT distinct condition_source_value
FROM `{prefix}.{CDR_version}.condition_occurrence`
WHERE condition_source_value LIKE '{code_prefix}%'
"""
    return pd.read_gbq(source_value_query, dialect="standard")


# Check the codes for K50
K50_codes_df = distinct_source_values("K50")

# Check the codes for K51
K51_codes_df = distinct_source_values("K51")

# Check the codes for K52.3
K52_3_codes_df = distinct_source_values("K52.3")

In [None]:
# Code lists that get interpolated into the SQL queries below.

# --- ICD-9, split by Crohn's and UC ---
crohns_icd9 = ['555.1', '555.2']
uc_icd9 = ['556', '556.0', '556.1', '556.2', '556.3', '556.5', '556.6', '556.7', '556.8', '556.9']

# Combined ICD-9 list used in the query.
# NOTE(review): '556.7' appears in uc_icd9 but not here, so it is never
# queried -- confirm which of the two lists reflects the intended cohort.
colitis_diagnostic_condition_icd9 = [
    '556', '556.0', '556.1', '556.2', '556.3',
    '556.5', '556.6', '556.8', '556.9',
    '555.1', '555.2',
]

# --- ICD-10, split by Crohn's and UC ---
_uc_icd10_base = ['K51', 'K51.0', 'K51.1', 'K51.2', 'K51.3', 'K51.4', 'K51.5', 'K51.8']
_crohns_icd10_base = ['K50.1', 'K50.8', 'K50.10', 'K50.11', 'K50.80', 'K50.81']

uc_icd10 = _uc_icd10_base + icd_K51_all
crohns_icd10 = _crohns_icd10_base + icd_K50_all

# Combined ICD-10 list used in the query. 'K52.3' is queried but belongs to
# neither the UC nor the Crohn's list.
colitis_diagnostic_condition_icd10 = (
    _uc_icd10_base + _crohns_icd10_base + ['K52.3'] + icd_K51_all + icd_K50_all
)

# Comma-separated, single-quoted literals ready for a SQL `IN (...)` clause.
colitis_icd9_conds = ",".join(f"'{code}'" for code in colitis_diagnostic_condition_icd9)
colitis_icd10_conds = ",".join(f"'{code}'" for code in colitis_diagnostic_condition_icd10)

In [None]:
# Condition occurrences whose source concept is one of the colitis ICD-9 /
# ICD-10 codes: condition_occurrence joined to concept on the source concept id.
query = (
    "\n"
    "SELECT distinct * \n"
    "FROM \n"
    "    (SELECT DISTINCT person_id, condition_source_concept_id, condition_source_value, condition_start_date \n"
    f"        FROM `{prefix}.{CDR_version}.condition_occurrence`) AS cond \n"
    "     INNER JOIN \n"
    "        (SELECT DISTINCT concept_id, concept_name, concept_code, vocabulary_id \n"
    f"            FROM `{prefix}.{CDR_version}.concept` \n"
    f"            where (concept_code in ({colitis_icd9_conds}) \n"
    f"            and vocabulary_id ='ICD9CM') or (concept_code in ({colitis_icd10_conds})\n"
    "            and vocabulary_id ='ICD10CM')) as concept \n"
    "            on concept.concept_id = cond.condition_source_concept_id\n"
)
df_colitis_diagnostic_condition = pd.read_gbq(query, dialect="standard")

In [None]:
# Keep each participant's earliest colitis record.
# `groupby(...).first()` returns the first NON-NULL value of every column, so
# a null in the earliest row (e.g. condition_source_value) would be filled
# from a later row. Dropping duplicates after sorting keeps the earliest row
# intact instead.
min_dates_colitis_diags = (
    df_colitis_diagnostic_condition
    .sort_values(["person_id", "condition_start_date"])
    .drop_duplicates(subset="person_id", keep="first")
    .reset_index(drop=True)
)

In [None]:
# Sanity check: number of distinct participants, then a preview of the frame.
# Only a cell's last expression is rendered, so the count is printed
# explicitly rather than left as a bare expression.
print("distinct person_id values:", len(min_dates_colitis_diags.person_id.unique()))
min_dates_colitis_diags.head()

In [None]:
# Attach the demographic columns to each earliest colitis record
# (default inner join on person_id).
all_colitis = min_dates_colitis_diags.merge(demo_patients, on="person_id")

In [None]:
# Age at first colitis diagnosis, approximated as the calendar year of
# condition_start_date minus year_of_birth.
all_colitis["age_at_colitis_dx"] = [start.year for start in all_colitis["condition_start_date"]]
all_colitis["age_at_colitis_dx"] = all_colitis["age_at_colitis_dx"] - all_colitis["year_of_birth"]

# Only the last expression of a cell is rendered, so print the row count
# explicitly and leave the preview as the cell output.
print("rows in all_colitis:", len(all_colitis))
all_colitis.head()

In [None]:
# Pool the ICD-9 and ICD-10 code lists per disease so a single membership
# test can label each record as UC or Crohn's.
uc_all_codes = [*uc_icd9, *uc_icd10]
crohns_all_codes = [*crohns_icd9, *crohns_icd10]

# The label is added as an indicator column rather than by splitting
# all_colitis into separate UC / Crohn's frames.
def assign_uc(row):
    """Label one record by disease based on its `concept_code`.

    Returns 1 when the code is in `uc_all_codes`, 2 when it is in
    `crohns_all_codes`, and None when it is in neither. UC is checked first.
    """
    code = row['concept_code']
    if code in uc_all_codes:
        return 1
    return 2 if code in crohns_all_codes else None
    
# Add the disease indicator column: 1 = UC, 2 = Crohn's, missing = neither.
all_colitis['uc1_crohns2'] = all_colitis.apply(assign_uc, axis = 1)

# Only the last expression of a cell is rendered, so show the label counts
# explicitly. dropna=False keeps unlabelled rows visible: 'K52.3' is queried
# but is in neither the UC nor the Crohn's code list.
display(all_colitis['uc1_crohns2'].value_counts(dropna=False))
all_colitis.head()

In [None]:
# Identify CACRC patients: code lists for the colorectal cancer diagnoses.

# ICD-9: 153 and 153.0-153.9, then 154 with the .0-.3 and .8 sub-codes.
crc_diagnostic_condition_icd9 = (
    ['153'] + ['153.%d' % digit for digit in range(10)]
    + ['154'] + ['154.%d' % digit for digit in (0, 1, 2, 3, 8)]
)

# ICD-10: C18 and C18.0-C18.9, then C19 and C20.
crc_diagnostic_condition_icd10 = (
    ['C18'] + ['C18.%d' % digit for digit in range(10)] + ['C19', 'C20']
)

# Comma-separated, single-quoted literals ready for a SQL `IN (...)` clause.
crc_icd9_conds = ",".join(f"'{code}'" for code in crc_diagnostic_condition_icd9)
crc_icd10_conds = ",".join(f"'{code}'" for code in crc_diagnostic_condition_icd10)

In [None]:
# Condition occurrences whose source concept is one of the CRC ICD-9 / ICD-10
# codes: condition_occurrence joined to concept on the source concept id.
query = (
    "\n"
    "SELECT distinct * \n"
    "FROM \n"
    "    (SELECT DISTINCT person_id, condition_source_concept_id, condition_source_value, condition_start_date \n"
    f"        FROM `{prefix}.{CDR_version}.condition_occurrence`) AS cond \n"
    "     INNER JOIN \n"
    "        (SELECT DISTINCT concept_id, concept_name, concept_code, vocabulary_id \n"
    f"            FROM `{prefix}.{CDR_version}.concept` \n"
    f"            where (concept_code in ({crc_icd9_conds}) \n"
    f"            and vocabulary_id ='ICD9CM') or (concept_code in ({crc_icd10_conds})\n"
    "            and vocabulary_id ='ICD10CM')) as concept \n"
    "            on concept.concept_id = cond.condition_source_concept_id\n"
)
df_crc_diagnostic_condition = pd.read_gbq(query, dialect="standard")

In [None]:
# Keep each participant's earliest CRC record.
# `groupby(...).first()` returns the first NON-NULL value of every column, so
# a null in the earliest row (e.g. condition_source_value) would be filled
# from a later row. Dropping duplicates after sorting keeps the earliest row
# intact instead.
min_dates_crc_diags = (
    df_crc_diagnostic_condition
    .sort_values(["person_id", "condition_start_date"])
    .drop_duplicates(subset="person_id", keep="first")
    .reset_index(drop=True)
)

# Restrict the CRC records to participants who also have a colitis record
# (default inner join on person_id).
# NOTE: columns shared by both frames get suffixes -- `_x` is the CRC
# diagnosis (left frame) and `_y` is the colitis diagnosis (right frame).
crc_and_colitis = min_dates_crc_diags.merge(all_colitis, on="person_id")

In [None]:
# Age at CRC diagnosis, approximated as the calendar year of the CRC
# condition_start_date (the `_x` column) minus year_of_birth.
crc_dx_year = crc_and_colitis["condition_start_date_x"].map(lambda start: start.year)
crc_and_colitis["age_at_crc_dx"] = crc_dx_year
crc_and_colitis["age_at_crc_dx"] = crc_and_colitis["age_at_crc_dx"] - crc_and_colitis["year_of_birth"]

In [None]:
# Drop participants whose CRC diagnosis did not come late enough after the
# colitis diagnosis, from both the colitis frame and the CRC frame.
minYearsColitis_to_crc = 1

# Left join so colitis participants without CRC are retained (their
# age_at_crc_dx is missing).
colitis_no_previous_cancer = all_colitis.merge(
    crc_and_colitis[["person_id", "age_at_crc_dx"]],
    how="left",
    on="person_id",
)

# A participant is excluded when the CRC age is at most the colitis age plus
# the minimum gap, i.e. CRC came before colitis or within the gap.
latest_excluded_crc_age = minYearsColitis_to_crc + colitis_no_previous_cancer["age_at_colitis_dx"]
crc_too_early = colitis_no_previous_cancer["age_at_crc_dx"] <= latest_excluded_crc_age
pids_remove = pd.DataFrame(
    {"pids_remove": colitis_no_previous_cancer.person_id[crc_too_early]}
)

# Set difference: ids present in each frame that are not on the remove list.
pids_keep_colitis = np.setdiff1d(colitis_no_previous_cancer.person_id, pids_remove)
pids_keep_crc = np.setdiff1d(crc_and_colitis.person_id, pids_remove)

# Subset both frames to the retained ids.
colitis_no_previous_cancer = colitis_no_previous_cancer.loc[
    colitis_no_previous_cancer.person_id.isin(pids_keep_colitis)
]
ca_crc = crc_and_colitis.loc[crc_and_colitis.person_id.isin(pids_keep_crc)]

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
# NOTE(review): `cacrc_incidence` is not assigned in any cell visible in this
# notebook (the cells above produce `ca_crc` and `colitis_no_previous_cancer`);
# confirm where it is defined, otherwise this line raises NameError on a
# fresh kernel.
my_dataframe = cacrc_incidence

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'colitis_updated.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Without this guard the destination would silently become "None/data/".
    raise RuntimeError("WORKSPACE_BUCKET is not set; cannot copy the csv file to the bucket")

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# The stderr text alone does not say whether the copy worked, so report a
# non-zero exit code explicitly.
if output.returncode != 0:
    print(f"gsutil cp failed with exit code {output.returncode}")

# print output from gsutil
output.stderr
