## Pulling comorbidities for Colorectal Cancer Patients

Credit: Dr. Krisnamurthy & Lab 9

In [4]:
import os
import sqlalchemy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
creds = pd.read_csv("sample_mimic_login_creds.csv")
myUserName = str(creds.iloc[0]['Username']).strip()
myPassword = str(creds.iloc[0]['password']).strip()

server_url = "mimic-db.renci.unc.edu"
database = "mimic"

# Create Connection String
conn_str = f"{myUserName}:{myPassword}@{server_url}/{database}"

# Create Engine
engine = sqlalchemy.create_engine('postgresql://' + conn_str)


We aren't trying to restrict by number of patients here because of the realities of our sample size.

In [8]:
query = """
    -- First step
-- Pulls all the concept ids we need to care about
WITH icd10_conc AS (
SELECT * 
FROM omop.concept
WHERE concept_code LIKE 'C17.%' OR concept_code LIKE 'C18.%' OR
concept_code LIKE 'C21.%' OR  concept_code = 'Z86.010'
),

-- Second step 
-- Pulling the concepts
-- We want to pull the concept_id_2 for each of these
conc_maps AS (
SELECT concept_id_2
FROM omop.concept_relationship 
WHERE concept_id_1 in (SELECT concept_id FROM icd10_conc) 
AND relationship_id = 'Maps to'
)
,

-- third step -> ensuring all standard concepts
-- ensuring these are all standard concepts
stand_conc as (
SELECT concept_id_2 as cond_conc_id
FROM conc_maps cm
INNER JOIN omop.concept co ON cm.concept_id_2 = co.concept_id
WHERE co.standard_concept = 'S'
),


-- checking to see if they are in the condition occurrence table
canc_pats as (
SELECT DISTINCT co.person_id, co.condition_concept_id, co.condition_source_value
FROM omop.condition_occurrence co
WHERE co.condition_concept_id IN (SELECT * FROM stand_conc)
),

-- checking to see how many deaths we have records for
canc_deaths as (
SELECT * 
FROM omop.death od
WHERE od.person_id IN (SELECT person_id from canc_pats)
),

-- pulling max & min dates for patients -> range of records
-- earliest visit start to latest visit end provides us with visit dif
vis_max_min AS (
SELECT vo.person_id, MIN(vo.visit_start_datetime) AS first_vis, 
MAX(vo.visit_end_datetime) AS last_vis
FROM omop.visit_occurrence vo
WHERE vo.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY vo.person_id
),

-- earliest procedure start to latest procedure end
proc_max_min AS (
SELECT po.person_id, MIN(po.procedure_datetime) AS first_proc , 
MAX(po.procedure_datetime) AS last_proc
FROM omop.procedure_occurrence po
WHERE po.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY po.person_id
),

-- earliest drug exposure start to latest drug exposure end
drug_max_min AS (
SELECT de.person_id, MIN(drug_exposure_start_datetime) AS first_drug,
MAX(drug_exposure_end_datetime) AS last_drug
FROM omop.drug_exposure de
WHERE de.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY de.person_id
),

-- earliest measurement to latest measurement
-- slows the running down though :(
meas_max_min AS (
SELECT om.person_id, MIN(om.measurement_datetime) AS first_meas, 
MAX(om.measurement_datetime) AS last_meas
FROM omop.measurement om
WHERE om.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY om.person_id
),

-- no device exposures so we can just ignore that...

-- pulling the first and last datapoint date for each of the patients
num_years as (
SELECT cp.person_id, ((
greatest(last_vis, last_proc, last_drug, last_meas)::date -
least(first_vis, first_proc, first_drug, first_meas)::date)/365.25)
AS num_years
FROM canc_pats cp
INNER JOIN vis_max_min vmm ON cp.person_id = vmm.person_id
INNER JOIN proc_max_min pmm ON cp.person_id = pmm.person_id
INNER JOIN drug_max_min dmm ON cp.person_id = dmm.person_id
INNER JOIN meas_max_min mmm ON cp.person_id = mmm.person_id
),

-- figuring out the start of the dx
dx_date AS (
SELECT person_id, MIN(condition_start_datetime) AS pat_dx_date
FROM omop.condition_occurrence co
WHERE co.person_id IN (SELECT person_id FROM canc_pats) AND
co.condition_concept_id IN (SELECT cond_conc_id FROM stand_conc)
GROUP BY co.person_id
),

-- finding the for our patients after their diagnosis
-- returns list of patients and drugs they've taken
pat_drugs as (
SELECT oc.concept_name, de.drug_concept_id
FROM omop.drug_exposure de
INNER JOIN canc_pats cp ON cp.person_id = de.person_id
INNER JOIN dx_date dd ON dd.person_id = de.person_id
INNER JOIN omop.concept oc ON oc.concept_id = de.drug_concept_id
WHERE de.drug_exposure_start_datetime > dd.pat_dx_date
)

-- trying to find co-morbidities
-- gives us a list of patient ids and concept names for comorbiditis
SELECT distinct(co.person_id), oc.concept_name
FROM omop.condition_occurrence co
INNER JOIN canc_pats cp on co.person_id = cp.person_id 
INNER JOIN omop.concept oc on oc.concept_id = co.condition_concept_id
WHERE co.condition_concept_id not in (SELECT * FROM stand_conc) AND 
concept_name <> 'No matching concept';
    """

In [14]:
%%time
# need to run the text through sqlalchemy to clean it up
comorbidity_data = pd.read_sql_query(sqlalchemy.text(query), engine)
print(f"The returned date frame with no mimumum patient threshold has {len(comorbidity_data)} rows")
comorbidity_data.head(20)

The returned date frame with no mimumum patient threshold has 3569 rows
Wall time: 3.83 s


Unnamed: 0,person_id,concept_name
0,392775850,Acquired hypothyroidism
1,392775850,Acute bronchitis
2,392775850,Acute combined systolic and diastolic heart fa...
3,392775850,Acute posthemorrhagic anemia
4,392775850,Acute renal failure syndrome
5,392775850,Acute subendocardial infarction
6,392775850,Acute systolic heart failure
7,392775850,Anemia
8,392775850,Anemia due to chronic blood loss
9,392775850,Angina pectoris


In [15]:
print(f"The data frame has {len(comorbidity_data['person_id'].unique())} patients")
print(f"The data frame includes {len(comorbidity_data['concept_name'].unique())} different conditions")

The data frame has 213 patients
The data frame includes 752 different conditions


In [16]:
new_d = comorbidity_data.groupby(['person_id'])
type(new_d)
print(new_d)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F064015340>


In [17]:
new_d = comorbidity_data.groupby(['person_id'])

comorb_list = []
for person, comorb in new_d:
#     print(f"Person is {person}")
#     print(comorb['concept_name'])
    comorb_list.append(list(set(comorb['concept_name'])))
print(len(comorb_list))

213


In [19]:
comorb_list[::] # start, stop, step -> no need for a step here I'd say

[['Benign hypertensive renal disease with renal failure',
  'Low blood pressure',
  'Frank hematuria',
  'Old myocardial infarction',
  'Asthma',
  'Hemiplegia',
  'Acute posthemorrhagic anemia',
  'Acute systolic heart failure',
  'Anticoagulant adverse reaction',
  'Respiratory insufficiency',
  'Cerebral infarction due to embolism of cerebral arteries',
  'Esophageal reflux finding',
  'Wound seroma',
  'Chronic kidney disease stage 3',
  'Angina pectoris',
  'Late effect of medical and surgical care complication',
  'Acute subendocardial infarction',
  'Anemia',
  'Dysarthria',
  'Feces contents abnormal',
  'Malnutrition',
  'Arteriosclerosis of autologous vein coronary artery bypass graft',
  'Acquired hypothyroidism',
  'Chronic kidney disease',
  'Paralytic ileus',
  'Coronary arteriosclerosis in native artery',
  'Secondary malignant neoplasm of skin',
  'Not for resuscitation',
  'Acute renal failure syndrome',
  'Atrial fibrillation',
  'Acute combined systolic and diastolic

In [20]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(comorb_list).transform(comorb_list)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.head(25)

Unnamed: 0,Abdominal aortic aneurysm,Abdominal compartment syndrome,Abdominal pain,Abnormal body temperature,Abnormal breathing,Abnormal gait,Abnormal patient reaction,Abnormal uterine bleeding,Abnormal weight loss,Abscess of intestine,...,Vascular insufficiency of intestine,Venous thrombosis,Venous varices,Ventricular fibrillation,Viral hepatitis B without hepatic coma,Vitamin B deficiency,Vocal cord paralysis,Vomiting,Wound dehiscence,Wound seroma
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
%%time
from mlxtend.frequent_patterns import apriori
support = 0.01
frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
print(f"Using a support of {support}, found {len(frequent_itemsets)} frequent itemsets")
frequent_itemsets.head(25)

Using a support of 0.01, found 17736 frequent itemsets
Wall time: 1.93 s


Unnamed: 0,support,itemsets
0,0.023474,(Abdominal aortic aneurysm)
1,0.037559,(Abdominal pain)
2,0.018779,(Abnormal patient reaction)
3,0.018779,(Abnormal weight loss)
4,0.014085,(Abscess of intestine)
5,0.028169,(Abscess of peritoneum)
6,0.051643,(Accident)
7,0.065728,(Accidental wound during procedure)
8,0.169014,(Acidosis)
9,0.018779,(Acquired coagulation factor deficiency)


In [22]:
frequent_itemsets['count'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets.head(25)

Unnamed: 0,support,itemsets,count
0,0.023474,(Abdominal aortic aneurysm),1
1,0.037559,(Abdominal pain),1
2,0.018779,(Abnormal patient reaction),1
3,0.018779,(Abnormal weight loss),1
4,0.014085,(Abscess of intestine),1
5,0.028169,(Abscess of peritoneum),1
6,0.051643,(Accident),1
7,0.065728,(Accidental wound during procedure),1
8,0.169014,(Acidosis),1
9,0.018779,(Acquired coagulation factor deficiency),1


In [23]:
fi = frequent_itemsets.sort_values(by = ['support'], ascending = False)
print(f"Highest support is {fi.iloc[0]['support']} with a count of {fi.iloc[0]['count']}")
print(f"Lowest support is {fi.iloc[-1]['support']} with a count of {fi.iloc[-1]['count']}")
fi.head(25)

Highest support is 0.5023474178403756 with a count of 1
Lowest support is 0.014084507042253521 with a count of 8


Unnamed: 0,support,itemsets,count
131,0.502347,(Essential hypertension),1
54,0.408451,(Atrial fibrillation),1
95,0.361502,(Congestive heart failure),1
24,0.361502,(Acute renal failure syndrome),1
98,0.239437,(Coronary arteriosclerosis in native artery),1
1222,0.239437,"(Essential hypertension, Atrial fibrillation)",2
106,0.234742,(Diabetes mellitus without complication),1
1203,0.230047,"(Atrial fibrillation, Congestive heart failure)",2
22,0.215962,(Acute posthemorrhagic anemia),1
170,0.211268,(Intestinal obstruction),1


In [25]:
fige2 = fi[fi['count'] >= 2]
fige2.head(25)

Unnamed: 0,support,itemsets,count
1222,0.239437,"(Essential hypertension, Atrial fibrillation)",2
1203,0.230047,"(Atrial fibrillation, Congestive heart failure)",2
1712,0.211268,"(Essential hypertension, Congestive heart fail...",2
643,0.192488,"(Acute renal failure syndrome, Atrial fibrilla...",2
682,0.192488,"(Essential hypertension, Acute renal failure s...",2
665,0.173709,"(Acute renal failure syndrome, Congestive hear...",2
1808,0.159624,"(Essential hypertension, Coronary arterioscler...",2
1697,0.140845,"(Congestive heart failure, Coronary arterioscl...",2
1918,0.140845,"(Essential hypertension, Diabetes mellitus wit...",2
2068,0.13615,"(Essential hypertension, Hyperlipidemia)",2
