## Pulling Baseline Comorbidities

Credit: Dr. Krisnamurthy & Lab 9

In [1]:
import os
import sqlalchemy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
creds = pd.read_csv("sample_mimic_login_creds.csv")
myUserName = str(creds.iloc[0]['Username']).strip()
myPassword = str(creds.iloc[0]['password']).strip()

server_url = "mimic-db.renci.unc.edu"
database = "mimic"

# Create Connection String
conn_str = f"{myUserName}:{myPassword}@{server_url}/{database}"

# Create Engine
engine = sqlalchemy.create_engine('postgresql://' + conn_str)


In [3]:
def get_comorbid_data(patient_count_threshold = 50):
    query = \
    f"with condlist as (\
    select c.concept_name, count(distinct co.person_id) as ptct \
    from omop.condition_occurrence co JOIN omop.concept c ON co.condition_concept_id = c.concept_id \
    where concept_name <> 'No matching concept' \
    group by c.concept_name \
    having count(distinct co.person_id) > {patient_count_threshold} \
    ) \
    \
    select distinct co.person_id, list.concept_name \
    from omop.condition_occurrence co JOIN omop.concept c ON co.condition_concept_id = c.concept_id \
    JOIN condlist list ON c.concept_name = list.concept_name"
    
    return pd.read_sql_query(query, engine)

In [4]:
%%time
min_patients = 500 # our own adjustment here
comorbidity_data = get_comorbid_data(min_patients)
print(f"The returned date frame for a {min_patients} patient threshold has {len(comorbidity_data)} rows")
comorbidity_data.head(20)

The returned date frame for a 500 patient threshold has 391216 rows
Wall time: 13.1 s


Unnamed: 0,person_id,concept_name
0,392775850,Acquired hypothyroidism
1,392775850,Acute posthemorrhagic anemia
2,392775850,Acute renal failure syndrome
3,392775850,Acute subendocardial infarction
4,392775850,Anemia
5,392775850,Anemia due to chronic blood loss
6,392775850,Angina pectoris
7,392775850,Anticoagulant adverse reaction
8,392775850,Asthma
9,392775850,Atrial fibrillation


In [5]:
print(f"The data frame has {len(comorbidity_data['person_id'].unique())} patients")
print(f"The data frame includes {len(comorbidity_data['concept_name'].unique())} different conditions")

The data frame has 45798 patients
The data frame includes 230 different conditions


Looking for conditions that only occur in 500 patients, we can see a difference from the work we did in class.

In [6]:
new_d = comorbidity_data.groupby(['person_id'])
type(new_d)
print(new_d)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B99C9CCA90>


In [7]:
new_d = comorbidity_data.groupby(['person_id'])

comorb_list = []
for person, comorb in new_d:
#     print(f"Person is {person}")
#     print(comorb['concept_name'])
    comorb_list.append(list(set(comorb['concept_name'])))
print(len(comorb_list))

45798


In [8]:
comorb_list[::500] # start, stop, step -> but why is there a 5000 step in cycling through the list?

[['Paralytic ileus',
  'Acute subendocardial infarction',
  'Chronic kidney disease stage 3',
  'Respiratory insufficiency',
  'Low blood pressure',
  'Anemia due to chronic blood loss',
  'Benign hypertensive renal disease with renal failure',
  'Congestive heart failure',
  'Malnutrition',
  'Acute renal failure syndrome',
  'Hemorrhage AND/OR hematoma complicating procedure',
  'Hyperlipidemia',
  'Iatrogenic hypotension',
  'Cerebral infarction due to embolism of cerebral arteries',
  'Angina pectoris',
  'Coronary arteriosclerosis in native artery',
  'Diabetes mellitus without complication',
  'Acute posthemorrhagic anemia',
  'Chronic kidney disease',
  'Old myocardial infarction',
  'Late effect of medical and surgical care complication',
  'Acquired hypothyroidism',
  'Not for resuscitation',
  'Anemia',
  'Asthma',
  'Anticoagulant adverse reaction',
  'Esophageal reflux finding',
  'Atrial fibrillation'],
 ['Accident',
  'Iatrogenic hypotension',
  'Paroxysmal ventricular ta

In [9]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(comorb_list).transform(comorb_list)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.head(25)

Unnamed: 0,Abdominal aortic aneurysm,Abdominal pain,Abnormal patient reaction,Accident,Accidental wound during procedure,Acidosis,Acquired hypothyroidism,Acute deep vein thrombosis of lower limb,Acute disease of cardiovascular system,Acute exacerbation of chronic obstructive bronchitis,...,Tricuspid valve disorder,Twins - both live born,Type 2 diabetes mellitus,Ulcer of foot,Upper gastrointestinal bleeding,Urinary tract infectious disease,Vascular complication of medical care,Ventilator associated pneumonia,Ventricular fibrillation,Viral hepatitis C
0,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,True,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
%%time
from mlxtend.frequent_patterns import apriori
support = 0.01
frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
print(f"Using a support of {support}, found {len(frequent_itemsets)} frequent itemsets")
frequent_itemsets.head(25)

Using a support of 0.01, found 2114 frequent itemsets
Wall time: 4min 42s


Unnamed: 0,support,itemsets
0,0.013887,(Abdominal aortic aneurysm)
1,0.01559,(Abdominal pain)
2,0.012752,(Abnormal patient reaction)
3,0.040941,(Accident)
4,0.020045,(Accidental wound during procedure)
5,0.091227,(Acidosis)
6,0.083737,(Acquired hypothyroidism)
7,0.011332,(Acute deep vein thrombosis of lower limb)
8,0.012599,(Acute disease of cardiovascular system)
9,0.02059,(Acute exacerbation of chronic obstructive bro...


In [11]:
frequent_itemsets['count'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets.head(25)

Unnamed: 0,support,itemsets,count
0,0.013887,(Abdominal aortic aneurysm),1
1,0.01559,(Abdominal pain),1
2,0.012752,(Abnormal patient reaction),1
3,0.040941,(Accident),1
4,0.020045,(Accidental wound during procedure),1
5,0.091227,(Acidosis),1
6,0.083737,(Acquired hypothyroidism),1
7,0.011332,(Acute deep vein thrombosis of lower limb),1
8,0.012599,(Acute disease of cardiovascular system),1
9,0.02059,(Acute exacerbation of chronic obstructive bro...,1


In [12]:
fi = frequent_itemsets.sort_values(by = ['support'], ascending = False)
print(f"Highest support is {fi.iloc[0]['support']} with a count of {fi.iloc[0]['count']}")
print(f"Lowest support is {fi.iloc[-1]['support']} with a count of {fi.iloc[-1]['count']}")
fi.head(25)

Highest support is 0.3845801126686755 with a count of 1
Lowest support is 0.01000043670029259 with a count of 3


Unnamed: 0,support,itemsets,count
109,0.38458,(Essential hypertension),1
83,0.235272,(Coronary arteriosclerosis in native artery),1
40,0.224377,(Atrial fibrillation),1
79,0.216538,(Congestive heart failure),1
17,0.208262,(Acute renal failure syndrome),1
160,0.170772,(Newborn),1
130,0.162998,(Hyperlipidemia),1
91,0.160924,(Diabetes mellitus without complication),1
18,0.146709,(Acute respiratory failure),1
777,0.144024,"(Coronary arteriosclerosis in native artery, E...",2


In [13]:
fige2 = fi[fi['count'] >= 2]
fige2.head(25)

Unnamed: 0,support,itemsets,count
777,0.144024,"(Coronary arteriosclerosis in native artery, E...",2
1030,0.138281,"(Newborn, Single live birth)",2
1027,0.125857,"(Requires vaccination, Newborn)",2
560,0.117887,"(Essential hypertension, Atrial fibrillation)",2
1059,0.108913,"(Requires vaccination, Single live birth)",2
1827,0.108848,"(Requires vaccination, Newborn, Single live bi...",3
879,0.107537,"(Essential hypertension, Hyperlipidemia)",2
551,0.10269,"(Congestive heart failure, Atrial fibrillation)",2
823,0.10007,"(Essential hypertension, Diabetes mellitus wit...",2
723,0.099546,"(Essential hypertension, Congestive heart fail...",2
