## Pulling Medication Itemsets - Cancer Patients

Credit: Dr. Krisnamurthy & Lab 9

In [1]:
import os
import sqlalchemy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
creds = pd.read_csv("sample_mimic_login_creds.csv")
myUserName = str(creds.iloc[0]['Username']).strip()
myPassword = str(creds.iloc[0]['password']).strip()

server_url = "mimic-db.renci.unc.edu"
database = "mimic"

# Create Connection String
conn_str = f"{myUserName}:{myPassword}@{server_url}/{database}"

# Create Engine
engine = sqlalchemy.create_engine('postgresql://' + conn_str)


We aren't trying to restrict by number of patients here because of the realities of our sample size.

In [3]:
query = """
        -- First step
-- Pulls all the concept ids we need to care about
WITH icd10_conc AS (
SELECT * 
FROM omop.concept
WHERE concept_code LIKE 'C17.%' OR concept_code LIKE 'C18.%' OR
concept_code LIKE 'C21.%' OR  concept_code = 'Z86.010'
),

-- Second step 
-- Pulling the concepts
-- We want to pull the concept_id_2 for each of these
conc_maps AS (
SELECT concept_id_2
FROM omop.concept_relationship 
WHERE concept_id_1 in (SELECT concept_id FROM icd10_conc) 
AND relationship_id = 'Maps to'
)
,

-- third step -> ensuring all standard concepts
-- ensuring these are all standard concepts
stand_conc as (
SELECT concept_id_2 as cond_conc_id
FROM conc_maps cm
INNER JOIN omop.concept co ON cm.concept_id_2 = co.concept_id
WHERE co.standard_concept = 'S'
),


-- checking to see if they are in the condition occurrence table
canc_pats as (
SELECT DISTINCT co.person_id, co.condition_concept_id, co.condition_source_value
FROM omop.condition_occurrence co
WHERE co.condition_concept_id IN (SELECT * FROM stand_conc)
),

-- checking to see how many deaths we have records for
canc_deaths as (
SELECT * 
FROM omop.death od
WHERE od.person_id IN (SELECT person_id from canc_pats)
),

-- pulling max & min dates for patients -> range of records
-- earliest visit start to latest visit end provides us with visit dif
vis_max_min AS (
SELECT vo.person_id, MIN(vo.visit_start_datetime) AS first_vis, 
MAX(vo.visit_end_datetime) AS last_vis
FROM omop.visit_occurrence vo
WHERE vo.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY vo.person_id
),

-- earliest procedure start to latest procedure end
proc_max_min AS (
SELECT po.person_id, MIN(po.procedure_datetime) AS first_proc , 
MAX(po.procedure_datetime) AS last_proc
FROM omop.procedure_occurrence po
WHERE po.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY po.person_id
),

-- earliest drug exposure start to latest drug exposure end
drug_max_min AS (
SELECT de.person_id, MIN(drug_exposure_start_datetime) AS first_drug,
MAX(drug_exposure_end_datetime) AS last_drug
FROM omop.drug_exposure de
WHERE de.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY de.person_id
),

-- earliest measurement to latest measurement
-- slows the running down though :(
meas_max_min AS (
SELECT om.person_id, MIN(om.measurement_datetime) AS first_meas, 
MAX(om.measurement_datetime) AS last_meas
FROM omop.measurement om
WHERE om.person_id IN (SELECT person_id FROM canc_pats)
GROUP BY om.person_id
),

-- no device exposures so we can just ignore that...

-- pulling the first and last datapoint date for each of the patients
num_years as (
SELECT cp.person_id, ((
greatest(last_vis, last_proc, last_drug, last_meas)::date -
least(first_vis, first_proc, first_drug, first_meas)::date)/365.25)
AS num_years
FROM canc_pats cp
INNER JOIN vis_max_min vmm ON cp.person_id = vmm.person_id
INNER JOIN proc_max_min pmm ON cp.person_id = pmm.person_id
INNER JOIN drug_max_min dmm ON cp.person_id = dmm.person_id
INNER JOIN meas_max_min mmm ON cp.person_id = mmm.person_id
),

-- figuring out the start of the dx
dx_date AS (
SELECT person_id, MIN(condition_start_datetime) AS pat_dx_date
FROM omop.condition_occurrence co
WHERE co.person_id IN (SELECT person_id FROM canc_pats) AND
co.condition_concept_id IN (SELECT cond_conc_id FROM stand_conc)
GROUP BY co.person_id
)

-- finding the for our patients after their diagnosis
-- returns list of patients and drugs they've taken
SELECT DISTINCT(de.person_id),oc.concept_name
FROM omop.drug_exposure de
INNER JOIN canc_pats cp ON cp.person_id = de.person_id
INNER JOIN dx_date dd ON dd.person_id = de.person_id
INNER JOIN omop.concept oc ON oc.concept_id = de.drug_concept_id
WHERE de.drug_exposure_start_datetime > dd.pat_dx_date AND
oc.concept_name <> 'No matching concept';
    """

In [4]:
%%time
# need to run the text through sqlalchemy to clean it up
medication_data = pd.read_sql_query(sqlalchemy.text(query), engine)
print(f"The returned date frame with no mimumum patient threshold has {len(medication_data)} rows")
medication_data.head(20)

The returned date frame with no mimumum patient threshold has 9460 rows
CPU times: total: 0 ns
Wall time: 9.34 s


Unnamed: 0,person_id,concept_name
0,392775850,1000 ML Glucose 50 MG/ML / Potassium Chloride ...
1,392775850,1000 ML Sodium Chloride 9 MG/ML Injection
2,392775850,100 ML Metronidazole 5 MG/ML Injection
3,392775850,10 ML Potassium Chloride 2 MEQ/ML Injection
4,392775850,1 ML Hydromorphone Hydrochloride 1 MG/ML Cartr...
5,392775850,1 ML Vitamin K 1 10 MG/ML Injection
6,392775850,200 ML Ciprofloxacin 2 MG/ML Injection
7,392775850,200 ML Vancomycin 5 MG/ML Injection
8,392775850,20 ML Potassium Chloride 2 MEQ/ML Injection
9,392775850,"250 ML Albumin Human, USP 50 MG/ML Injection [..."


In [5]:
print(f"The data frame has {len(medication_data['person_id'].unique())} patients")
print(f"The data frame includes {len(medication_data['concept_name'].unique())} different conditions")

The data frame has 204 patients
The data frame includes 970 different conditions


In [6]:
new_d = medication_data.groupby(['person_id'])
type(new_d)
print(new_d)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025B8887F340>


In [7]:
new_d = medication_data.groupby(['person_id'])

med_list = []
for person, med in new_d:
#     print(f"Person is {person}")
#     print(comorb['concept_name'])
    med_list.append(list(set(med['concept_name'])))
print(len(med_list))

204


In [8]:
med_list[::] # start, stop, step -> no need for a step here I'd say

[['20 ML Potassium Chloride 2 MEQ/ML Injection',
  'Camphor 5 MG/ML / Menthol 5 MG/ML Topical Lotion [Men-phor]',
  '1000 ML Glucose 50 MG/ML / Potassium Chloride 0.02 MEQ/ML / Sodium Chloride 9 MG/ML Injection',
  'Levothyroxine Sodium 0.2 MG Injection',
  '50 ML Albumin Human, USP 250 MG/ML Injection',
  '500 ML Sodium Chloride 9 MG/ML Injection',
  '50 ML Potassium Chloride 0.4 MEQ/ML Injection',
  '50 ML Glucose 50 MG/ML Injection',
  'Calcium Chloride 0.0014 MEQ/ML / Potassium Chloride 0.004 MEQ/ML / Sodium Chloride 0.103 MEQ/ML / Sodium Lactate 0.028 MEQ/ML Injectable Solution',
  '1 ML Vitamin K 1 10 MG/ML Injection',
  '2 ML Metoclopramide 5 MG/ML Injection',
  'Acetaminophen 325 MG Oral Tablet',
  '5 ML heparin sodium, porcine 10 UNT/ML Prefilled Syringe',
  'Metoprolol',
  'Potassium Chloride 10 MEQ Extended Release Oral Tablet [Klor-Con]',
  'Vitamin K 1 5 MG Oral Tablet',
  'Piperacillin 2000 MG / tazobactam 250 MG Injection [Zosyn]',
  '500 ML Albumin Human, USP 50 MG/ML I

In [10]:
# trying to make it sparse
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
med_ary = te.fit(med_list).transform(med_list, sparse=True)
df = pd.DataFrame.sparse.from_spmatrix(med_ary, columns=te.columns_)
df.head(25)

Unnamed: 0,0.3 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe [Lovenox],0.4 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe [Lovenox],0.4 ML Fondaparinux sodium 12.5 MG/ML Prefilled Syringe [Arixtra],0.5 ML Fondaparinux sodium 5 MG/ML Prefilled Syringe [Arixtra],"0.5 ML Haemophilus influenzae type b strain 1482, capsular polysaccharide inactivated tetanus toxoid conjugate vaccine 0.068 MG/ML Injection [ActHIB]","0.5 ML Influenza Virus Vaccine, Inactivated A-California-07-2009 X-181 (H1N1) strain 0.03 MG/ML / Influenza Virus Vaccine, Inactivated A-Victoria-210-2009 X-187 (H3N2) (A-Perth-16-2009) strain 0.03 MG/ML / Influenza Virus Vaccine, Inactivated B-Brisbane-6",0.5 ML Neisseria meningitidis serogroup A capsular polysaccharide diphtheria toxoid protein conjugate vaccine 0.104 MG/ML / Neisseria meningitidis serogroup C capsular polysaccharide diphtheria toxoid protein conjugate vaccine 0.104 MG/ML / Neisseria meni,0.5 ML Vitamin K 1 2 MG/ML Injection,0.5 ML meningococcal group A polysaccharide 0.1 MG/ML / meningococcal group C polysaccharide 0.1 MG/ML / MENINGOCOCCAL POLYSACCHARIDE VACCINE GROUP W-135 0.1 MG/ML / MENINGOCOCCAL POLYSACCHARIDE VACCINE GROUP Y 0.1 MG/ML Injection [Menomune A/C/Y/W-135],0.5 ML pneumococcal capsular polysaccharide type 1 vaccine 0.05 MG/ML / pneumococcal capsular polysaccharide type 10A vaccine 0.05 MG/ML / pneumococcal capsular polysaccharide type 11A vaccine 0.05 MG/ML / pneumococcal capsular polysaccharide type 12F vac,...,trandolapril 1 MG Oral Tablet,trisodium citrate,valganciclovir 450 MG Oral Tablet [Valcyte],valsartan 160 MG Oral Tablet [Diovan],valsartan 40 MG Oral Tablet [Diovan],valsartan 80 MG Oral Tablet [Diovan],venlafaxine 37.5 MG Oral Tablet,voriconazole 200 MG Injection [Vfend],{10 (Prednisone 10 MG Oral Tablet) } Pack,{4 (risedronate sodium 35 MG Oral Tablet [Actonel]) } Pack [Actonel 35 4-Week]
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
sparse_te_ary = te.fit(med_list).transform(med_list, sparse=True)
sparse_med_df = pd.DataFrame.sparse.from_spmatrix(sparse_te_ary, columns=te.columns_)
sparse_med_df

In [None]:
%%time
from mlxtend.frequent_patterns import apriori
support = 0.1 # choosing this b/c .05 gave us 26476502 itemsets in 1hr 4min -> very slow
frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
print(f"Using a support of {support}, found {len(frequent_itemsets)} frequent itemsets")
frequent_itemsets.head(25)

In [27]:
pd.options.display.max_rows = 50

In [28]:
frequent_itemsets['count'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets.head(25)

Unnamed: 0,support,itemsets,count
0,0.117647,(0.5 ML pneumococcal capsular polysaccharide t...,1
1,0.147059,(1 ML Hydralazine Hydrochloride 20 MG/ML Injec...,1
2,0.210784,(1 ML Hydromorphone Hydrochloride 1 MG/ML Cart...,1
3,0.142157,(1 ML Hydromorphone Hydrochloride 2 MG/ML Cart...,1
4,0.181373,(1 ML Hydromorphone Hydrochloride 4 MG/ML Inje...,1
5,0.313725,(1 ML Lorazepam 2 MG/ML Cartridge),1
6,0.392157,(1 ML Morphine Sulfate 2 MG/ML Prefilled Syringe),1
7,0.142157,(1 ML Morphine Sulfate 4 MG/ML Injection),1
8,0.205882,(1 ML Phenylephrine Hydrochloride 10 MG/ML Inj...,1
9,0.416667,(10 ML Calcium Gluconate 100 MG/ML Injection),1


In [29]:
fi = frequent_itemsets.sort_values(by = ['support'], ascending = False)
print(f"Highest support is {fi.iloc[0]['support']} with a count of {fi.iloc[0]['count']}")
print(f"Lowest support is {fi.iloc[-1]['support']} with a count of {fi.iloc[-1]['count']}")
fi.head(25)

Highest support is 0.7156862745098039 with a count of 1
Lowest support is 0.10294117647058823 with a count of 11


Unnamed: 0,support,itemsets,count
106,0.715686,(Sodium Chloride Prefilled Syringe),1
115,0.647059,"(heparin sodium, porcine 5000 UNT/ML Injectabl...",1
48,0.642157,(500 ML Sodium Chloride 9 MG/ML Injection),1
32,0.573529,(250 ML Sodium Chloride 9 MG/ML Injection),1
20,0.54902,(1000 ML Sodium Chloride 9 MG/ML Injection),1
36,0.539216,(5 ML Metoprolol Tartrate 1 MG/ML Injection),1
1881,0.534314,"(Sodium Chloride Prefilled Syringe, heparin so...",2
1470,0.534314,"(Sodium Chloride Prefilled Syringe, 500 ML Sod...",2
62,0.534314,(Calcium Chloride 0.0014 MEQ/ML / Potassium Ch...,1
51,0.529412,(Acetaminophen 325 MG Oral Tablet),1


In [30]:
fige2 = fi[fi['count'] >= 2]
fige2.head(25)

Unnamed: 0,support,itemsets,count
1881,0.534314,"(Sodium Chloride Prefilled Syringe, heparin so...",2
1470,0.534314,"(Sodium Chloride Prefilled Syringe, 500 ML Sod...",2
1475,0.480392,"(heparin sodium, porcine 5000 UNT/ML Injectabl...",2
744,0.480392,"(Sodium Chloride Prefilled Syringe, 1000 ML So...",2
1051,0.47549,"(500 ML Sodium Chloride 9 MG/ML Injection, 250...",2
1226,0.465686,"(heparin sodium, porcine 5000 UNT/ML Injectabl...",2
1622,0.460784,"(Sodium Chloride Prefilled Syringe, Calcium Ch...",2
1095,0.455882,"(Sodium Chloride Prefilled Syringe, 250 ML Sod...",2
705,0.455882,"(1000 ML Sodium Chloride 9 MG/ML Injection, 50...",2
1716,0.45098,"(Sodium Chloride Prefilled Syringe, Fluid inta...",2
