In [2]:
import pandas as pd

In [3]:
patients = pd.read_csv("mimic-data/patients.csv.gz", compression='gzip')
print(len(patients))

diagnoses_icd = pd.read_csv("mimic-data/diagnoses_icd.csv.gz", compression='gzip')
print(len(diagnoses_icd))


discharge = pd.read_csv("mimic-data/discharge.csv.gz", compression='gzip')
print(len(discharge))

prescriptions = pd.read_csv("mimic-data/prescriptions.csv.gz", compression='gzip')
print(len(prescriptions))

patients.columns, diagnoses_icd.columns, discharge.columns, prescriptions.columns

364627
6364488
331793


  prescriptions = pd.read_csv("mimic-data/prescriptions.csv.gz", compression='gzip')


20292611


(Index(['subject_id', 'gender', 'anchor_age', 'anchor_year',
        'anchor_year_group', 'dod'],
       dtype='object'),
 Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'], dtype='object'),
 Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
        'charttime', 'storetime', 'text'],
       dtype='object'),
 Index(['subject_id', 'hadm_id', 'pharmacy_id', 'poe_id', 'poe_seq',
        'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug',
        'formulary_drug_cd', 'gsn', 'ndc', 'prod_strength', 'form_rx',
        'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
        'doses_per_24_hrs', 'route'],
       dtype='object'))

In [4]:
#get individuals with a depression diagnosis

mimic_df_icd10 = diagnoses_icd[diagnoses_icd["icd_version"] == 10]
depression = mimic_df_icd10[mimic_df_icd10["icd_code"].str.match(r"^F3[23]")]
len(depression)


49471

In [5]:
#merge in discharge 
discharge_selected = discharge[["note_id", "hadm_id", "note_seq", "text"]]
merged = pd.merge(depression, discharge_selected, on="hadm_id", how = "inner")
len(merged)

24289

In [6]:
#merge in patients
patients_selected = patients[["subject_id", "gender", "anchor_age"]]
merged = pd.merge(merged, patients_selected, on="subject_id", how="inner")
len(merged)

24289

In [7]:
#merge in prescriptions
prescription_selected = prescriptions[["hadm_id", "poe_id", "drug", "gsn", "ndc"]]
merged = pd.merge(merged, prescription_selected, on="hadm_id", how="left")
len(merged)


1254818

## experiment with MIMIC-IV-ED (FINAL)

In [8]:
import pandas as pd


#the two datasets from MIMIC-IV-ED
edstays = pd.read_csv("mimic-data/edstays.csv.gz", compression='gzip')
print(len(edstays))
medrecon = pd.read_csv("mimic-data/medrecon.csv.gz", compression='gzip')
print(len(medrecon))

425087
2987342


In [9]:
medrecon.columns, edstays.columns
medrecon_selected = medrecon[["subject_id", "stay_id", "name", "gsn", "ndc", "etc_rn", "etccode", "etcdescription"]]
edstays_selected = edstays[["subject_id", "hadm_id", "stay_id", "gender", "race"]]

In [10]:
ed_merged = pd.merge(edstays_selected, medrecon_selected, on=['subject_id', 'stay_id'], how='inner')
len(ed_merged)

2987342

In [11]:
#GETTING 2 datasets from MIMIC-IV
patients = pd.read_csv("mimic-data/patients.csv.gz", compression='gzip')
print(len(patients))

diagnoses_icd = pd.read_csv("mimic-data/diagnoses_icd.csv.gz", compression='gzip')
print(len(diagnoses_icd))


discharge = pd.read_csv("mimic-data/discharge.csv.gz", compression='gzip')
print(len(discharge))


364627
6364488
331793


In [12]:
#Get individuals with DIAGNOSES DEPRESSION
diagnoses_icd10 = diagnoses_icd[diagnoses_icd["icd_version"] == 10]
depression_patients = diagnoses_icd10[diagnoses_icd10["icd_code"].str.match(r"^F3[23]")]
len(depression_patients)

49471

In [13]:
#merge with patients
patients_selected = patients[["subject_id", "anchor_age"]]
depression_patients_with_demographics = pd.merge(depression_patients, patients_selected, on="subject_id", how="inner")
len(depression_patients_with_demographics)

49471

In [14]:
#merge in discharge notes 
discharge_selected = discharge[["subject_id","note_id", "hadm_id", "note_seq", "text"]]
depression_patients_with_demographics_and_notes = pd.merge(depression_patients_with_demographics, discharge_selected, on=["subject_id","hadm_id"], how = "inner")
len(depression_patients_with_demographics_and_notes)

24289

## MERGING

In [15]:
#merge by subject_id and hadm_id
final = pd.merge(depression_patients_with_demographics_and_notes, ed_merged, on=["subject_id", "hadm_id"], how="inner")
len(final), final["subject_id"].nunique(), final["race"].nunique(), final["anchor_age"].describe()

(220429,
 9121,
 30,
 count    220429.000000
 mean         59.305019
 std          15.933945
 min          18.000000
 25%          49.000000
 50%          60.000000
 75%          71.000000
 max          91.000000
 Name: anchor_age, dtype: float64)

In [16]:
final.columns
final.to_csv("final_mimic.csv")

## further exploration

In [17]:
#filter by keeping only individuals with antidepressants
only_antidepressant = final[final['etcdescription'].str.contains("Antidepressant", na=False)]
len(only_antidepressant), only_antidepressant["subject_id"].nunique(), only_antidepressant["hadm_id"].nunique()

(16461, 7184, 11903)

In [18]:
#any therapies mentioned?
#patterns = ["psychotherapy", "Psychotherapy", "psycho therapy", "Psycho Therapy", "Psycho therapy", "psycho-therapy", "Psycho-therapy"]
#patterns = ["CBT", "cognitive behavioral therapy", "Cognitive Behavioral Therapy", "cognitive-behavioral therapy", "Cognitive-Behavioral Therapy"]
patterns = ["DBT", "dialectical behavior therapy", "Dialectical Behavior Therapy", "dialectical-behavioral therapy", "Dialectical-Behavioral Therapy"]

all_patterns = "|".join(patterns)
any_therapies = final[final['text'].str.contains(all_patterns, na=False)]
len(any_therapies)

174