In [1]:
# Google Cloud project that the queries are run (and billed) under.
# Replace this with the ID of your own GCP project before running the notebook.
PROJECT_ID = 'halogen-rarity-312520'

In [2]:
import os
from google.colab import auth

# Location of the source tables in BigQuery: hosting project, dataset name,
# and the region the dataset is stored in.
DATASET_PROJECT_ID = "amsterdamumcdb-data"
DATASET_ID = "ams102"
LOCATION = "eu"

# The Google client libraries pick up the default project from this
# environment variable, so export it before any client is created.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

# Colab helper that authenticates the current user for Google Cloud access.
auth.authenticate_user()
print("Authenticated")

Authenticated


In [3]:
from google.cloud.bigquery import magics
from google.cloud import bigquery

# Default query job configuration: unqualified table names resolve against
# "<dataset project>.<dataset>".
# NOTE(review): neither def_config nor client_options is referenced again in the
# visible part of this notebook (the pandas queries use `config_gbq` instead) —
# confirm whether they were meant to be attached to `magics.context`.
def_config = bigquery.job.QueryJobConfig(
    default_dataset=f"{DATASET_PROJECT_ID}.{DATASET_ID}"
)

# Client options carrying the dataset location.
client_options = {'location': LOCATION}

In [4]:
import pandas as pd
import numpy as np
import re
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [5]:
# Let pandas print up to 100 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 100

In [6]:
# Job configuration handed to every `pd.read_gbq` call below, so the SQL can
# use bare table names (e.g. `listitems`) instead of fully qualified ones.
# NOTE(review): 'Location' sits inside the 'query' section here; confirm the
# BigQuery API actually honours it in this position.
_default_dataset = {
    "datasetId": DATASET_ID,
    "projectId": DATASET_PROJECT_ID,
}
config_gbq = {
    'query': {
        'defaultDataset': _default_dataset,
        'Location': LOCATION,
    }
}

In [7]:
# BigQuery SQL returning every column of `admissions` plus, per admission:
#   * diagnosis / diagnosis_id / diagnosis_type / surgical: the single preferred
#     admission diagnosis (NICE APACHE IV > NICE APACHE II > APACHE IV > APACHE II
#     > legacy ICU > legacy MCU, most recent `measuredat` first) and whether that
#     diagnosis item is flagged as surgical;
#   * sepsis_at_admission: most recent value of list item 15808 (1 = 'Ja', 0 = 'Nee');
#   * sepsis_antibiotics_*: non-prophylactic antibiotics with `start` below 6 hours;
#   * other_antibiotics_*: 'prophylactic' antibiotics with `start` below 24 hours;
#   * sepsis_cultures_*: selected cultures with `registeredat` below 6 hours.
# All joins are LEFT JOINs, so admissions without a match keep NULLs in those columns.
#
# Fix: in the `cultures` item list, 9198 (Rectumkweek) was written as `-9198,`
# (single dash) while every other routinely-drawn culture is commented out with
# `--`. The single dash made the literal -9198 part of the IN list; it is now a
# proper SQL comment like its neighbours.
diagnoses_sql = '''
WITH diagnoses AS (
    SELECT admissionid,
        item, 
        value as diagnosis,
        CASE
            WHEN itemid IN (
                --SURGICAL
                13116, --D_Thoraxchirurgie_CABG en Klepchirurgie
                16671, --DMC_Thoraxchirurgie_CABG en Klepchirurgie
                13117, --D_Thoraxchirurgie_Cardio anders
                16672, --DMC_Thoraxchirurgie_Cardio anders
                13118, --D_Thoraxchirurgie_Aorta chirurgie
                16670, --DMC_Thoraxchirurgie_Aorta chirurgie
                13119, --D_Thoraxchirurgie_Pulmonale chirurgie
                16673, --DMC_Thoraxchirurgie_Pulmonale chirurgie

                --Not surgical: 13141, --D_Algemene chirurgie_Algemeen   
                --Not surgical: 16642, --DMC_Algemene chirurgie_Algemeen
                13121, --D_Algemene chirurgie_Buikchirurgie
                16643, --DMC_Algemene chirurgie_Buikchirurgie
                13123, --D_Algemene chirurgie_Endocrinologische chirurgie
                16644, --DMC_Algemene chirurgie_Endocrinologische chirurgie
                13145, --D_Algemene chirurgie_KNO/Overige
                16645, --DMC_Algemene chirurgie_KNO/Overige
                13125, --D_Algemene chirurgie_Orthopedische chirurgie
                16646, --DMC_Algemene chirurgie_Orthopedische chirurgie
                13122, --D_Algemene chirurgie_Transplantatie chirurgie
                16647, --DMC_Algemene chirurgie_Transplantatie chirurgie
                13124, --D_Algemene chirurgie_Trauma
                16648, --DMC_Algemene chirurgie_Trauma
                13126, --D_Algemene chirurgie_Urogenitaal
                16649, --DMC_Algemene chirurgie_Urogenitaal
                13120, --D_Algemene chirurgie_Vaatchirurgie
                16650, --DMC_Algemene chirurgie_Vaatchirurgie

                13128, --D_Neurochirurgie _Vasculair chirurgisch
                16661, --DMC_Neurochirurgie _Vasculair chirurgisch
                13129, --D_Neurochirurgie _Tumor chirurgie
                16660, --DMC_Neurochirurgie _Tumor chirurgie
                13130, --D_Neurochirurgie_Overige
                16662, --DMC_Neurochirurgie_Overige

                18596, --Apache II Operatief  Gastr-intenstinaal
                18597, --Apache II Operatief Cardiovasculair
                18598, --Apache II Operatief Hematologisch
                18599, --Apache II Operatief Metabolisme
                18600, --Apache II Operatief Neurologisch
                18601, --Apache II Operatief Renaal
                18602, --Apache II Operatief Respiratoir

                17008, --APACHEIV Post-operative cardiovascular
                17009, --APACHEIV Post-operative gastro-intestinal
                17010, --APACHEIV Post-operative genitourinary
                17011, --APACHEIV Post-operative hematology
                17012, --APACHEIV Post-operative metabolic
                17013, --APACHEIV Post-operative musculoskeletal /skin
                17014, --APACHEIV Post-operative neurologic
                17015, --APACHEIV Post-operative respiratory
                17016, --APACHEIV Post-operative transplant
                17017 --APACHEIV Post-operative trauma

            ) THEN 1
            WHEN itemid = 18669 AND valueid BETWEEN 1 AND 26 THEN 1 --NICE APACHEII diagnosen
            WHEN itemid = 18671 AND valueid BETWEEN 222 AND 452 THEN 1 --NICE APACHEIV diagnosen
            ELSE 0
        END AS surgical,
        valueid as diagnosis_id,
        CASE 
                WHEN itemid = 18671 THEN 'NICE APACHE IV'
                WHEN itemid = 18669 THEN 'NICE APACHE II'
                WHEN itemid BETWEEN 16998 AND 17017 THEN 'APACHE IV'
                WHEN itemid BETWEEN 18589 AND 18602 THEN 'APACHE II'
                WHEN itemid BETWEEN 13116 AND 13145 THEN 'Legacy ICU'
                WHEN itemid BETWEEN 16642 AND 16673 THEN 'Legacy MCU'
        END AS diagnosis_type,
        ROW_NUMBER() OVER(PARTITION BY admissionid
        ORDER BY 
            CASE --prefer NICE > APACHE IV > II > D
                WHEN itemid = 18671 THEN 6 --NICE APACHEIV diagnosen
                WHEN itemid = 18669 THEN 5 --NICE APACHEII diagnosen                
                WHEN itemid BETWEEN 16998 AND 17017 THEN 4 --APACHE IV diagnosis        
                WHEN itemid BETWEEN 18589 AND 18602 THEN 3 --APACHE II diagnosis
                WHEN itemid BETWEEN 13116 AND 13145 THEN 2 --D diagnosis ICU
                WHEN itemid BETWEEN 16642 AND 16673 THEN 1 --DMC diagnosis Medium Care
            END DESC,
            measuredat DESC) AS rownum
    FROM listitems
    WHERE itemid IN (
        -- Diagnosis - LEVEL 2
        --SURGICAL
        13116, --D_Thoraxchirurgie_CABG en Klepchirurgie
        16671, --DMC_Thoraxchirurgie_CABG en Klepchirurgie
        13117, --D_Thoraxchirurgie_Cardio anders
        16672, --DMC_Thoraxchirurgie_Cardio anders
        13118, --D_Thoraxchirurgie_Aorta chirurgie
        16670, --DMC_Thoraxchirurgie_Aorta chirurgie
        13119, --D_Thoraxchirurgie_Pulmonale chirurgie
        16673, --DMC_Thoraxchirurgie_Pulmonale chirurgie
        
        13141, --D_Algemene chirurgie_Algemeen   
        16642, --DMC_Algemene chirurgie_Algemeen
        13121, --D_Algemene chirurgie_Buikchirurgie
        16643, --DMC_Algemene chirurgie_Buikchirurgie
        13123, --D_Algemene chirurgie_Endocrinologische chirurgie
        16644, --DMC_Algemene chirurgie_Endocrinologische chirurgie
        13145, --D_Algemene chirurgie_KNO/Overige
        16645, --DMC_Algemene chirurgie_KNO/Overige
        13125, --D_Algemene chirurgie_Orthopedische chirurgie
        16646, --DMC_Algemene chirurgie_Orthopedische chirurgie
        13122, --D_Algemene chirurgie_Transplantatie chirurgie
        16647, --DMC_Algemene chirurgie_Transplantatie chirurgie
        13124, --D_Algemene chirurgie_Trauma
        16648, --DMC_Algemene chirurgie_Trauma
        13126, --D_Algemene chirurgie_Urogenitaal
        16649, --DMC_Algemene chirurgie_Urogenitaal
        13120, --D_Algemene chirurgie_Vaatchirurgie
        16650, --DMC_Algemene chirurgie_Vaatchirurgie

        13128, --D_Neurochirurgie _Vasculair chirurgisch
        16661, --DMC_Neurochirurgie _Vasculair chirurgisch
        13129, --D_Neurochirurgie _Tumor chirurgie
        16660, --DMC_Neurochirurgie _Tumor chirurgie
        13130, --D_Neurochirurgie_Overige
        16662, --DMC_Neurochirurgie_Overige
        
        18596, --Apache II Operatief  Gastr-intenstinaal
        18597, --Apache II Operatief Cardiovasculair
        18598, --Apache II Operatief Hematologisch
        18599, --Apache II Operatief Metabolisme
        18600, --Apache II Operatief Neurologisch
        18601, --Apache II Operatief Renaal
        18602, --Apache II Operatief Respiratoir
        
        17008, --APACHEIV Post-operative cardiovascular
        17009, --APACHEIV Post-operative gastro-intestinal
        17010, --APACHEIV Post-operative genitourinary
        17011, --APACHEIV Post-operative hematology
        17012, --APACHEIV Post-operative metabolic
        17013, --APACHEIV Post-operative musculoskeletal /skin
        17014, --APACHEIV Post-operative neurologic
        17015, --APACHEIV Post-operative respiratory
        17016, --APACHEIV Post-operative transplant
        17017, --APACHEIV Post-operative trauma

        --MEDICAL
        13133, --D_Interne Geneeskunde_Cardiovasculair
        16653, --DMC_Interne Geneeskunde_Cardiovasculair
        13134, --D_Interne Geneeskunde_Pulmonaal
        16658, --DMC_Interne Geneeskunde_Pulmonaal
        13135, --D_Interne Geneeskunde_Abdominaal
        16652, --DMC_Interne Geneeskunde_Abdominaal
        13136, --D_Interne Geneeskunde_Infectieziekten
        16655, --DMC_Interne Geneeskunde_Infectieziekten
        13137, --D_Interne Geneeskunde_Metabool
        16656, --DMC_Interne Geneeskunde_Metabool
        13138, --D_Interne Geneeskunde_Renaal
        16659, --DMC_Interne Geneeskunde_Renaal
        13139, --D_Interne Geneeskunde_Hematologisch
        16654, --DMC_Interne Geneeskunde_Hematologisch
        13140, --D_Interne Geneeskunde_Overige
        16657, --DMC_Interne Geneeskunde_Overige

        13131, --D_Neurologie_Vasculair neurologisch
        16664, --DMC_Neurologie_Vasculair neurologisch
        13132, --D_Neurologie_Overige
        16663, --DMC_Neurologie_Overige 
        13127, --D_KNO/Overige
        
        18589, --Apache II Non-Operatief Cardiovasculair
        18590, --Apache II Non-Operatief Gastro-intestinaal
        18591, --Apache II Non-Operatief Hematologisch
        18592, --Apache II Non-Operatief Metabolisme
        18593, --Apache II Non-Operatief Neurologisch
        18594, --Apache II Non-Operatief Renaal
        18595, --Apache II Non-Operatief Respiratoir
        
        16998, --APACHE IV Non-operative cardiovascular
        16999, --APACHE IV Non-operative Gastro-intestinal
        17000, --APACHE IV Non-operative genitourinary
        17001, --APACHEIV  Non-operative haematological
        17002, --APACHEIV  Non-operative metabolic
        17003, --APACHEIV Non-operative musculo-skeletal
        17004, --APACHEIV Non-operative neurologic
        17005, --APACHEIV Non-operative respiratory
        17006, --APACHEIV Non-operative transplant
        17007, --APACHEIV Non-operative trauma
        
        --NICE: surgical/medical combined in same parameter
        18669, --NICE APACHEII diagnosen
        18671 --NICE APACHEIV diagnosen
    )
), sepsis AS (
    SELECT
        admissionid,
        CASE valueid
            WHEN 1 THEN 1 --'Ja'
            WHEN 2 THEN 0 --'Nee'
        END as sepsis_at_admission,
        ROW_NUMBER() OVER(
            PARTITION BY 
                admissionid
            ORDER BY 
                measuredat DESC) AS rownum
    FROM listitems
    WHERE 
        itemid = 15808
), sepsis_antibiotics AS ( --non prophylactic antibiotics
    SELECT
        admissionid,
        CASE 
            WHEN COUNT(*) > 0 THEN 1
            ELSE 0
        END AS sepsis_antibiotics_bool,
        STRING_AGG(DISTINCT item, '; ') AS sepsis_antibiotics_given
    FROM drugitems
    WHERE 
        itemid IN (
            6834, --Amikacine (Amukin)
            6847, --Amoxicilline (Clamoxyl/Flemoxin)
            6871, --Benzylpenicilline (Penicilline)
            6917, --Ceftazidim (Fortum)
            --6919, --Cefotaxim (Claforan) -> prophylaxis
            6948, --Ciprofloxacine (Ciproxin)
            6953, --Rifampicine (Rifadin)
            6958, --Clindamycine (Dalacin)
            7044, --Tobramycine (Obracin)
            --7064, --Vancomycine -> prophylaxis for valve surgery
            7123, --Imipenem (Tienam)
            7185, --Doxycycline (Vibramycine)
            --7187, --Metronidazol (Flagyl) -> often used for GI surgical prophylaxis
            --7208, --Erythromycine (Erythrocine) -> often used for gastroparesis
            7227, --Flucloxacilline (Stafoxil/Floxapen)
            7231, --Fluconazol (Diflucan)
            7232, --Ganciclovir (Cymevene)
            7233, --Flucytosine (Ancotil)
            7235, --Gentamicine (Garamycin)
            7243, --Foscarnet trinatrium (Foscavir)
            7450, --Amfotericine B (Fungizone)
            --7504, --X nader te bepalen --non-stock medication
            8127, --Meropenem (Meronem)
            8229, --Myambutol (ethambutol)
            8374, --Kinine dihydrocloride
            --8375, --Immunoglobuline (Nanogam) -> not anbiotic
            --8394, --Co-Trimoxazol (Bactrimel) -> often prophylactic (unless high dose)
            8547, --Voriconazol(VFEND)
            --9029, --Amoxicilline/Clavulaanzuur (Augmentin) -> often used for ENT surgical prophylaxis
            9030, --Aztreonam (Azactam)
            9047, --Chlooramfenicol
            --9075, --Fusidinezuur (Fucidin) -> prophylaxis
            9128, --Piperacilline (Pipcil)
            9133, --Ceftriaxon (Rocephin)
            --9151, --Cefuroxim (Zinacef) -> often used for GI/transplant surgical prophylaxis
            --9152, --Cefazoline (Kefzol) -> prophylaxis for cardiac surgery
            9458, --Caspofungine
            9542, --Itraconazol (Trisporal)
            --9602, --Tetanusimmunoglobuline -> prophylaxis/not antibiotic
            12398, --Levofloxacine (Tavanic)
            12772, --Amfotericine B lipidencomplex  (Abelcet)
            15739, --Ecalta (Anidulafungine)
            16367, --Research Anidulafungin/placebo
            16368, --Research Caspofungin/placebo
            18675, --Amfotericine B in liposomen (Ambisome )
            19137, --Linezolid (Zyvoxid)
            19764, --Tigecycline (Tygacil)
            19773, --Daptomycine (Cubicin)
            20175 --Colistine
        )
        AND start < 6*60*60*1000 --within 6 hours (to correct for antibiotics administered before ICU)
    GROUP BY admissionid
), other_antibiotics AS ( --'prophylactic' antibiotics that may be used for sepsis
    SELECT
        admissionid,
        CASE 
            WHEN COUNT(*) > 0 THEN 1
            ELSE 0
        END AS other_antibiotics_bool,
        STRING_AGG(DISTINCT item, '; ') AS other_antibiotics_given
    FROM drugitems
    WHERE 
        itemid IN (
            7064, --Vancomycine -> prophylaxis for valve surgery
            7187, --Metronidazol (Flagyl) -> often used for GI surgical prophylaxis
            8394, --Co-Trimoxazol (Bactrimel) -> often prophylactic (unless high dose)
            9029, --Amoxicilline/Clavulaanzuur (Augmentin) -> often used for ENT surgical prophylaxis
            9151, --Cefuroxim (Zinacef) -> often used for GI surgical prophylaxis
            9152 --Cefazoline (Kefzol) -> prophylaxis
        )
        AND start < 24*60*60*1000 --within 24 hours (to correct for antibiotics administered before ICU)
    GROUP BY admissionid    
), cultures AS (
    SELECT
        admissionid,
        CASE 
            WHEN COUNT(*) > 0 THEN 1
            ELSE 0
        END AS sepsis_cultures_bool,
        STRING_AGG(DISTINCT item, '; ') AS sepsis_cultures_drawn
    FROM procedureorderitems
    WHERE 
        itemid IN (
        --8097, --Sputumkweek afnemen -> often used routinely
        --8418, --Urinekweek afnemen
        --8588, --MRSA kweken afnemen 
        9189, --Bloedkweken afnemen
        9190, --Cathetertipkweek afnemen
        --9191, --Drainvochtkweek afnemen
        --9192, --Faeceskweek afnemen -> Clostridium
        --9193, --X-Kweek nader te bepalen
        --9194, --Liquorkweek afnemen
        --9195, --Neuskweek afnemen
        --9197, --Perineumkweek afnemen -> often used routinely
        --9198, --Rectumkweek afnemen -> often used routinely
        9200, --Wondkweek afnemen
        9202, --Ascitesvochtkweek afnemen
        --9203, --Keelkweek afnemen -> often used routinely
        --9204, --SDD-kweken afnemen -> often used routinely
        9205 --Legionella sneltest (urine)
        --1302, --SDD Inventarisatiekweken afnemen -> often used routinely
        --19663, --Research Neuskweek COUrSe
        --19664, --Research Sputumkweek COUrSe
        )
        AND registeredat < 6*60*60*1000 --within 6 hours
    GROUP BY admissionid
)
SELECT 
    admissions.* 
    , diagnosis_type
    , diagnosis, diagnosis_id
    , surgical
    , sepsis_at_admission
    , sepsis_antibiotics_bool
    , sepsis_antibiotics_given
    , other_antibiotics_bool
    , other_antibiotics_given
    , sepsis_cultures_bool
    , sepsis_cultures_drawn
FROM admissions
LEFT JOIN diagnoses on admissions.admissionid = diagnoses.admissionid
LEFT JOIN sepsis on admissions.admissionid = sepsis.admissionid
LEFT JOIN sepsis_antibiotics on admissions.admissionid = sepsis_antibiotics.admissionid
LEFT JOIN other_antibiotics on admissions.admissionid = other_antibiotics.admissionid
LEFT JOIN cultures on admissions.admissionid = cultures.admissionid
WHERE --only last updated record
    (diagnoses.rownum = 1 OR diagnoses.rownum IS NULL) AND 
    (sepsis.rownum = 1 OR sepsis.rownum IS NULL) 
;
'''

In [None]:
# Pull every list-item record where heart rhythm (itemid 6671, 'Hartritme') was
# charted as atrial fibrillation (valueid 13, 'Atr fib'). Only the admission id,
# item/value ids and the measurement time are kept; the other columns stay
# commented out in the query.
af_query = '''
    SELECT
      admissionid
      ,itemid
      --,item
      ,valueid
      --,value
      ,measuredat
      --,registeredat
      --,registeredby
      --,updatedat
      --,updatedby
      --,islabresult
    FROM listitems
    WHERE 
      itemid = 6671	--Hartritme
      AND valueid = 13	--Atr fib
    '''
af = pd.read_gbq(af_query, configuration=config_gbq)

In [None]:
# `measuredat` is compared against millisecond offsets below.
# NOTE(review): presumably milliseconds relative to ICU admission — confirm
# against the dataset documentation.
MS_PER_HOUR = 60 * 60 * 1000

# Inclusion: an AF record before day 5. Exclusion: an AF record within the
# first 30 minutes (treated as pre-existing AF rather than new-onset AF).
# An earlier variant (inclusion between 30 min and 5 days, exclusion for
# measuredat < 0) was tried and abandoned.
af_inclusion_tf = af['measuredat'] < 5 * 24 * MS_PER_HOUR
af_exclusion_tf = af['measuredat'] < 0.5 * MS_PER_HOUR

## Note we will also want to remove all patients who have an AF event before 30 mins if we decide this isn't NOAF
exclude_admissionid = af.loc[af_exclusion_tf, 'admissionid']
include_admissionid = af.loc[af_inclusion_tf, 'admissionid']
has_early_af = include_admissionid.isin(exclude_admissionid)
include_admissionid = include_admissionid[~has_early_af]

# NOTE(review): this counts AF *records*, not distinct admissions — an admission
# with several AF measurements appears several times in the series.
include_admissionid.size

In [None]:
# Run the admissions/diagnosis/sepsis query and preview the first rows.
admissions_df = pd.read_gbq(query=diagnoses_sql, configuration=config_gbq)
admissions_df.head(5)

In [None]:
# Flag AF status per admission. Both flags are restricted to a patient's first
# admission (admissioncount == 1).
first_admission = admissions_df['admissioncount'] == 1

# New-onset AF: an AF record before day 5 and none in the first 30 minutes.
admissions_df['afib'] = (
    admissions_df['admissionid'].isin(include_admissionid) & first_admission
)

## patients we assume have AF but not NOAF, by criterion given
admissions_df['previous_afib'] = (
    admissions_df['admissionid'].isin(exclude_admissionid) & first_admission
)

# Show both counts. Previously the first `.sum()` sat in the middle of the cell,
# so its value was computed and silently discarded.
admissions_df[['afib', 'previous_afib']].sum()

**Identify cardiothoracic patients**

In [None]:
# Broad cardiothoracic-surgery pattern. It used to be assigned to `re_cardiosurg`
# and then immediately overwritten by the narrow pattern below (a dead store);
# it is kept under its own name so it stays available without being shadowed.
re_cardiothoracic_broad = (
    r'(?:CABG|AVR|hartchirurgie|heart surgery|Chron. cardiovasculaire ziekte|hartkleppen'
    r'|cardiovascula|MVP|MVR|mitral|tricuspid|pericard|aortic.*valve|lobectom|segment|thorax|Bentall|aorta-ascendens|aorta-boog'
    r'|aorta-wortel|aorta-descendens|lung|pneumectomie|bullectom|respiratoir neoplasm|thoracoscop|thoracotom(?:y|ie)|respirato'
    r'|vrije wand ruptuur|VSR|ASD|pleurectom|intracardiac|aneurysmectom|congenital defect repair)(?! for esophag)'
)

# Pattern actually used downstream: CABG / valve surgery only.
# The group is non-capturing: `Series.str.contains` emits a "pattern has match
# groups" UserWarning for capturing groups, and matching is otherwise identical.
re_cardiosurg = r'(?:CABG|AVR|MVP|MVR|valve)'

In [None]:
# Patients who had CABG / valve surgery: the admission diagnosis is surgical,
# its text matches the CABG/valve pattern (case-insensitive; a missing diagnosis
# counts as no match), and the admitting specialty is cardiac surgery or cardiology.
admissions_df['is_cardiosurg'] = (
    (admissions_df['surgical'] == 1)
    & admissions_df['diagnosis'].str.contains(re_cardiosurg, na=False, flags=re.IGNORECASE)
    & admissions_df['specialty'].isin(['Cardiochirurgie', 'Cardiologie'])
)

**Identify patients with sepsis**

In [None]:
# Diagnosis-text patterns that suggest sepsis / severe infection.
# One pattern is applied to surgical admissions, the other to medical ones.
re_sepsis_surg = (
    r'sepsis|pneumoni|GI perforation|perforation/rupture|infection|abscess'
    r'|GI Vascular ischemia|diverticular|appendectomy|peritonitis'
)
re_sepsis_med = (
    r'sepsis|septic|infect|pneumoni|cholangitis|pancr|endocarditis|meningitis'
    r'|GI perforation|abces|abscess|darm ischaemie|GI vascular|fasciitis'
    r'|inflammatory|peritonitis'
)

In [None]:
# An explicit "no sepsis" on the admission form (sepsis_at_admission == 0)
# overrules every other indicator; a missing value does not.
not_ruled_out = ~(admissions_df['sepsis_at_admission'] == 0)

# Reason-for-admission text, matched case-insensitively (missing text = no match).
surgical_dx_match = admissions_df['diagnosis'].str.contains(re_sepsis_surg, na=False, flags=re.IGNORECASE)
medical_dx_match = admissions_df['diagnosis'].str.contains(re_sepsis_med, na=False, flags=re.IGNORECASE)

# Surgical admissions with a sepsis-like diagnosis.
admissions_df['is_sepsis_surgical'] = (
    (admissions_df['surgical'] == 1) & surgical_dx_match & not_ruled_out
)

any_sepsis_evidence = (
    # medical admissions with a sepsis-like diagnosis
    ((admissions_df['surgical'] == 0) & medical_dx_match)
    # documentation on the admission form (Early Goal Directed Therapy)
    | (admissions_df['sepsis_at_admission'] == 1)
    # administered (therapeutic) antibiotics
    | (admissions_df['sepsis_antibiotics_bool'] == 1)
    # antibiotics that are sometimes prophylactic, combined with drawn cultures
    | (
        (admissions_df['other_antibiotics_bool'] == 1)
        & (admissions_df['sepsis_cultures_bool'] == 1)
    )
)
admissions_df['is_sepsis'] = (
    (any_sepsis_evidence & not_ruled_out) | admissions_df['is_sepsis_surgical']
)

In [None]:
## query numbers
# Base population: first admissions without AF in the first 30 minutes.
ind_all = (admissions_df['admissioncount'] == 1) & (admissions_df['previous_afib'] == 0)

# (label, cohort mask) pairs; each count is restricted to the base population.
cohort_counts = [
    ('CABG/valve surgery (no previous AF)', admissions_df['is_cardiosurg']),
    ('CABG/valve surgery x NOAF', admissions_df['is_cardiosurg'] & admissions_df['afib']),
    ('Sepsis (no previous AF)', admissions_df['is_sepsis']),
    ('Sepsis x NOAF', admissions_df['is_sepsis'] & admissions_df['afib']),
]
for label, cohort in cohort_counts:
    print(label)
    print((cohort & ind_all).sum())

**Prepare dataframe**

In [None]:
# Keep only first admissions without AF in the first 30 minutes.
ind_all = (admissions_df['admissioncount'] == 1) & (admissions_df['previous_afib'] == 0)
# .copy() detaches the filtered frame from the original: the following cells add
# columns to it, which would otherwise raise SettingWithCopyWarning.
admissions_df = admissions_df.loc[ind_all].copy()

In [None]:
# Weight group label -> weight in kg used for the BMI bound. Closed bins map to
# their upper edge; the open-ended '110+' bin maps to its only known edge (110).
weight_max_dict = {
    '59-': 59,
    '60-69': 69,
    '70-79': 79,
    '80-89': 89,
    '90-99': 99,
    '100-109': 109,
    '110+': 110,
}
# Height group label -> height in metres. Closed bins map to their lower edge;
# the open-ended '159-' bin maps to its only known edge (1.59).
height_min_dict = {
    '159-': 1.59,
    '160-169': 1.60,
    '170-179': 1.70,
    '180-189': 1.80,
    '190+': 1.90,
}

# Largest weight over smallest height squared gives an upper-bound BMI per admission.
weight_kg = admissions_df['weightgroup'].replace(weight_max_dict)
height_m = admissions_df['heightgroup'].replace(height_min_dict)
admissions_df['bmi_max'] = weight_kg / (height_m ** 2)

In [None]:
# Binary era feature: True when the admission falls in the '2010-2016' year group.
admissions_df['admissionyear_2010s'] = admissions_df['admissionyeargroup'].eq('2010-2016')

In [None]:
# Sex as a numeric feature: 1.0 = female ('Vrouw'), 0.0 = any other recorded
# value, NaN = unknown.
gender = admissions_df['gender']
# Treat both an empty string and a NULL/NaN gender as unknown. Previously only
# '' was handled, so a NULL gender was silently encoded as "not female".
gender_missing = gender.isna() | gender.eq('')

# Build the column as float up front so writing NaN into it does not force an
# implicit bool -> object upcast (deprecated in recent pandas).
admissions_df['female'] = gender.eq('Vrouw').astype('float64')
admissions_df.loc[gender_missing, 'female'] = np.nan

In [None]:
# One indicator column per age band; the three youngest bands are merged into '59-'.
age = admissions_df['agegroup']
admissions_df['agegroup_59-'] = age.isin(['18-39', '40-49', '50-59'])

# Remaining bands map one-to-one onto their indicator column.
single_band_columns = {
    'agegroup_60-69': '60-69',
    'agegroup_70-79': '70-79',
    'agegroup_80+': '80+',
}
for column, band in single_band_columns.items():
    admissions_df[column] = age == band

In [None]:
# Turn the 0/1 surgical flag into a boolean; anything other than 1 becomes False.
admissions_df['surgical'] = admissions_df['surgical'].eq(1)

In [None]:
# Number of admissions without a recorded height group.
height_missing = admissions_df['heightgroup'].isna()
height_missing.sum()

In [None]:
# Columns removed before modelling: identifiers and administrative fields,
# raw categoricals that were re-encoded above, and the raw sepsis inputs that
# are already summarised by `is_sepsis`.
# The derived agegroup_* indicator columns are deliberately kept.
exclude_columns = [
    # identifiers / administrative
    'patientid', 'admissioncount', 'location', 'origin',
    'admittedat', 'dischargedat', 'destination', 'dateofdeath',
    'weightsource', 'heightsource', 'diagnosis_type',
    # raw categoricals replaced by derived features
    'admissionyeargroup', 'gender', 'agegroup',
    'weightgroup', 'heightgroup', 'lengthofstay', 'specialty',
    # raw diagnosis / sepsis inputs
    'diagnosis', 'diagnosis_id', 'sepsis_at_admission',
    'sepsis_antibiotics_bool', 'sepsis_antibiotics_given',
    'other_antibiotics_bool', 'other_antibiotics_given',
    'sepsis_cultures_bool', 'sepsis_cultures_drawn',
]
model_df = admissions_df.drop(columns=exclude_columns)

In [None]:
# ohe = OneHotEncoder()
# ohe_age = ohe.fit_transform(model_df[["agegroup"]])
# pd.DataFrame(ohe_age.toarray(), columns=ohe.categories_).head()

In [None]:
# Sepsis cohort for modelling: keep the sepsis admissions, drop the cohort
# flags (constant or not meant as predictors here), then keep complete cases only.
cohort_flags = ['is_cardiosurg', 'is_sepsis', 'surgical']
sepsis_df = (
    model_df.loc[model_df['is_sepsis'] == 1]
    .drop(columns=cohort_flags)
    .dropna(axis=0, how='any')
)

In [None]:
# Split predictors and outcome (`afib`) into train/test parts with a fixed seed;
# the admission id is an identifier, not a predictor.
features = sepsis_df.drop(columns=['admissionid', 'afib'])
outcome = sepsis_df['afib']
train_df, test_df, train_outcome, test_outcome = train_test_split(
    features, outcome, random_state=0
)

In [None]:
# Compute the cost-complexity pruning path of a class-balanced gini tree on the
# training data: the candidate alphas and the leaf impurity at each of them.
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=0,
)
path = clf.cost_complexity_pruning_path(train_df, train_outcome)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Leaf impurity along the pruning path; the last alpha is left out of the plot.
fig, ax = plt.subplots()
ax.plot(
    ccp_alphas[:-1],
    impurities[:-1],
    drawstyle="steps-post",
    marker='o',
)
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# Fit one tree per strictly positive alpha on the pruning path; apart from
# ccp_alpha every tree uses the same settings as the path computation.
tree_params = dict(
    random_state=0,
    max_depth=10,
    min_samples_leaf=10,
    criterion='gini',
    class_weight='balanced',
)
clfs = []
for ccp_alpha in ccp_alphas[ccp_alphas > 0]:
    clf = tree.DecisionTreeClassifier(ccp_alpha=ccp_alpha, **tree_params)
    clf.fit(train_df, train_outcome)
    clfs.append(clf)

# The largest alpha yields the most heavily pruned tree.
last_node_count = clfs[-1].tree_.node_count
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      last_node_count, ccp_alphas[-1]))

In [None]:
# Drop the last (most pruned) tree and its alpha before plotting.
# NOTE: this trims in place, so re-running the cell removes one more entry each time.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]
positive_alphas = ccp_alphas[ccp_alphas > 0]

# Two stacked panels: tree size and tree depth as functions of alpha.
fig, ax = plt.subplots(2, 1)
panels = [
    (ax[0], node_counts, "number of nodes", "Number of nodes vs alpha"),
    (ax[1], depth, "depth of tree", "Depth vs alpha"),
]
for panel, values, ylabel, title in panels:
    panel.plot(positive_alphas, values, marker='o', drawstyle="steps-post")
    panel.set_xlabel("alpha")
    panel.set_ylabel(ylabel)
    panel.set_title(title)
fig.tight_layout()

In [None]:
# Accuracy (`score`) of every pruned tree on the training and the test split.
train_scores = [fitted.score(train_df, train_outcome) for fitted in clfs]
test_scores = [fitted.score(test_df, test_outcome) for fitted in clfs]
positive_alphas = ccp_alphas[ccp_alphas > 0]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
for scores, label in ((train_scores, "train"), (test_scores, "test")):
    ax.plot(positive_alphas, scores, marker='o', label=label,
            drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
!pip install graphviz
!pip install pydotplus

In [None]:
# Predictor columns used by the model (everything except the id and the outcome).
sepsis_df.drop(columns=['admissionid', 'afib']).columns

In [None]:
# Fit a small, readable tree on the sepsis training split and render it as a PNG.
# `sklearn.externals.six` no longer exists in current scikit-learn releases, so
# the in-memory text buffer now comes from the standard library.
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# Shallow tree (depth 3, at least 20 samples per leaf) so the diagram stays legible.
clf = tree.DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=20, criterion='entropy', class_weight='balanced')
clf.fit(train_df, train_outcome)

# export_graphviz writes the DOT description of the tree into the buffer.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=False,
                special_characters=True,
                # a plain list: some scikit-learn releases reject a pandas Index here
                feature_names=list(sepsis_df.columns.drop(['admissionid', 'afib'])),
                class_names=['No AF', 'AF'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())