# All Data Raw - Preparation

The following code pulls data using queries from the MIMIC-III dataset stored in a local server. To first run this notebook, the query *all_data_raw.sql* must be run. <br>
The notebook will then merge the all_data_raw tables with missing information from other tables.

## Setup & Queries

In [1]:
# Import libraries

# Import useful libraries
import numpy as np
import pandas as pd
import os
import re
import pickle
import sys
import time
import psycopg2 
import datetime

In [None]:
# # Setup Repository
# with open("repo_info.txt", "r") as repo_info:
#     path_to_repo = repo_info.readline()

# path_to_data = f"{path_to_repo}data/"
# path_to_raw = f"{path_to_data}raw/"
# path_to_processed = f"{path_to_data}processed/"

In [2]:
##The following is based on personal psql setting
host = 'localhost'  
port = '5432'  
sqluser = 'postgres'  
password = 'mydb'  
dbname = 'mimiciv' 
schema_name1 = 'mimiciv_hosp' 
schema_name2 = 'mimiciv_icu'
schema_name3 = 'mimiciv_derived' 
schema_search_path = f"{schema_name1}, {schema_name2}, {schema_name3}"


try:
    con = psycopg2.connect(
        host=host,
        port=port,
        database=dbname,
        user=sqluser,
        password=password
    )

    #search path to connect to schema
    cur = con.cursor()
    cur.execute(f"SET search_path TO {schema_search_path};")
    
    # SQL query (gets 1st icd_code from diagnoses_icd table for each hadm_id)
    query_1_icd = \
    """
    select hadm_id, icd_code from diagnoses_icd where seq_num = 1
    """
    # Perform SQL query
    cur.execute('SET search_path to ' + schema_search_path)
    icd_code = pd.read_sql_query(query_1_icd,con)


    #SQL Query (gets all icd_code from procedures_icd)
    query_icd_pro = \
    """
    select hadm_id, icd_code from procedures_icd
    """
    # Perform SQL query
    proc_icd = pd.read_sql_query(query_icd_pro,con)

    #SQL Query (gets all icd_code from diagnoses_icd)
    query_icd_diag = \
    """
    select hadm_id, icd_code from diagnoses_icd
    """
    # Perform SQL query
    diag_icd = pd.read_sql_query(query_icd_diag,con)

    #Overall dataset
    # try with 10
    query_10 = """
    SELECT * FROM mimiciv_derived.all_data LIMIT 10;
    """
    df_10 = pd.read_sql_query(query_10, con)
    print(df_10)

    query = """
    SELECT * FROM mimiciv_derived.all_data;
    """
    df = pd.read_sql_query(query, con)
    print(df)

    #For icd code9 there are some codes beginning with 'E' or "V", we need to check the version that they are in
    query_icd_version = \
    """
    select icd_code, icd_version from mimiciv_hosp.d_icd_diagnoses;
    """

    icd_code_version = pd.read_sql_query(query_icd_version,con)
    
finally:
    if con is not None:
        con.close()


  icd_code = pd.read_sql_query(query_1_icd,con)
  proc_icd = pd.read_sql_query(query_icd_pro,con)
  diag_icd = pd.read_sql_query(query_icd_diag,con)
  df_10 = pd.read_sql_query(query_10, con)
  df = pd.read_sql_query(query, con)


    hadm_id  subject_id           admittime           dischtime  \
0  20000094    14046553 2150-03-02 00:00:00 2150-03-03 09:21:00   
1  20000094    14046553 2150-03-02 00:00:00 2150-03-03 09:21:00   
2  20000094    14046553 2150-03-02 00:00:00 2150-03-03 09:21:00   
3  20000094    14046553 2150-03-02 00:00:00 2150-03-03 09:21:00   
4  20000147    14990224 2121-08-30 16:33:00 2121-09-03 15:45:00   
5  20000147    14990224 2121-08-30 16:33:00 2121-09-03 15:45:00   
6  20000147    14990224 2121-08-30 16:33:00 2121-09-03 15:45:00   
7  20000147    14990224 2121-08-30 16:33:00 2121-09-03 15:45:00   
8  20000808    16788749 2180-01-12 20:59:00 2180-01-25 12:10:00   
9  20000808    16788749 2180-01-12 20:59:00 2180-01-25 12:10:00   

            deathtime                    race admission_type  \
0 2150-03-03 09:21:00                   WHITE         URGENT   
1 2150-03-03 09:21:00                   WHITE         URGENT   
2 2150-03-03 09:21:00                   WHITE         URGENT   
3 2150

  icd_code_version = pd.read_sql_query(query_icd_version,con)


## Data processing

In [4]:
%%time
df['icd_code'] = 0
check =[]
for hadm_id in set(df['hadm_id'].values):
    if hadm_id in icd_code['hadm_id'].values:
        df.loc[df['hadm_id']==hadm_id, 'icd_code'] = icd_code.loc[(icd_code['hadm_id']==hadm_id), 'icd_code'].values[0]
    else:
        check.append(hadm_id)

CPU times: total: 4.59 s
Wall time: 55.5 s


In [5]:
%%time
df['proc_icd'] = 0
for hadm_id in set(proc_icd.hadm_id):
    test = proc_icd[proc_icd['hadm_id'] == hadm_id]
    df.loc[df['hadm_id'] == hadm_id, 'proc_icd'] = test.icd_code.str.cat(sep=',')

CPU times: total: 35.1 s
Wall time: 4min 31s


In [6]:
%%time
df['diag_icd'] = 0
for hadm_id in set(diag_icd.hadm_id):
    test = diag_icd[diag_icd['hadm_id'] == hadm_id]
    df.loc[df['hadm_id'] == hadm_id, 'diag_icd'] = test.icd_code.str.cat(sep=',')

CPU times: total: 4min 7s
Wall time: 31min 28s


In [7]:
import copy

data = df.copy()

data["age_cat"] = ''  
data['type_stay'] = ''  
data['prev_adm'] = ''  
data['dest_discharge'] = ''
data['emergency_dpt'] = ''
data['icd_chapter'] = ''

In [8]:
%%time
# gender recoding
data.replace({'F':'2-Female', 'M': '1-Male'}, inplace = True)

CPU times: total: 719 ms
Wall time: 2.73 s


In [9]:
%%time
# age_cat
data.loc[data['age']<18, 'age_cat'] = 1
data.loc[(data['age']>=18) & (df['age'] < 45), 'age_cat'] = 2
data.loc[(data['age']>=45) & (df['age'] < 65), 'age_cat'] = 3
data.loc[(data['age']>=65) & (df['age'] < 85), 'age_cat'] = 4
data.loc[(data['age']>=85), 'age_cat'] = 5

CPU times: total: 15.6 ms
Wall time: 21.7 ms


In [10]:
%%time
# type of stay
medical_stay = {'CMED', 'DENT', 'ENT', 'GU', 'GYN', 'MED', 'NB', 'NBB', 'NMED', 'OBS', 'OMED', 'PSYCH', 'TRAUM'}
surgical_stay = {'CSURG', 'NSURG', 'ORTHO', 'PSURG', 'SURG','TSURG', 'VSURG'}
obstetric_stay = {'OBS'}
data.loc[(data['curr_service'].isin(medical_stay)), 'type_stay'] = '1-Medical'
data.loc[(data['curr_service'].isin(surgical_stay)), 'type_stay'] = '3-Surgical'
data.loc[(data['curr_service'].isin(obstetric_stay)), 'type_stay'] = '2-Obstetrics'

CPU times: total: 31.2 ms
Wall time: 104 ms


In [11]:
%%time
# Origin of patient
home = {'PHYS REFERRAL/NORMAL DELI', 'HMO REFERRAL/SICK', 'TRANSFER FROM SKILLED NUR'}
data.loc[data['admission_location'].isin(home), 'origin_patient'] = '1-Home'
data.loc[~data['admission_location'].isin(home), 'origin_patient'] = '2-Other'

CPU times: total: 0 ns
Wall time: 75.3 ms


In [12]:
%%time
# admission type
urgent = {'URGENT', 'EMERGENCY'}
data.loc[data['admission_type'].isin(urgent), 'emergency_dpt'] = 'Yes'
data.loc[~data['admission_type'].isin(urgent), 'emergency_dpt'] = 'No'

CPU times: total: 31.2 ms
Wall time: 73 ms


In [13]:
%%time
# discharge location
dest = {'HOME', 'HOME HEALTH CARE'}
data.loc[data['discharge_location'].isin(dest), 'dest_discharge'] = '1-Home'
data.loc[~data['discharge_location'].isin(dest), 'dest_discharge'] = '2-Other'

CPU times: total: 0 ns
Wall time: 65.9 ms


In [14]:
%%time
for subject in set(data.subject_id):
    df_adm = data[data['subject_id'] == subject]
    for i in df_adm.index:
        cur_date = df_adm.loc[i, 'admittime']
        hadm_id = df_adm.loc[i, 'hadm_id']
        subject_id = df_adm.loc[i, 'subject_id']
        df_prev_adm = df_adm[(df_adm.admittime < cur_date) & (df_adm['hadm_id'] != hadm_id)]
        test = df_prev_adm['emergency_dpt'] == 'Yes'
        if len(df_prev_adm) == 0:
            data.loc[i, 'prev_adm'] = '1-No hospitalization'
        elif test.any():
            data.loc[i, 'prev_adm'] = '3-At least one with emergency'
        else:
            data.loc[i, 'prev_adm'] = '2-At least one non emergency'

CPU times: total: 1min 53s
Wall time: 20min 9s


In the icd_Version10, the records are more detailed, and the codes begin with letters.

What makes it difficult is that, the chapters in v9 and v10 cannot fully connected.

Therefore we separate them when we try to get the chapter

In [15]:
data['icd_code'] = data['icd_code'].astype(str)
# indexes=[]
for idx in data.index:
    icd_code = data.loc[idx, 'icd_code']
    matching_codes = icd_code_version.loc[icd_code_version['icd_code'] == icd_code, 'icd_version']
    
    if not matching_codes.empty:
        icd_version = matching_codes.iloc[0]
        
        if icd_version == 9: 
            if data.loc[idx, 'icd_code'][0] == 'E': #first find the icd code beginning with letters to avoid errors
                data.loc[idx, 'icd_chapter'] = 19
            elif data.loc[idx, 'icd_code'][0] == 'V':
                data.loc[idx, 'icd_chapter'] = 20
            elif 0 < int(data.loc[idx, 'icd_code'][0:3]) < 140:
                data.loc[idx, 'icd_chapter'] = 1
            elif 140 <= int(data.loc[idx, 'icd_code'][0:3]) < 240:
                data.loc[idx, 'icd_chapter'] = 2
            elif 240 <= int(data.loc[idx, 'icd_code'][0:3]) < 280:
                data.loc[idx, 'icd_chapter'] = 3
            elif 280 <= int(data.loc[idx, 'icd_code'][0:3]) < 290:
                data.loc[idx, 'icd_chapter'] = 4
            elif 290 <= int(data.loc[idx, 'icd_code'][0:3]) < 320:
                data.loc[idx, 'icd_chapter'] = 5
            elif 320 <= int(data.loc[idx, 'icd_code'][0:3]) < 360:
                data.loc[idx, 'icd_chapter'] = 6
            elif 360 <= int(data.loc[idx, 'icd_code'][0:3]) < 390:
                data.loc[idx, 'icd_chapter'] = 7
            elif 390 <= int(data.loc[idx, 'icd_code'][0:3]) < 460:
                data.loc[idx, 'icd_chapter'] = 8
            elif 460 <= int(data.loc[idx, 'icd_code'][0:3]) < 520:
                data.loc[idx, 'icd_chapter'] = 9
            elif 520 <= int(data.loc[idx, 'icd_code'][0:3]) < 580:
                data.loc[idx, 'icd_chapter'] = 10
            elif 580 <= int(data.loc[idx, 'icd_code'][0:3]) < 630:
                data.loc[idx, 'icd_chapter'] = 11
            elif 630 <= int(data.loc[idx, 'icd_code'][0:3]) < 680:
                data.loc[idx, 'icd_chapter'] = 12
            elif 680 <= int(data.loc[idx, 'icd_code'][0:3]) < 710:
                data.loc[idx, 'icd_chapter'] = 13
            elif 710 <= int(data.loc[idx, 'icd_code'][0:3]) < 740:
                data.loc[idx, 'icd_chapter'] = 14
            elif 740 <= int(data.loc[idx, 'icd_code'][0:3]) < 760:
                data.loc[idx, 'icd_chapter'] = 15
            elif 760 <= int(data.loc[idx, 'icd_code'][0:3]) < 780:
                data.loc[idx, 'icd_chapter'] = 16
            elif 780 <= int(data.loc[idx, 'icd_code'][0:3]) < 800:
                data.loc[idx, 'icd_chapter'] = 17
            elif 800 <= int(data.loc[idx, 'icd_code'][0:3]) < 1000:
                data.loc[idx, 'icd_chapter'] = 18
        
        
        elif icd_version == 10:
            if data.loc[idx, 'icd_code'][0] in ['A', 'B']:
                data.loc[idx, 'icd_chapter'] = 21
            elif (data.loc[idx, 'icd_code'][0] == 'C') or  (data.loc[idx, 'icd_code'][0] == 'D' and 0 <= int(data.loc[idx, 'icd_code'][1]) < 5):
                data.loc[idx, 'icd_chapter'] = 22
            elif data.loc[idx, 'icd_code'][0] == 'D' and 5 <= int(data.loc[idx, 'icd_code'][1]) :
                data.loc[idx, 'icd_chapter'] = 23
            elif data.loc[idx, 'icd_code'][0] == 'E':
                data.loc[idx, 'icd_chapter'] = 24
            elif data.loc[idx, 'icd_code'][0] == 'F':
                data.loc[idx, 'icd_chapter'] = 25
            elif data.loc[idx, 'icd_code'][0] == 'G':
                data.loc[idx, 'icd_chapter'] = 26
            elif data.loc[idx, 'icd_code'][0] == 'H' and 0 <= int(data.loc[idx, 'icd_code'][1:3]) < 60:
                data.loc[idx, 'icd_chapter'] = 27
            elif data.loc[idx, 'icd_code'][0] == 'H' and 60 <= int(data.loc[idx, 'icd_code'][1:3]) < 96:
                data.loc[idx, 'icd_chapter'] = 28
            elif data.loc[idx, 'icd_code'][0] == 'I':
                data.loc[idx, 'icd_chapter'] = 29
            elif data.loc[idx, 'icd_code'][0] == 'J':
                data.loc[idx, 'icd_chapter'] = 30
            elif data.loc[idx, 'icd_code'][0] == 'K':
                data.loc[idx, 'icd_chapter'] = 31
            elif data.loc[idx, 'icd_code'][0] == 'L':
                data.loc[idx, 'icd_chapter'] = 32
            elif data.loc[idx, 'icd_code'][0] == 'M':
                data.loc[idx, 'icd_chapter'] = 33
            elif data.loc[idx, 'icd_code'][0] == 'N':
                data.loc[idx, 'icd_chapter'] = 34
            elif data.loc[idx, 'icd_code'][0] == 'O':
                data.loc[idx, 'icd_chapter'] = 35
            elif data.loc[idx, 'icd_code'][0] == 'P':
                data.loc[idx, 'icd_chapter'] = 36
            elif data.loc[idx, 'icd_code'][0] == 'Q':
                data.loc[idx, 'icd_chapter'] = 37
            elif data.loc[idx, 'icd_code'][0] == 'R':
                data.loc[idx, 'icd_chapter'] = 38
            elif data.loc[idx, 'icd_code'][0] in ['S','T']:
                data.loc[idx, 'icd_chapter'] = 39
            elif data.loc[idx, 'icd_code'][0] in ['V','W' ,'X','Y']:
                data.loc[idx, 'icd_chapter'] = 40
            elif data.loc[idx, 'icd_code'][0] == 'Z':
                data.loc[idx, 'icd_chapter'] = 41
            elif data.loc[idx, 'icd_code'][0] == 'U':
                data.loc[idx, 'icd_chapter'] = 42

    else:
        # Handle the case where icd_code is not found, e.g., by setting a default value or skipping
        print(f"No matching ICD code found for index {idx}: {icd_code}")
        # indexes.append(idx)
        continue  # Skip to the next iteration of the loop
    


No matching ICD code found for index 160489: 0
No matching ICD code found for index 160490: 0
No matching ICD code found for index 160491: 0
No matching ICD code found for index 160492: 0
No matching ICD code found for index 160493: 0
No matching ICD code found for index 160494: 0
No matching ICD code found for index 201443: 0
No matching ICD code found for index 201444: 0
No matching ICD code found for index 201445: 0
No matching ICD code found for index 201446: 0
No matching ICD code found for index 201447: 0
No matching ICD code found for index 201448: 0
No matching ICD code found for index 226138: 0
No matching ICD code found for index 226139: 0
No matching ICD code found for index 228537: 0
No matching ICD code found for index 228538: 0
No matching ICD code found for index 228539: 0
No matching ICD code found for index 240897: 0
No matching ICD code found for index 240898: 0
No matching ICD code found for index 240899: 0
No matching ICD code found for index 240900: 0
No matching I

In [16]:
#Remove the rows with icd problem
import re

text = """
No matching ICD code found for index 160489: 0
No matching ICD code found for index 160490: 0
No matching ICD code found for index 160491: 0
No matching ICD code found for index 160492: 0
No matching ICD code found for index 160493: 0
No matching ICD code found for index 160494: 0
No matching ICD code found for index 201443: 0
No matching ICD code found for index 201444: 0
No matching ICD code found for index 201445: 0
No matching ICD code found for index 201446: 0
No matching ICD code found for index 201447: 0
No matching ICD code found for index 201448: 0
No matching ICD code found for index 226138: 0
No matching ICD code found for index 226139: 0
No matching ICD code found for index 228537: 0
No matching ICD code found for index 228538: 0
No matching ICD code found for index 228539: 0
No matching ICD code found for index 240897: 0
No matching ICD code found for index 240898: 0
No matching ICD code found for index 240899: 0
No matching ICD code found for index 240900: 0
No matching ICD code found for index 240901: 0
No matching ICD code found for index 240902: 0
No matching ICD code found for index 240903: 0
No matching ICD code found for index 240904: 0
No matching ICD code found for index 296276: 0
No matching ICD code found for index 296277: 0
No matching ICD code found for index 298619: 0
No matching ICD code found for index 298620: 0
No matching ICD code found for index 298621: 0
No matching ICD code found for index 298622: 0
No matching ICD code found for index 298623: 0
No matching ICD code found for index 324949: 0
No matching ICD code found for index 324950: 0
No matching ICD code found for index 324951: 0
No matching ICD code found for index 324952: 0
No matching ICD code found for index 324953: 0
No matching ICD code found for index 324954: 0
No matching ICD code found for index 324955: 0
No matching ICD code found for index 324956: 0
No matching ICD code found for index 343702: 0
No matching ICD code found for index 343703: 0
No matching ICD code found for index 343704: 0
No matching ICD code found for index 343705: 0
No matching ICD code found for index 343706: 0
No matching ICD code found for index 343707: 0
No matching ICD code found for index 343708: 0
No matching ICD code found for index 343709: 0
No matching ICD code found for index 343710: 0
No matching ICD code found for index 396186: 0
No matching ICD code found for index 396187: 0
No matching ICD code found for index 396188: 0
No matching ICD code found for index 432321: 0
No matching ICD code found for index 432322: 0
No matching ICD code found for index 439688: 0
No matching ICD code found for index 439689: 0
No matching ICD code found for index 439690: 0
No matching ICD code found for index 439691: 0
No matching ICD code found for index 439692: 0
No matching ICD code found for index 439693: 0
No matching ICD code found for index 439694: 0
No matching ICD code found for index 439695: 0
No matching ICD code found for index 439696: 0
No matching ICD code found for index 439697: 0
No matching ICD code found for index 439698: 0
No matching ICD code found for index 439699: 0
No matching ICD code found for index 464584: 0
No matching ICD code found for index 464585: 0
No matching ICD code found for index 464586: 0
No matching ICD code found for index 464587: 0
No matching ICD code found for index 464588: 0
No matching ICD code found for index 464589: 0
No matching ICD code found for index 527080: 0
No matching ICD code found for index 527081: 0
No matching ICD code found for index 527082: 0
No matching ICD code found for index 527083: 0
No matching ICD code found for index 527084: 0
No matching ICD code found for index 596925: 0
No matching ICD code found for index 596926: 0
No matching ICD code found for index 596927: 0
"""

#For running time takes a lot of time, we use this method, but we can append indexes directly,just cancel ignore
indexes = re.findall(r'index (\d+): 0', text)
indexes = [int(index) for index in indexes]
print(indexes)


[160489, 160490, 160491, 160492, 160493, 160494, 201443, 201444, 201445, 201446, 201447, 201448, 226138, 226139, 228537, 228538, 228539, 240897, 240898, 240899, 240900, 240901, 240902, 240903, 240904, 296276, 296277, 298619, 298620, 298621, 298622, 298623, 324949, 324950, 324951, 324952, 324953, 324954, 324955, 324956, 343702, 343703, 343704, 343705, 343706, 343707, 343708, 343709, 343710, 396186, 396187, 396188, 432321, 432322, 439688, 439689, 439690, 439691, 439692, 439693, 439694, 439695, 439696, 439697, 439698, 439699, 464584, 464585, 464586, 464587, 464588, 464589, 527080, 527081, 527082, 527083, 527084, 596925, 596926, 596927]


In [17]:
data_filtered = data.drop(indexes, errors='ignore')

In [19]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 625220 entries, 0 to 625299
Data columns (total 63 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   hadm_id                     625220 non-null  int64         
 1   subject_id                  625220 non-null  int64         
 2   admittime                   625220 non-null  datetime64[ns]
 3   dischtime                   625220 non-null  datetime64[ns]
 4   deathtime                   141883 non-null  datetime64[ns]
 5   race                        625220 non-null  object        
 6   admission_type              625220 non-null  object        
 7   admission_location          625220 non-null  object        
 8   insurance                   625220 non-null  object        
 9   language                    625220 non-null  object        
 10  marital_status              578293 non-null  object        
 11  discharge_location          624494 non-

In [1]:
# %%time
# #The previous chapter range used in mimiciii is not very clear
# #We made some adjustments
# data['icd_code'] = data['icd_code'].astype(str)

# for idx in data.index:
#     icd_code = data.loc[idx, 'icd_code']
#     icd_version = icd_code_version.loc[icd_code_version['icd_code'] == icd_code, 'icd_version'].iloc[0]

#     if icd_version == 9: 
#         if data.loc[idx, 'icd_code'][0] == 'E': #first find the icd code beginning with letters to avoid errors
#             data.loc[idx, 'icd_chapter'] = 19
#         elif data.loc[idx, 'icd_code'][0] == 'V':
#             data.loc[idx, 'icd_chapter'] = 20
#         elif 0 < int(data.loc[idx, 'icd_code'][0:3]) < 140:
#             data.loc[idx, 'icd_chapter'] = 1
#         elif 140 <= int(data.loc[idx, 'icd_code'][0:3]) < 240:
#             data.loc[idx, 'icd_chapter'] = 2
#         elif 240 <= int(data.loc[idx, 'icd_code'][0:3]) < 280:
#             data.loc[idx, 'icd_chapter'] = 3
#         elif 280 <= int(data.loc[idx, 'icd_code'][0:3]) < 290:
#             data.loc[idx, 'icd_chapter'] = 4
#         elif 290 <= int(data.loc[idx, 'icd_code'][0:3]) < 320:
#             data.loc[idx, 'icd_chapter'] = 5
#         elif 320 <= int(data.loc[idx, 'icd_code'][0:3]) < 360:
#             data.loc[idx, 'icd_chapter'] = 6
#         elif 360 <= int(data.loc[idx, 'icd_code'][0:3]) < 390:
#             data.loc[idx, 'icd_chapter'] = 7
#         elif 390 <= int(data.loc[idx, 'icd_code'][0:3]) < 460:
#             data.loc[idx, 'icd_chapter'] = 8
#         elif 460 <= int(data.loc[idx, 'icd_code'][0:3]) < 520:
#             data.loc[idx, 'icd_chapter'] = 9
#         elif 520 <= int(data.loc[idx, 'icd_code'][0:3]) < 580:
#             data.loc[idx, 'icd_chapter'] = 10
#         elif 580 <= int(data.loc[idx, 'icd_code'][0:3]) < 630:
#             data.loc[idx, 'icd_chapter'] = 11
#         elif 630 <= int(data.loc[idx, 'icd_code'][0:3]) < 680:
#             data.loc[idx, 'icd_chapter'] = 12
#         elif 680 <= int(data.loc[idx, 'icd_code'][0:3]) < 710:
#             data.loc[idx, 'icd_chapter'] = 13
#         elif 710 <= int(data.loc[idx, 'icd_code'][0:3]) < 740:
#             data.loc[idx, 'icd_chapter'] = 14
#         elif 740 <= int(data.loc[idx, 'icd_code'][0:3]) < 760:
#             data.loc[idx, 'icd_chapter'] = 15
#         elif 760 <= int(data.loc[idx, 'icd_code'][0:3]) < 780:
#             data.loc[idx, 'icd_chapter'] = 16
#         elif 780 <= int(data.loc[idx, 'icd_code'][0:3]) < 800:
#             data.loc[idx, 'icd_chapter'] = 17
#         elif 800 <= int(data.loc[idx, 'icd_code'][0:3]) < 1000:
#             data.loc[idx, 'icd_chapter'] = 18
        
        
#     elif icd_version == 10:
#         if data.loc[idx, 'icd_code'][0] in ['A', 'B']:
#             data.loc[idx, 'icd_chapter'] = 21
#         elif (data.loc[idx, 'icd_code'][0] == 'C') or  (data.loc[idx, 'icd_code'][0] == 'D' and 0 <= int(data.loc[idx, 'icd_code'][1]) < 5):
#             data.loc[idx, 'icd_chapter'] = 22
#         elif data.loc[idx, 'icd_code'][0] == 'D' and 5 <= int(data.loc[idx, 'icd_code'][1]) :
#             data.loc[idx, 'icd_chapter'] = 23
#         elif data.loc[idx, 'icd_code'][0] == 'E':
#             data.loc[idx, 'icd_chapter'] = 24
#         elif data.loc[idx, 'icd_code'][0] == 'F':
#             data.loc[idx, 'icd_chapter'] = 25
#         elif data.loc[idx, 'icd_code'][0] == 'G':
#             data.loc[idx, 'icd_chapter'] = 26
#         elif data.loc[idx, 'icd_code'][0] == 'H' and 0 <= int(data.loc[idx, 'icd_code'][1:3]) < 60:
#             data.loc[idx, 'icd_chapter'] = 27
#         elif data.loc[idx, 'icd_code'][0] == 'H' and 60 <= int(data.loc[idx, 'icd_code'][1:3]) < 96:
#             data.loc[idx, 'icd_chapter'] = 28
#         elif data.loc[idx, 'icd_code'][0] == 'I':
#             data.loc[idx, 'icd_chapter'] = 29
#         elif data.loc[idx, 'icd_code'][0] == 'J':
#             data.loc[idx, 'icd_chapter'] = 30
#         elif data.loc[idx, 'icd_code'][0] == 'K':
#             data.loc[idx, 'icd_chapter'] = 31
#         elif data.loc[idx, 'icd_code'][0] == 'L':
#             data.loc[idx, 'icd_chapter'] = 32
#         elif data.loc[idx, 'icd_code'][0] == 'M':
#             data.loc[idx, 'icd_chapter'] = 33
#         elif data.loc[idx, 'icd_code'][0] == 'N':
#             data.loc[idx, 'icd_chapter'] = 34
#         elif data.loc[idx, 'icd_code'][0] == 'O':
#             data.loc[idx, 'icd_chapter'] = 35
#         elif data.loc[idx, 'icd_code'][0] == 'P':
#             data.loc[idx, 'icd_chapter'] = 36
#         elif data.loc[idx, 'icd_code'][0] == 'Q':
#             data.loc[idx, 'icd_chapter'] = 37
#         elif data.loc[idx, 'icd_code'][0] == 'R':
#             data.loc[idx, 'icd_chapter'] = 38
#         elif data.loc[idx, 'icd_code'][0] in ['S','T']:
#             data.loc[idx, 'icd_chapter'] = 39
#         elif data.loc[idx, 'icd_code'][0] in ['V','W' ,'X','Y']:
#             data.loc[idx, 'icd_chapter'] = 40
#         elif data.loc[idx, 'icd_code'][0] == 'Z':
#             data.loc[idx, 'icd_chapter'] = 41
#         elif data.loc[idx, 'icd_code'][0] == 'U':
#             data.loc[idx, 'icd_chapter'] = 42

    

### ICD-9 Chapter Classifications and Code Ranges<br>
Infectious and Parasitic Diseases (001-139)<br>
Neoplasms (140-239)<br>
Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders (240-279)<br>
Diseases of the Blood and Blood-forming Organs and Certain Disorders involving the Immune Mechanism (280-289)<br>
Mental Disorders (290-319)<br>
Diseases of the Nervous System (320-359)<br>
Diseases of the Sense Organs (360-389)<br>
Diseases of the Circulatory System (390-459)<br>
Diseases of the Respiratory System (460-519)<br>
Diseases of the Digestive System (520-579)<br>
Diseases of the Genitourinary System (580-629)<br>
Complications of Pregnancy, Childbirth, and the Puerperium (630-679)<br>
Diseases of the Skin and Subcutaneous Tissue (680-709)<br>
Diseases of the Musculoskeletal System and Connective Tissue (710-739)<br>
Congenital Anomalies (740-759)<br>
Certain Conditions originating in the Perinatal Period (760-779)<br>
Symptoms, Signs, and Ill-defined Conditions (780-799)<br>
Injury and Poisoning (800-999)<br>
Supp Factors Health Status (V)<br>
Supp External Causes Injury Poisoning (E)<br>

### ICD-10 Chapter Classifications and Code Ranges<br>
Certain Infectious and Parasitic Diseases (A00-B99)<br>
Neoplasms (C00-D48)<br>
Diseases of the Blood and Blood-forming Organs and Certain Disorders involving the Immune Mechanism (D50-D89)<br>
Endocrine, Nutritional and Metabolic Diseases (E00-E90)<br>
Mental and Behavioural Disorders (F00-F99)<br>
Diseases of the Nervous System (G00-G99)<br>
Diseases of the Eye and Adnexa (H00-H59)<br>
Diseases of the Ear and Mastoid Process (H60-H95)<br>
Diseases of the Circulatory System (I00-I99)<br>
Diseases of the Respiratory System (J00-J99)<br>
Diseases of the Digestive System (K00-K93)<br>
Diseases of the Skin and Subcutaneous Tissue (L00-L99)<br>
Diseases of the Musculoskeletal System and Connective Tissue (M00-M99)<br>
Diseases of the Genitourinary System (N00-N99)<br>
Pregnancy, Childbirth and the Puerperium (O00-O99)<br>
Certain Conditions originating in the Perinatal Period (P00-P96)<br>
Congenital Malformations, Deformations and Chromosomal Abnormalities (Q00-Q99)<br>
Symptoms, Signs and Abnormal Clinical and Laboratory Findings, Not Elsewhere Classified (R00-R99)<br>
Injury, Poisoning and Certain Other Consequences of External Causes (S00-T98)<br>
External Causes of Morbidity and Mortality (V01-Y98)<br>
Factors influencing Health Status and Contact with Health Services (Z00-Z99)

In [18]:
# Finally save the file to a CSV
path_to_raw="raw_data"
data_filtered.to_csv(f'{path_to_raw}/all_data_raw.csv.gzip', compression = 'gzip', index = False)
#data.to_csv(f'{path_to_raw}/all_data_raw.csv', index = False)