# **Setup**

In [1]:
import re
import pandas as pd
# pd.set_option('display.max_rows', None)  ###
# pd.set_option('display.max_columns', None)  ###
# pd.set_option('display.width', None)  ###
# pd.set_option('display.max_colwidth', None)  ###
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

if IN_COLAB:
  # Mount the Google Drive at mount
  mount='/content/gdrive'
  print("Colab: mounting Google drive on ", mount)
  # connect your colab with the drive
  drive.mount(mount)

 # Switch to the directory on the Google Drive that you want to use
  import os
  path_to_repo = mount + "/MyDrive/MIMIC Project/data/"

else:
  # Setup Repository
  with open("repo_info.txt", "r") as repo_info:
      path_to_repo = repo_info.readline()


print(path_to_repo)
path_to_processed = f"{path_to_repo}processed/"

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive
/content/gdrive/MyDrive/MIMIC Project/data/


In [2]:
# import dataset we want to add new variables into it
file = f'{path_to_repo}df_mixed_discharge_death.csv.gzip'
death_df = pd.read_csv(file, compression = 'gzip', low_memory=False)

file = f'{path_to_repo}df_mixed_discharge.csv.gzip'
mixed_df = pd.read_csv(file, compression = 'gzip', low_memory=False)

display(death_df.shape)
display(mixed_df.shape)

(35268, 53)

(30985, 53)

In [None]:
# Check for duplicate combinations of 'hadm_id' and 'subject_id'
duplicate_combinations = death_df.groupby(['hadm_id', 'subject_id']).size().reset_index(name='count')
duplicate_combinations = duplicate_combinations[duplicate_combinations['count'] > 1]

if duplicate_combinations.empty:
    print("No duplicate combinations of 'hadm_id' and 'subject_id' found.")
else:
    print("Duplicate combinations of 'hadm_id' and 'subject_id' found:")
    print(duplicate_combinations)

No duplicate combinations of 'hadm_id' and 'subject_id' found.


In [None]:
display(death_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35268 entries, 0 to 35267
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   hadm_id             35268 non-null  int64  
 1   subject_id          35268 non-null  int64  
 2   ethnicity           35268 non-null  object 
 3   admission_type      35268 non-null  object 
 4   admission_location  35268 non-null  object 
 5   insurance           35268 non-null  object 
 6   religion            35268 non-null  object 
 7   marital_status      35268 non-null  object 
 8   icu_los             35268 non-null  float64
 9   gender              35268 non-null  object 
 10  age                 35268 non-null  float64
 11  urea_n_min          35253 non-null  float64
 12  urea_n_max          35253 non-null  float64
 13  urea_n_mean         35253 non-null  float64
 14  platelets_min       35248 non-null  float64
 15  platelets_max       35248 non-null  float64
 16  plat

None

# **1. add mortality bianry variable**

In [None]:
# import dataset
file = f'{path_to_repo}ADMISSIONS.csv.gz'
admissions = pd.read_csv(file, compression = 'gzip', low_memory=False)
# Lowercase column names
admissions.rename(columns=str.lower, inplace=True)
# display(admissions.columns)
# display(death_df.columns)
death_df = pd.merge(death_df, admissions[['hadm_id', 'subject_id','deathtime']], on=['hadm_id', 'subject_id'], how='left')
display(death_df.columns)

Index(['hadm_id', 'subject_id', 'ethnicity', 'admission_type',
       'admission_location', 'insurance', 'religion', 'marital_status',
       'icu_los', 'gender', 'age', 'urea_n_min', 'urea_n_max', 'urea_n_mean',
       'platelets_min', 'platelets_max', 'platelets_mean', 'magnesium_max',
       'albumin_min', 'calcium_min', 'resprate_min', 'resprate_max',
       'resprate_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'hr_min',
       'hr_max', 'hr_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean',
       'diasbp_min', 'diasbp_max', 'diasbp_mean', 'temp_min', 'temp_max',
       'temp_mean', 'sapsii', 'sofa', 'urine_min', 'urine_mean', 'urine_max',
       'patientweight', 'age_cat', 'type_stay', 'prev_adm', 'dest_discharge',
       'emergency_dpt', 'icd_chapter', 'origin_patient', 'los', 'text',
       'deathtime'],
      dtype='object')

In [None]:
# Count the number of null values in 'deathtime' column
print("Number of null values in 'deathtime' column:", death_df['deathtime'].isnull().sum())
# Count the number of non-null values in 'deathtime' column
print("Number of non-null values in 'deathtime' column:", death_df['deathtime'].notnull().sum())

# Define a function to apply the logic
def calculate_mortality(deathtime):
    if pd.isnull(deathtime):
        return 0
    else:
        return 1

# Apply the function to create the new 'mortality' column
death_df['mortality'] = death_df['deathtime'].apply(lambda x: calculate_mortality(x))
print(death_df['mortality'].value_counts())

Number of null values in 'deathtime' column: 30985
Number of non-null values in 'deathtime' column: 4283
0    30985
1     4283
Name: mortality, dtype: int64


# **2. add drg_severity & drg_mortality categorical variables**

In [None]:
# import dataset
file = f'{path_to_repo}DRGCODES.csv.gz'
drgcodes = pd.read_csv(file,compression = 'gzip', low_memory=False)
# Lowercase column names
drgcodes.rename(columns=str.lower, inplace=True)
drgcodes['description'] = drgcodes['description'].str.lower()
drgcodes = drgcodes.drop(columns=['row_id'])
drgcodes = drgcodes.sort_values(by=['hadm_id', 'subject_id'], ascending=True)
# Drop duplicates
drgcodes = drgcodes.drop_duplicates()
drgcodes['drg_type'].value_counts()

APR     41401
HCFA    31644
MS      27279
Name: drg_type, dtype: int64

In [None]:
# Subset for HCFA
# hcfa_subset = drgcodes[drgcodes['drg_type'] == 'HCFA'][['hadm_id', 'subject_id', 'description', 'drg_type']]
description_subset = drgcodes[['hadm_id', 'subject_id', 'description']]
description_subset = description_subset.sort_values(by=['hadm_id', 'subject_id'])

# Subset for APR
apr_subset = drgcodes[drgcodes['drg_type'] == 'APR '][['hadm_id', 'subject_id', 'drg_severity', 'drg_mortality', 'drg_type']]
apr_subset = apr_subset.sort_values(by=['hadm_id', 'subject_id', 'drg_severity', 'drg_mortality', 'drg_type'])

display(description_subset.head())
display(apr_subset.head())

Unnamed: 0,hadm_id,subject_id,description
106054,100001,58526,diabetes w cc
108326,100001,58526,diabetes
70628,100003,54610,peptic ulcer & gastritis
70630,100003,54610,g.i. hemorrhage w cc
5521,100006,9895,chronic obstructive pulmonary disease


Unnamed: 0,hadm_id,subject_id,drg_severity,drg_mortality,drg_type
108326,100001,58526,3.0,3.0,APR
70628,100003,54610,3.0,1.0,APR
36560,100007,23018,3.0,3.0,APR
105742,100009,533,2.0,1.0,APR
98183,100010,55853,2.0,3.0,APR


In [None]:
# Drop duplicates based on specified columns
apr_subset.drop_duplicates(subset=['hadm_id', 'subject_id', 'drg_severity', 'drg_mortality', 'drg_type'], keep='first', inplace=True)
# Convert 'drg_severity' and 'drg_mortality' to integers
apr_subset['drg_severity'] = apr_subset['drg_severity'].astype(int)
apr_subset['drg_mortality'] = apr_subset['drg_mortality'].astype(int)
apr_subset = apr_subset.sort_values(by=['hadm_id', 'subject_id','drg_severity','drg_mortality'], ascending=True)
# Drop duplicates based on 'hadm_id' and 'subject_id', keeping only the first occurrence
apr_subset = apr_subset.drop_duplicates(subset=['hadm_id', 'subject_id'])

In [None]:
# Group by hadm_id and subject_id, then aggregate seq_num into a string separated by comma
description_subset_groupby = description_subset.groupby(['hadm_id', 'subject_id'])['description'].apply(lambda x: ', '.join(map(str, x))).reset_index()

In [None]:
# Merge the selected column with raw_df
raw_df1 = pd.merge(death_df, description_subset_groupby, on=['hadm_id', 'subject_id'], how='left')
raw_df2 = pd.merge(raw_df1, apr_subset, on=['hadm_id', 'subject_id'], how='left')
raw_df2.drop(columns=['drg_type'], inplace=True)
# Replace NaN values with a default value 0
raw_df2['drg_severity'] = raw_df2['drg_severity'].fillna(0)
raw_df2['drg_mortality'] = raw_df2['drg_mortality'].fillna(0)

ordinal_categories = [0, 1, 2, 3, 4]
ordinal_dtype = pd.CategoricalDtype(categories=ordinal_categories, ordered=True)

# Convert the column to the ordered categorical data type
raw_df2['drg_severity'] = raw_df2['drg_severity'].astype(ordinal_dtype)
raw_df2['drg_mortality'] = raw_df2['drg_mortality'].astype(ordinal_dtype)
display(raw_df2['drg_severity'].value_counts())
display(raw_df2['drg_mortality'].value_counts())

4    11123
3    10088
0     7921
2     4838
1     1298
Name: drg_severity, dtype: int64

4    9598
3    8192
0    7921
2    6082
1    3475
Name: drg_mortality, dtype: int64

In [None]:
# Convert integer column to ordinal variable
raw_df2['drg_severity'] = pd.Categorical(raw_df2['drg_severity'], ordered=True)

# Optional: Rename categories if needed
raw_df2['drg_severity'] = raw_df2['drg_severity'].cat.rename_categories(['None', 'Low', 'Medium', 'High', 'Highest'])

# Convert integer column to ordinal variable
raw_df2['drg_mortality'] = pd.Categorical(raw_df2['drg_mortality'], ordered=True)

# Optional: Rename categories if needed
raw_df2['drg_mortality'] = raw_df2['drg_mortality'].cat.rename_categories(['None', 'Low', 'Medium', 'High', 'Highest'])


display(raw_df2['drg_severity'].value_counts())
display(raw_df2['drg_mortality'].value_counts())

Highest    11123
High       10088
None        7921
Medium      4838
Low         1298
Name: drg_severity, dtype: int64

Highest    9598
High       8192
None       7921
Medium     6082
Low        3475
Name: drg_mortality, dtype: int64

# **3. add comorbidity binary variable**

In [None]:
# Fill missing values in the 'description' column with an empty string
raw_df2['description'] = raw_df2['description'].fillna('')
# Create the comorbidity column and set values as binary variable
raw_df2['comorbidity'] = (raw_df2['description'].str.contains('comorbidities', case=False)).astype(int)
raw_df2['comorbidity'] = raw_df2['comorbidity'].map({1: 'Yes', 0: 'No'})
raw_df2['comorbidity'].value_counts()

No     30988
Yes     4280
Name: comorbidity, dtype: int64

# **4. add first_trans_dur numerical variable**

In [None]:
# import dataset
file = f'{path_to_repo}ADMISSIONS.csv.gz'
admissions = pd.read_csv(file, compression = 'gzip', low_memory=False)
# Lowercase column names
admissions.rename(columns=str.lower, inplace=True)
raw_df3 = pd.merge(raw_df2, admissions[['hadm_id', 'subject_id','admittime']], on=['hadm_id', 'subject_id'], how='left')
raw_df3['admittime'] = pd.to_datetime(raw_df3['admittime'])
# raw_df3.info()

In [None]:
# import dataset
file = f'{path_to_repo}TRANSFERS.csv.gz'
transfers = pd.read_csv(file, compression = 'gzip', low_memory=False)
# Lowercase column names
transfers.rename(columns=str.lower, inplace=True)
transfers['intime'] = pd.to_datetime(transfers['intime'])
# Group by 'subject_id' and 'hadm_id', then select the oldest 'admittime'
oldest_intime_subset = transfers.groupby(['subject_id', 'hadm_id'])['intime'].min().reset_index()
oldest_intime_subset.head()

Unnamed: 0,subject_id,hadm_id,intime
0,2,163353,2138-07-17 19:23:42
1,3,145834,2101-10-20 19:10:11
2,4,185777,2191-03-16 00:29:31
3,5,178980,2103-02-02 04:41:29
4,6,107064,2175-05-30 00:16:20


In [None]:
raw_df4 = pd.merge(raw_df3, oldest_intime_subset, on=['hadm_id', 'subject_id'], how='left')
# Calculate the duration between 'intime' and 'admittime' in minutes
raw_df4['first_trans_dur'] = (raw_df4['intime'] - raw_df4['admittime']).dt.total_seconds() / 60

# **5. add count_transfer numerical variable**

In [None]:
# Filter the DataFrame to include only rows where 'eventtype' is 'transfer'
transfer_events = transfers[transfers['eventtype'] == 'transfer']

# Group by 'subject_id' and 'hadm_id', then count the occurrences of 'transfer' eventtype
transfer_counts = transfer_events.groupby(['subject_id', 'hadm_id']).size().reset_index(name='count_transfer')
transfer_counts.head()

Unnamed: 0,subject_id,hadm_id,count_transfer
0,2,163353,3
1,3,145834,1
2,4,185777,1
3,5,178980,2
4,6,107064,3


In [None]:
raw_df5 = pd.merge(raw_df4, transfer_counts, on=['hadm_id', 'subject_id'], how='left')
# Fill missing values in 'count_transfer' column with 0
raw_df5['count_transfer'] = raw_df5['count_transfer'].fillna(0)
# Cast the 'transfer_count' column to integer type
raw_df5['count_transfer'] = raw_df5['count_transfer'].astype(int)
# raw_df5.info()
raw_df5['count_transfer'].value_counts()

1     8695
2     8170
3     6463
4     3963
0     2884
5     2244
6     1248
7      667
8      394
9      230
10     136
11      74
12      31
13      26
14      18
15       8
16       5
17       3
19       3
20       2
18       2
23       1
21       1
Name: count_transfer, dtype: int64

add readmission binary variable

In [None]:
# Sort the DataFrame by 'subject_id' and 'admittime'
raw_df_sorted = raw_df5.sort_values(by=['subject_id', 'admittime'])

# Group by 'subject_id' and select the first 'hadm_id' (oldest admission) for each 'subject_id'
oldest_hadm_id = raw_df_sorted.groupby('subject_id')['hadm_id'].first().reset_index()

# Merge this information back to the original DataFrame
raw_df6 = pd.merge(raw_df_sorted, oldest_hadm_id, on='subject_id', how='left', suffixes=('', '_oldest'))

# Set 'readmission' to 0 if 'hadm_id' matches the oldest admission time for the 'subject_id', and 1 otherwise
raw_df6['readmission'] = (raw_df6['hadm_id'] != raw_df6['hadm_id_oldest']).astype(int)
raw_df6['readmission'] = raw_df6['readmission'].map({1: 'Yes', 0: 'No'})

# Drop the intermediate columns
raw_df6.drop(columns=['hadm_id_oldest'], inplace=True)
raw_df6['readmission'].value_counts()

No     28980
Yes     6288
Name: readmission, dtype: int64

In [None]:
raw_df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35268 entries, 0 to 35267
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   hadm_id             35268 non-null  int64         
 1   subject_id          35268 non-null  int64         
 2   ethnicity           35268 non-null  object        
 3   admission_type      35268 non-null  object        
 4   admission_location  35268 non-null  object        
 5   insurance           35268 non-null  object        
 6   religion            35268 non-null  object        
 7   marital_status      35268 non-null  object        
 8   icu_los             35268 non-null  float64       
 9   gender              35268 non-null  object        
 10  age                 35268 non-null  float64       
 11  urea_n_min          35253 non-null  float64       
 12  urea_n_max          35253 non-null  float64       
 13  urea_n_mean         35253 non-null  float64   

# **6. add readm_duration numerical variable**

In [None]:
# Sort the DataFrame by 'subject_id' and 'admittime'
raw_df_sorted = raw_df6.sort_values(by=['subject_id', 'admittime'])

# Group by 'subject_id' and calculate the duration difference between consecutive 'admittime's
raw_df_sorted['readm_duration'] = raw_df_sorted.groupby('subject_id')['admittime'].diff()


# Convert timedelta to float values in days and handle NaN
raw_df_sorted['readm_duration'] = raw_df_sorted['readm_duration'] / pd.Timedelta(days=1)
raw_df_sorted['readm_duration'] = raw_df_sorted['readm_duration'].fillna(0)

# Reset index
raw_df_sorted.reset_index(drop=True, inplace=True)

# **7. add icustay_count numerical variable**

In [None]:
# Grouping by hadm_id and subject_id and counting the number of icustay_id
grouped_df = transfers.groupby(['hadm_id', 'subject_id']).agg(icustay_count=('icustay_id', 'count')).reset_index()
# Merge this information back to the original DataFrame
raw_df7 = pd.merge(raw_df_sorted, grouped_df, on=['hadm_id', 'subject_id'], how='left')
# raw_df7.info()
raw_df7['icustay_count'].value_counts()

1     24644
2      7478
3      2107
4       669
5       224
6        98
7        26
8        11
9         7
10        3
11        1
Name: icustay_count, dtype: int64

# **8. add to_icu binary variable**

In [None]:
# Create a new binary variable to_icu
raw_df7['to_icu'] = 0  # Initialize all values to 0

# Define the conditions
condition = ((raw_df7['admission_type'] != 'EMERGENCY') & (raw_df7['admission_type'] != 'URGENT')) & (raw_df7['icustay_count'] >= 1)

# Apply the conditions to set the values of 'to_icu' accordingly
raw_df7.loc[condition, 'to_icu'] = 1
raw_df7['to_icu'] = raw_df7['to_icu'].map({1: 'Yes', 0: 'No'})

raw_df7['to_icu'].value_counts()

No     30218
Yes     5050
Name: to_icu, dtype: int64

# **9. Remove the Date and Time from text column**

In [None]:
pattern = r"\w+\s*(Date):\s*\[\*\*[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\*\*\]"
pattern_dob = r"Date of Birth:\s*\[\*\*[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\*\*\]"
pattern_end = r"Completed by:\s*\[\*\*[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\*\*\]"

In [None]:
raw_df7['text'] = raw_df7['text'].apply(lambda x:re.sub(pattern_end,'', re.sub(pattern_dob, '', re.sub(pattern, '', x))))

In [None]:
# Define regex patterns to match "Date of death" and "Time of death" content
pattern_date_of_death = r"Date of death\s*\[\*\*[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}\*\*\]"
pattern_time_of_death = r"Time of death\s*\d{1,2}:\d{2}\s*[ap]\.m\."

# Apply regex substitution to remove the patterns from the 'text' column
raw_df7['text'] = raw_df7['text'].apply(lambda x: re.sub(pattern_date_of_death, '', x, flags=re.IGNORECASE))
raw_df7['text'] = raw_df7['text'].apply(lambda x: re.sub(pattern_time_of_death, '', x, flags=re.IGNORECASE))

# **10. Save the DataFrame to a CSV file**

In [None]:
raw_df7 = raw_df7.drop(columns=['description','admittime', 'intime', 'deathtime'])
raw_df7.info()
# Save the DataFrame to a CSV file
raw_df7.to_csv(f'{path_to_processed}all_raw_data_2024.csv.gz', index=False, compression='gzip')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35268 entries, 0 to 35267
Data columns (total 63 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   hadm_id             35268 non-null  int64   
 1   subject_id          35268 non-null  int64   
 2   ethnicity           35268 non-null  object  
 3   admission_type      35268 non-null  object  
 4   admission_location  35268 non-null  object  
 5   insurance           35268 non-null  object  
 6   religion            35268 non-null  object  
 7   marital_status      35268 non-null  object  
 8   icu_los             35268 non-null  float64 
 9   gender              35268 non-null  object  
 10  age                 35268 non-null  float64 
 11  urea_n_min          35253 non-null  float64 
 12  urea_n_max          35253 non-null  float64 
 13  urea_n_mean         35253 non-null  float64 
 14  platelets_min       35248 non-null  float64 
 15  platelets_max       35248 non-null  

# **11. Save a csv to record observation containing death information which we could not completely delete.**

In [None]:
# Define a regex pattern to match common death-related keywords
death_pattern = r'\b(death|died|deceased|expire)\b'  # Example pattern for common death-related keywords

# Filter rows where 'text' column contains the word "death" (case-insensitive)
death_df = raw_df7[raw_df7['text'].str.contains(death_pattern, case=False)]

# Optionally, reset the index of the new DataFrame
death_df.reset_index(drop=True, inplace=True)

# Specify the file path where you want to save the CSV file
csv_file_path = '/content/gdrive/MyDrive/MIMIC Project/data/death_records.csv'

# Save the filtered DataFrame to a CSV file
death_df.to_csv(csv_file_path, index=False)

print(f"Filtered DataFrame saved to {csv_file_path}")

  death_df = raw_df7[raw_df7['text'].str.contains(death_pattern, case=False)]


Filtered DataFrame saved to /content/gdrive/MyDrive/MIMIC Project/data/death_records.csv
