In [31]:
# imports

import pandas as pd
import pickle
from datetime import datetime

In [32]:
# load admission data
# Raw admissions table; no date parsing is requested here, the timestamp
# columns are converted to datetimes in a later cell.
admissions = pd.read_csv('../../../data/originalData/admissions.csv')

# load X data
# Pickled object that later cells iterate as
# {subject_id: {hadm_id: record_dict}}.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own pipeline.
X_filePath = '../../../data/processedData/X_Admin.pkl'
with open(X_filePath, 'rb') as f:
    data = pickle.load(f)

In [33]:
# Parse the admission and discharge timestamp columns into datetime values
for time_col in ('admittime', 'dischtime'):
    admissions[time_col] = pd.to_datetime(admissions[time_col])

In [34]:
# Sort admissions by subject first, then chronologically by admittime, so that
# each subject's rows are in time order for the per-subject shift that follows.
admissions = admissions.sort_values(by=['subject_id', 'admittime'])

In [35]:
# Shift admittime within each subject to get the start of that subject's next
# admission (missing for the subject's last row). This relies on admissions
# already being ordered by subject_id and admittime.
admissions['next_admittime'] = admissions.groupby('subject_id')['admittime'].shift(-1)

# Gap between this discharge and the next admission, in fractional days
# (NaN when there is no next admission)
admissions['days_to_next_admission'] = (
    (admissions['next_admittime'] - admissions['dischtime']).dt.total_seconds() / (60 * 60 * 24)
)

# Define target variables for readmission within 30 and 60 days: the next
# admission must start strictly after discharge and within the window.
# Comparisons against NaN evaluate to False, so rows without a next admission
# become 0 directly and no separate fillna step is required.
days_gap = admissions['days_to_next_admission']
for window_days in (30, 60):
    admissions[f'readmitted_{window_days}'] = (
        (days_gap > 0) & (days_gap <= window_days)
    ).astype(int)


In [36]:
# Preview the first 10 rows of the sorted table to sanity-check the new columns
print(admissions.head(10))

   subject_id   hadm_id           admittime           dischtime deathtime  \
0    10000032  22595853 2180-05-06 22:23:00 2180-05-07 17:15:00       NaN   
1    10000032  22841357 2180-06-26 18:27:00 2180-06-27 18:49:00       NaN   
3    10000032  29079034 2180-07-23 12:35:00 2180-07-25 17:55:00       NaN   
2    10000032  25742920 2180-08-05 23:44:00 2180-08-07 17:50:00       NaN   
4    10000068  25022803 2160-03-03 23:16:00 2160-03-04 06:26:00       NaN   
5    10000084  23052089 2160-11-21 01:56:00 2160-11-25 14:52:00       NaN   
6    10000084  29888819 2160-12-28 05:11:00 2160-12-28 16:07:00       NaN   
7    10000108  27250926 2163-09-27 23:17:00 2163-09-28 09:04:00       NaN   
8    10000117  22927623 2181-11-15 02:05:00 2181-11-15 14:52:00       NaN   
9    10000117  27988844 2183-09-18 18:10:00 2183-09-21 16:30:00       NaN   

      admission_type      admission_location discharge_location insurance  \
0             URGENT  TRANSFER FROM HOSPITAL               HOME     Other  

In [37]:
# Add the calculated fields to the data dictionary

# Build a (subject_id, hadm_id) -> targets lookup in a single pass over the
# admissions table, instead of boolean-masking the whole DataFrame once per
# record (records x rows comparisons). setdefault keeps the first row seen for
# a key, which matches taking the first matching row if a pair is duplicated.
target_lookup = {}
for subj, hadm, days, r30, r60 in zip(
    admissions['subject_id'].tolist(),
    admissions['hadm_id'].tolist(),
    admissions['days_to_next_admission'].tolist(),
    admissions['readmitted_30'].tolist(),
    admissions['readmitted_60'].tolist(),
):
    target_lookup.setdefault((subj, hadm), (days, r30, r60))

for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        targets = target_lookup.get((subject_id, hadm_id))
        if targets is None:
            # No matching admission row: leave the record untouched
            continue
        # Add the calculated fields
        (record['days_to_next_admission'],
         record['readmitted_30'],
         record['readmitted_60']) = targets

In [39]:
def explore_nested_dict(data, top_key):
    """Print the contents stored under one top-level key of a nested dict.

    Args:
        data: Mapping whose values are themselves dictionaries.
        top_key: Key in ``data`` whose nested dictionary should be printed.

    Prints the nested dictionary's keys, then the key, value type and value
    of every entry. If ``top_key`` is absent, prints a "not found" message
    instead. Returns None.
    """
    if top_key not in data:
        print(f"Key {top_key} not found in the data.")
        return

    nested_dict = data[top_key]
    print(f"Top-level Key: {top_key}")
    print(f"Keys in nested dictionary: {nested_dict.keys()}")

    # Walk every nested entry and show its type alongside its full value
    for nested_key, value in nested_dict.items():
        print(f"\nNested Key: {nested_key}")
        print(f"Type of value: {type(value)}")
        print(f"Value sample: {value}")


# Subject whose nested records are printed as a spot check
top_key_to_inspect = 10000032

explore_nested_dict(data, top_key_to_inspect)

Top-level Key: 10000032
Keys in nested dictionary: dict_keys([22595853, 22841357, 29079034])

Nested Key: 22595853
Type of value: <class 'dict'>
Value sample: {'diagnoses': ['d_572', 'd_789', 'd_571', 'd_070', 'd_496', 'd_296', 'd_309'], 'procedures': ['pcs_549'], 'drugs': ['p_NACLFLUSH', 'p_SPIR25', 'p_RALT400', 'p_ALBU17H', 'p_FURO20', 'p_MICROK10', 'p_NICO14P', 'p_HEPA5I', 'p_SPIR25', 'p_IPRA2H', 'p_INFL0.5LF', 'p_TRUV200/300', 'p_FURO40', 'p_APAP500'], 'admitdate': '2180-05-06', 'timespent': 18, 'admission_type': 'URGENT', 'admission_location': 'TRANSFER FROM HOSPITAL', 'insurance': 'Other', 'marital_status': 'WIDOWED', 'hospital_expire_flag': 0, 'days_to_next_admission': 50.05, 'readmitted_30': 0, 'readmitted_60': 1}

Nested Key: 22841357
Type of value: <class 'dict'>
Value sample: {'diagnoses': ['d_070', 'd_789', 'd_287', 'd_276', 'd_496', 'd_571', 'd_305'], 'procedures': ['pcs_549'], 'drugs': ['p_BACTDS', 'p_TIOT', 'p_RIFA550', 'p_APAP500', 'p_ALBU25', 'p_NACLFLUSH', 'p_RALT400'

In [38]:
# Write the dictionary, including any target fields added to its records, to a
# new pickle file; this path differs from the input file it was loaded from.
output_file_path = '../../../data/processedData/X_Admin_Targets.pkl'
with open(output_file_path, 'wb') as f:
    pickle.dump(data, f)

print(f"Updated dictionary saved to {output_file_path}")

Updated dictionary saved to ../../../data/processedData/X_Admin_Targets.pkl


In [None]:
# calculate the percentage of admissions flagged as readmitted within 30 and 60 days
# (the unit counted is one record per hadm_id, not one per patient)

# Flatten {subject_id: {hadm_id: record}} into a single list of records
all_records = [
    record
    for nested_dict in data.values()
    for record in nested_dict.values()
]

total_admissions = len(all_records)
# Records that carry no target field are counted as not readmitted
readmitted_30_count = sum(1 for record in all_records if record.get('readmitted_30', 0) == 1)
readmitted_60_count = sum(1 for record in all_records if record.get('readmitted_60', 0) == 1)

# Guard against an empty dictionary so the cell reports 0.00% instead of
# raising ZeroDivisionError
if total_admissions:
    percentage_readmitted_30 = (readmitted_30_count / total_admissions) * 100
    percentage_readmitted_60 = (readmitted_60_count / total_admissions) * 100
else:
    percentage_readmitted_30 = percentage_readmitted_60 = 0.0


print(f"Total Admissions: {total_admissions}")
print(f"Readmitted within 30 days: {readmitted_30_count} ({percentage_readmitted_30:.2f}%)")
print(f"Readmitted within 60 days: {readmitted_60_count} ({percentage_readmitted_60:.2f}%)")


Total Admissions: 214752
Readmitted within 30 days: 74456 (34.67%)
Readmitted within 60 days: 99901 (46.52%)


In [41]:
# Print the same subject's records again, using the same key as the earlier spot check
explore_nested_dict(data, top_key_to_inspect)

Top-level Key: 10000032
Keys in nested dictionary: dict_keys([22595853, 22841357, 29079034])

Nested Key: 22595853
Type of value: <class 'dict'>
Value sample: {'diagnoses': ['d_572', 'd_789', 'd_571', 'd_070', 'd_496', 'd_296', 'd_309'], 'procedures': ['pcs_549'], 'drugs': ['p_NACLFLUSH', 'p_SPIR25', 'p_RALT400', 'p_ALBU17H', 'p_FURO20', 'p_MICROK10', 'p_NICO14P', 'p_HEPA5I', 'p_SPIR25', 'p_IPRA2H', 'p_INFL0.5LF', 'p_TRUV200/300', 'p_FURO40', 'p_APAP500'], 'admitdate': '2180-05-06', 'timespent': 18, 'admission_type': 'URGENT', 'admission_location': 'TRANSFER FROM HOSPITAL', 'insurance': 'Other', 'marital_status': 'WIDOWED', 'hospital_expire_flag': 0, 'days_to_next_admission': 50.05, 'readmitted_30': 0, 'readmitted_60': 1}

Nested Key: 22841357
Type of value: <class 'dict'>
Value sample: {'diagnoses': ['d_070', 'd_789', 'd_287', 'd_276', 'd_496', 'd_571', 'd_305'], 'procedures': ['pcs_549'], 'drugs': ['p_BACTDS', 'p_TIOT', 'p_RIFA550', 'p_APAP500', 'p_ALBU25', 'p_NACLFLUSH', 'p_RALT400'