In [5]:
# ==============================================================================
# PART 0: SETUP
# ==============================================================================
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import keras_tuner as kt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# ==============================================================================
# PART 1: DATA LOADING AND PREPROCESSING
# ==============================================================================
print("--- PART 1: Loading and Preprocessing Initial Data ---")

code_columns = {
    'ICD9_DGNS_CD_1': str, 'ICD9_DGNS_CD_2': str, 'ICD9_DGNS_CD_3': str,
    'ICD9_DGNS_CD_4': str, 'ICD9_DGNS_CD_5': str, 'ICD9_DGNS_CD_6': str,
    'ICD9_DGNS_CD_7': str, 'ICD9_DGNS_CD_8': str, 'ICD9_DGNS_CD_9': str,
    'ICD9_DGNS_CD_10': str, 'ADMTNG_ICD9_DGNS_CD': str, 'CLM_DRG_CD': str,
    'ICD9_PRCDR_CD_1': str, 'ICD9_PRCDR_CD_2': str, 'ICD9_PRCDR_CD_3': str,
    'ICD9_PRCDR_CD_4': str, 'ICD9_PRCDR_CD_5': str, 'ICD9_PRCDR_CD_6': str
}

beneficiary_2008 = pd.read_csv("D:/Jupyter/HealthArk_data/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv")
beneficiary_2009 = pd.read_csv("D:/Jupyter/HealthArk_data/DE1_0_2009_Beneficiary_Summary_File_Sample_1.csv")
beneficiary_2010 = pd.read_csv("D:/Jupyter/HealthArk_data/DE1_0_2010_Beneficiary_Summary_File_Sample_1.csv")

chunk_size = 100000
    
inpatient_agg_list, inpatient_codes_list, inpatient_readmission_list = [], [], []
inpatient_iterator = pd.read_csv("D:/Jupyter/HealthArk_data/DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv", dtype=code_columns, chunksize=chunk_size)
for chunk in inpatient_iterator:
    inpatient_agg_list.append(chunk.groupby('DESYNPUF_ID').agg(Inpatient_Claim_Count=('CLM_ID', 'count'), Total_Inpatient_Payments=('CLM_PMT_AMT', 'sum')))
    inpatient_codes_list.append(chunk[['DESYNPUF_ID', 'ICD9_DGNS_CD_1']])
    chunk['CLM_ADMSN_DT'] = pd.to_datetime(chunk['CLM_ADMSN_DT'], format='%Y%m%d')
    chunk['CLM_THRU_DT'] = pd.to_datetime(chunk['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')
    inpatient_readmission_list.append(chunk)

inpatient_agg = pd.concat(inpatient_agg_list).groupby(level=0).sum()
inpatient_codes = pd.concat(inpatient_codes_list)
inpatient_claims_raw = pd.concat(inpatient_readmission_list)
    
outpatient_agg_list, outpatient_codes_list = [], []
outpatient_iterator = pd.read_csv("D:/Jupyter/HealthArk_data/DE1_0_2008_to_2010_Outpatient_Claims_Sample_1.csv", dtype=code_columns, engine='python', chunksize=chunk_size)
for chunk in outpatient_iterator:
    outpatient_agg_list.append(chunk.groupby('DESYNPUF_ID').agg(Outpatient_Claim_Count=('CLM_ID', 'count'), Total_Outpatient_Payments=('CLM_PMT_AMT', 'sum')))
    outpatient_codes_list.append(chunk[['DESYNPUF_ID', 'ICD9_DGNS_CD_1']])
        
outpatient_agg = pd.concat(outpatient_agg_list).groupby(level=0).sum()
outpatient_codes = pd.concat(outpatient_codes_list)

--- PART 1: Loading and Preprocessing Initial Data ---


In [3]:
# ==============================================================================
# PART 2: FEATURE ENGINEERING AND MERGING
# ==============================================================================
print("\n--- PART 2: Engineering Features and Merging Data ---")

all_beneficiaries = pd.concat([beneficiary_2008, beneficiary_2009, beneficiary_2010], ignore_index=True)
all_beneficiaries = all_beneficiaries.drop_duplicates(subset=['DESYNPUF_ID'], keep='last')
    
all_beneficiaries['BENE_BIRTH_DT'] = pd.to_datetime(all_beneficiaries['BENE_BIRTH_DT'], format='%m-%d-%Y')
all_beneficiaries['BENE_DEATH_DT'] = pd.to_datetime(all_beneficiaries['BENE_DEATH_DT'], format='%m-%d-%Y', errors='coerce')
reference_date = datetime(2010, 12, 31)
all_beneficiaries['Age'] = ((reference_date - all_beneficiaries['BENE_BIRTH_DT']).dt.days / 365.25).astype(int)
all_beneficiaries['Is_Dead'] = all_beneficiaries['BENE_DEATH_DT'].notna().astype(int)
chronic_condition_cols = [col for col in all_beneficiaries.columns if col.startswith('SP_')]
for col in chronic_condition_cols:
    all_beneficiaries[col] = all_beneficiaries[col].replace(2, 0)
all_beneficiaries['Chronic_Condition_Count'] = all_beneficiaries[chronic_condition_cols].sum(axis=1)
    
master_df = all_beneficiaries.merge(inpatient_agg, on='DESYNPUF_ID', how='left')
master_df = master_df.merge(outpatient_agg, on='DESYNPUF_ID', how='left')
claims_cols_to_fill = ['Inpatient_Claim_Count', 'Total_Inpatient_Payments', 'Outpatient_Claim_Count', 'Total_Outpatient_Payments']
master_df[claims_cols_to_fill] = master_df[claims_cols_to_fill].fillna(0)

inpatient_claims_raw = inpatient_claims_raw.sort_values(by=['DESYNPUF_ID', 'CLM_ADMSN_DT'])
inpatient_claims_raw['Next_Admission_Date'] = inpatient_claims_raw.groupby('DESYNPUF_ID')['CLM_ADMSN_DT'].shift(-1)
days_to_next_admission = (inpatient_claims_raw['Next_Admission_Date'] - inpatient_claims_raw['CLM_THRU_DT']).dt.days
inpatient_claims_raw['Was_Readmitted_in_30_Days'] = (days_to_next_admission <= 30).astype(int)
readmission_summary = inpatient_claims_raw.groupby('DESYNPUF_ID')['Was_Readmitted_in_30_Days'].max().reset_index()
readmission_summary = readmission_summary.rename(columns={'Was_Readmitted_in_30_Days': 'Had_30Day_Readmission_Ever'})
master_df_readmission = master_df.merge(readmission_summary, on='DESYNPUF_ID', how='left')
master_df_readmission['Had_30Day_Readmission_Ever'] = master_df_readmission['Had_30Day_Readmission_Ever'].fillna(0)
    
all_codes = pd.concat([inpatient_codes, outpatient_codes], ignore_index=True)
diagnosis_counts = all_codes.groupby('DESYNPUF_ID').size().reset_index(name='Total_Diagnosis_Count')
unique_diagnosis_counts = all_codes.groupby('DESYNPUF_ID')['ICD9_DGNS_CD_1'].nunique().reset_index(name='Unique_Diagnosis_Count')
master_df_enhanced = master_df_readmission.merge(diagnosis_counts, on='DESYNPUF_ID', how='left')
master_df_enhanced = master_df_enhanced.merge(unique_diagnosis_counts, on='DESYNPUF_ID', how='left')
master_df_enhanced[['Total_Diagnosis_Count', 'Unique_Diagnosis_Count']] = master_df_enhanced[['Total_Diagnosis_Count', 'Unique_Diagnosis_Count']].fillna(0)
categorical_cols = ['BENE_SEX_IDENT_CD', 'BENE_RACE_CD']
master_df_enhanced = pd.get_dummies(master_df_enhanced, columns=categorical_cols, drop_first=True)
    
try:
    drug_exposure = pd.read_excel("D:/Jupyter/HealthArk_data/drug_exposure.xlsx")
    person_mapping = pd.read_excel("D:/Jupyter/HealthArk_data/person.xlsx")
    person_id_map = person_mapping[['PERSON_ID', 'PERSON_SOURCE_VALUE']].rename(columns={'PERSON_SOURCE_VALUE': 'DESYNPUF_ID'})
    drug_exposure = drug_exposure.merge(person_id_map, on='PERSON_ID', how='left')
    
    if 'DESYNPUF_ID' in drug_exposure.columns:
        drug_counts = drug_exposure.groupby('DESYNPUF_ID').size().reset_index(name='Total_Drug_Count')
        unique_drug_counts = drug_exposure.groupby('DESYNPUF_ID')['DRUG_CONCEPT_ID'].nunique().reset_index(name='Unique_Drug_Count')
        avg_days_supply = drug_exposure.groupby('DESYNPUF_ID')['DAYS_SUPPLY'].mean().reset_index(name='Avg_Days_Supply')
        master_df_final = master_df_enhanced.merge(drug_counts, on='DESYNPUF_ID', how='left')
        master_df_final = master_df_final.merge(unique_drug_counts, on='DESYNPUF_ID', how='left')
        master_df_final = master_df_final.merge(avg_days_supply, on='DESYNPUF_ID', how='left')
        drug_feature_cols = ['Total_Drug_Count', 'Unique_Drug_Count', 'Avg_Days_Supply']
        master_df_final[drug_feature_cols] = master_df_final[drug_feature_cols].fillna(0)
except FileNotFoundError:
    print("Drug or Person files not found. Skipping drug features.")
    master_df_final = master_df_enhanced.copy()
    master_df_final[['Total_Drug_Count', 'Unique_Drug_Count', 'Avg_Days_Supply']] = 0



--- PART 2: Engineering Features and Merging Data ---


In [6]:
# ==============================================================================
# PART 3: NEURAL NETWORK DATA PREPARATION AND TUNING
# ==============================================================================
print("\n--- PART 3: Preparing Data and Tuning the Neural Network ---")

df_for_nn = master_df_final.copy()
y = df_for_nn['Had_30Day_Readmission_Ever']
features_to_drop = ['DESYNPUF_ID', 'BENE_BIRTH_DT', 'BENE_DEATH_DT', 'Had_30Day_Readmission_Ever']
X = df_for_nn.drop(columns=features_to_drop)
X = X.select_dtypes(include=['number'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(X_train_scaled.shape[1],)))
    hp_units_1 = hp.Int('units_1', min_value=32, max_value=128, step=32)
    model.add(Dense(units=hp_units_1, activation='relu'))
    hp_dropout_1 = hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)
    model.add(Dropout(hp_dropout_1))
    hp_units_2 = hp.Int('units_2', min_value=16, max_value=64, step=16)
    model.add(Dense(units=hp_units_2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['recall'])
    return model

tuner = kt.RandomSearch(build_model,
                        objective='val_recall',
                        max_trials=10,
                        executions_per_trial=2,
                        directory='keras_tuner_dir_nn',
                        project_name='readmission_tuning_nn')

neg_cases, pos_cases = y_train.value_counts()
total = neg_cases + pos_cases
weight_for_0 = (1 / neg_cases) * (total / 2.0)
weight_for_1 = (1 / pos_cases) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

print("\n--- Starting the Search for the Best Neural Network Model ---")
tuner.search(X_train_scaled, y_train, epochs=10, validation_split=0.2, class_weight=class_weight)

best_hps = tuner.get_best_hyperparameters()[0]

print(f"""
Search complete. The optimal hyperparameters are:
- Neurons in first layer: {best_hps.get('units_1')}
- Dropout rate: {best_hps.get('dropout_1'):.2f}
- Neurons in second layer: {best_hps.get('units_2')}
- Learning rate: {best_hps.get('learning_rate')}
""")

Trial 10 Complete [00h 03m 53s]
val_recall: 1.0

Best val_recall So Far: 1.0
Total elapsed time: 00h 35m 28s

Search complete. The optimal hyperparameters are:
- Neurons in first layer: 128
- Dropout rate: 0.20
- Neurons in second layer: 64
- Learning rate: 0.01



In [8]:
# ==============================================================================
# PART 4: TRAINING AND EVALUATING THE BEST NEURAL NETWORK
# ==============================================================================
print("\n--- PART 4: Training the Best Model on the Full Training Data ---")
best_model = tuner.hypermodel.build(best_hps)
best_model.save('neural_network_champion_model.keras')
print("\nSaved trained Neural Network to 'neural_network_champion_model.keras'")
history = best_model.fit(X_train_scaled, y_train, epochs=20, validation_split=0.2, class_weight=class_weight)

print("\n--- Evaluating the Tuned Neural Network Performance ---")
y_probs = best_model.predict(X_test_scaled)
y_pred = (y_probs > 0.5).astype(int)

print("\nClassification Report (Tuned Neural Network):")
print(classification_report(y_test, y_pred))


--- PART 4: Training the Best Model on the Full Training Data ---

Saved trained Neural Network to 'neural_network_champion_model.keras'
Epoch 1/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 0.2269 - recall: 0.9695 - val_loss: 0.2347 - val_recall: 0.9907
Epoch 2/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.2007 - recall: 0.9851 - val_loss: 0.1985 - val_recall: 0.9969
Epoch 3/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.1678 - recall: 0.9906 - val_loss: 0.2430 - val_recall: 0.9990
Epoch 4/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.1927 - recall: 0.9876 - val_loss: 0.1812 - val_recall: 1.0000
Epoch 5/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.2139 - recall: 0.9945 - val_loss: 0.1776 - val_recall: 1.0000
Epoch 6/20
[1m2327/2327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3