In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [4]:
readmit_df = pd.read_csv("patient_readmission_data_clean.csv")

In [5]:
readmit_df.head()

Unnamed: 0,patient_id,visit_id,arrival_datetime,discharge_datetime,triage_category,visit_type,next_visit,readmitted_30d,days_until_next_visit,age,sex,known_chronic_condition,num_chronic_diagnoses,num_procedures
0,P000001,V0032631,2023-10-18 12:54:57.266905,2023-10-18 21:30:20.296551,GREEN,emergency,,0,0.0,37,0.0,True,0.0,0.0
1,P000002,V0005217,2023-06-27 04:38:39.246809,2024-05-19 14:19:56.200858,GREEN,outpatient,2024-04-14 09:09:11.494422,0,292.0,7,0.0,False,0.0,0.0
2,P000002,V0028298,2024-04-14 09:09:11.494422,2024-12-04 12:08:31.219782,RED,outpatient,,0,0.0,7,0.0,False,0.0,0.0
3,P000003,V0089563,2023-08-25 12:00:25.100818,2023-08-25 13:39:06.308672,YELLOW,outpatient,2023-11-12 22:26:57.020621,0,79.0,86,0.0,False,0.0,1.0
4,P000003,V0046395,2023-11-12 22:26:57.020621,2023-11-13 02:24:15.934903,GREEN,outpatient,2024-07-14 23:48:45.443898,0,245.0,86,0.0,False,0.0,1.0


In [6]:
readmit_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   patient_id               120000 non-null  object 
 1   visit_id                 120000 non-null  object 
 2   arrival_datetime         120000 non-null  object 
 3   discharge_datetime       120000 non-null  object 
 4   triage_category          120000 non-null  object 
 5   visit_type               120000 non-null  object 
 6   next_visit               74582 non-null   object 
 7   readmitted_30d           120000 non-null  int64  
 8   days_until_next_visit    120000 non-null  float64
 9   age                      120000 non-null  int64  
 10  sex                      120000 non-null  float64
 11  known_chronic_condition  120000 non-null  bool   
 12  num_chronic_diagnoses    120000 non-null  float64
 13  num_procedures           120000 non-null  float64
dtypes: b

In [7]:
readmit_df['readmitted_30d'].value_counts()

readmitted_30d
0    108625
1     11375
Name: count, dtype: int64

In [11]:
# Check missing values percentage
missing_summary = readmit_df.isnull().sum()
missing_percentage = (missing_summary / len(readmit_df)) * 100
print("Missing values summary:")
print(missing_percentage[missing_percentage > 0])

Missing values summary:
next_visit    37.848333
dtype: float64


In [12]:
readmit_df.isnull().sum()

patient_id                     0
visit_id                       0
arrival_datetime               0
discharge_datetime             0
triage_category                0
visit_type                     0
next_visit                 45418
readmitted_30d                 0
days_until_next_visit          0
age                            0
sex                            0
known_chronic_condition        0
num_chronic_diagnoses          0
num_procedures                 0
dtype: int64

In [14]:
# Convert all datetime columns
print("Converting all datetime columns to proper format...")

datetime_cols = ['arrival_datetime', 'discharge_datetime', 'next_visit']

for col in datetime_cols:
    new_col_name = f"{col}_dt" if col == 'next_visit' else col + '_dt'
    
    print(f"\n--- Converting {col} to {new_col_name} ---")
    
    # Show sample before conversion
    sample = readmit_df[col].dropna().iloc[0] if readmit_df[col].notna().any() else "No data"
    print(f"  Sample value: {sample}")
    
    # Convert to datetime
    readmit_df[new_col_name] = pd.to_datetime(readmit_df[col], errors='coerce')
    
    # Check results
    original_non_null = readmit_df[col].notna().sum()
    converted_non_null = readmit_df[new_col_name].notna().sum()
    
    print(f"  Original non-null: {original_non_null}")
    print(f"  Converted non-null: {converted_non_null}")
    print(f"  Conversion success rate: {(converted_non_null/original_non_null*100):.1f}%" if original_non_null > 0 else "  No data to convert")
    
    # Keep original column for reference
    readmit_df[f"{col}_original"] = readmit_df[col]

print("\n✓ All datetime conversions complete!")

Converting all datetime columns to proper format...

--- Converting arrival_datetime to arrival_datetime_dt ---
  Sample value: 2023-10-18 12:54:57.266905
  Original non-null: 120000
  Converted non-null: 120000
  Conversion success rate: 100.0%

--- Converting discharge_datetime to discharge_datetime_dt ---
  Sample value: 2023-10-18 21:30:20.296551
  Original non-null: 120000
  Converted non-null: 120000
  Conversion success rate: 100.0%

--- Converting next_visit to next_visit_dt ---
  Sample value: 2024-04-14 09:09:11.494422
  Original non-null: 74582
  Converted non-null: 74582
  Conversion success rate: 100.0%

✓ All datetime conversions complete!


In [15]:
print("=== CALCULATING RELEVANT LENGTH OF STAY FEATURES ===")

# Calculate length of stay in hours (known predictor of readmission)
readmit_df['length_of_stay_hours'] = (
    readmit_df['discharge_datetime_dt'] - readmit_df['arrival_datetime_dt']
).dt.total_seconds() / 3600

print(f"✓ Length of stay calculated")

# Check if longer stays correlate with readmission
print("\nLength of Stay vs Readmission Analysis:")
print("Mean LOS by readmission status:")

for readmit_status in [0, 1]:
    mean_los = readmit_df.loc[readmit_df['readmitted_30d'] == readmit_status, 'length_of_stay_hours'].mean()
    median_los = readmit_df.loc[readmit_df['readmitted_30d'] == readmit_status, 'length_of_stay_hours'].median()
    count = (readmit_df['readmitted_30d'] == readmit_status).sum()
    
    print(f"\nReadmitted = {readmit_status} ({count} patients):")
    print(f"  Mean LOS: {mean_los:.2f} hours")
    print(f"  Median LOS: {median_los:.2f} hours")

# Calculate if stay was overnight (common readmission predictor)
readmit_df['overnight_stay'] = (readmit_df['length_of_stay_hours'] > 12).astype(int)

print("\nOvernight stays (LOS > 12 hours):")
overnight_by_readmit = readmit_df.groupby('readmitted_30d')['overnight_stay'].value_counts(normalize=True).unstack()
print(overnight_by_readmit)

print("\n✓ Only clinically relevant LOS features created:")
print("  1. length_of_stay_hours (continuous predictor)")
print("  2. overnight_stay (binary predictor)")

=== CALCULATING RELEVANT LENGTH OF STAY FEATURES ===
✓ Length of stay calculated

Length of Stay vs Readmission Analysis:
Mean LOS by readmission status:

Readmitted = 0 (108625 patients):
  Mean LOS: 3003.12 hours
  Median LOS: 316.51 hours

Readmitted = 1 (11375 patients):
  Mean LOS: 3146.21 hours
  Median LOS: 495.22 hours

Overnight stays (LOS > 12 hours):
overnight_stay         0         1
readmitted_30d                    
0               0.306863  0.693137
1               0.302505  0.697495

✓ Only clinically relevant LOS features created:
  1. length_of_stay_hours (continuous predictor)
  2. overnight_stay (binary predictor)


In [16]:
print("\n=== EXTRACTING TEMPORAL READMISSION PREDICTORS ===")

# 1. Day of week (weekend vs weekday admissions often have different outcomes)
readmit_df['arrival_day_of_week'] = readmit_df['arrival_datetime_dt'].dt.dayofweek
readmit_df['is_weekend_admission'] = (readmit_df['arrival_day_of_week'] >= 5).astype(int)

print("✓ Day of week features extracted")

# 2. Hour of arrival (late night/early morning often predictive)
readmit_df['arrival_hour'] = readmit_df['arrival_datetime_dt'].dt.hour
readmit_df['late_night_admission'] = ((readmit_df['arrival_hour'] >= 22) | (readmit_df['arrival_hour'] <= 6)).astype(int)

print("✓ Time of day features extracted")

# 3. Month/season (some months have higher readmission rates)
readmit_df['arrival_month'] = readmit_df['arrival_datetime_dt'].dt.month

print("✓ Month feature extracted")

print("\nTemporal features that predict readmission:")
temporal_features = ['is_weekend_admission', 'late_night_admission', 'arrival_month']

for feature in temporal_features:
    if feature in readmit_df.columns:
        readmit_rate_by_feature = readmit_df.groupby(feature)['readmitted_30d'].mean()
        print(f"\n{feature}:")
        print(f"  Readmission rates: {readmit_rate_by_feature.to_dict()}")


=== EXTRACTING TEMPORAL READMISSION PREDICTORS ===
✓ Day of week features extracted
✓ Time of day features extracted
✓ Month feature extracted

Temporal features that predict readmission:

is_weekend_admission:
  Readmission rates: {0: 0.09535184516249315, 1: 0.0933852140077821}

late_night_admission:
  Readmission rates: {0: 0.09536755547817702, 1: 0.09384306545189826}

arrival_month:
  Readmission rates: {1: 0.10046155356967496, 2: 0.10291323622545916, 3: 0.09911659062226968, 4: 0.0951853344077357, 5: 0.09200276106892812, 6: 0.09395017793594305, 7: 0.09732003129890454, 8: 0.09551944854751354, 9: 0.0977358877164225, 10: 0.09652509652509653, 11: 0.0933952528379773, 12: 0.0733815842380378}


In [18]:
print("=== CREATING HAS_NEXT_VISIT INDICATOR ===")

# Create binary indicator for missing next_visit
readmit_df['has_next_visit'] = readmit_df['next_visit'].notna().astype(int)

print(f"✓ Created 'has_next_visit' indicator")
print(f"\nDistribution:")
print(f"  0 (no next visit scheduled): {(readmit_df['has_next_visit'] == 0).sum()} patients")
print(f"  1 (has next visit scheduled): {(readmit_df['has_next_visit'] == 1).sum()} patients")
print(f"  Percentage with follow-up: {(readmit_df['has_next_visit'].mean()*100):.1f}%")

# Check readmission rates by follow-up status
print("\nReadmission rates by follow-up status:")
readmit_rate_by_followup = readmit_df.groupby('has_next_visit')['readmitted_30d'].mean()
print(readmit_rate_by_followup)

print(f"\nKey insight:")
print(f"  Patients WITHOUT scheduled follow-up: {readmit_rate_by_followup[0]*100:.1f}% readmitted")
print(f"  Patients WITH scheduled follow-up: {readmit_rate_by_followup[1]*100:.1f}% readmitted")

print("\n✓ This is a STRONG predictor of readmission!")

=== CREATING HAS_NEXT_VISIT INDICATOR ===
✓ Created 'has_next_visit' indicator

Distribution:
  0 (no next visit scheduled): 45418 patients
  1 (has next visit scheduled): 74582 patients
  Percentage with follow-up: 62.2%

Readmission rates by follow-up status:
has_next_visit
0    0.000000
1    0.152517
Name: readmitted_30d, dtype: float64

Key insight:
  Patients WITHOUT scheduled follow-up: 0.0% readmitted
  Patients WITH scheduled follow-up: 15.3% readmitted

✓ This is a STRONG predictor of readmission!


In [19]:
print("=== ANALYZING TIME BETWEEN VISITS ===")

# First, check the current days_until_next_visit distribution
print("Current days_until_next_visit column analysis:")
print(f"Mean: {readmit_df['days_until_next_visit'].mean():.2f} days")
print(f"Median: {readmit_df['days_until_next_visit'].median():.2f} days")
print(f"Min: {readmit_df['days_until_next_visit'].min():.2f} days")
print(f"Max: {readmit_df['days_until_next_visit'].max():.2f} days")

print(f"\nUnique values: {sorted(readmit_df['days_until_next_visit'].unique()[:10])}...")

# Check relationship with has_next_visit
print("\nRelationship with has_next_visit:")
print("For patients WITHOUT scheduled follow-up (has_next_visit = 0):")
no_followup = readmit_df[readmit_df['has_next_visit'] == 0]
print(f"  days_until_next_visit values: {sorted(no_followup['days_until_next_visit'].unique())}")
print(f"  Always 0 days: {(no_followup['days_until_next_visit'] == 0).all()}")

print("\nFor patients WITH scheduled follow-up (has_next_visit = 1):")
with_followup = readmit_df[readmit_df['has_next_visit'] == 1]
print(f"  days_until_next_visit stats:")
print(f"    Mean: {with_followup['days_until_next_visit'].mean():.2f} days")
print(f"    Min: {with_followup['days_until_next_visit'].min():.2f} days")
print(f"    Max: {with_followup['days_until_next_visit'].max():.2f} days")

# Create binned feature for follow-up timing
print("\n=== CREATING FOLLOW-UP TIMING FEATURES ===")

# Create meaningful bins for follow-up timing
readmit_df['followup_timing'] = 'no_followup'  # Default

# For patients with follow-up, bin by days
has_followup_mask = readmit_df['has_next_visit'] == 1

readmit_df.loc[has_followup_mask, 'followup_timing'] = pd.cut(
    readmit_df.loc[has_followup_mask, 'days_until_next_visit'],
    bins=[0, 1, 7, 14, 30, 90, float('inf')],
    labels=['same_day', 'within_week', '1_2_weeks', '2_4_weeks', '1_3_months', '3+_months']
)

print("Follow-up timing distribution:")
print(readmit_df['followup_timing'].value_counts())

print("\nReadmission rates by follow-up timing:")
readmit_rates = readmit_df.groupby('followup_timing')['readmitted_30d'].mean().sort_values(ascending=False)
for timing, rate in readmit_rates.items():
    count = (readmit_df['followup_timing'] == timing).sum()
    print(f"  {timing}: {rate*100:.1f}% readmitted ({count} patients)")

print("\n✓ Created followup_timing feature - strong predictor of readmission")

=== ANALYZING TIME BETWEEN VISITS ===
Current days_until_next_visit column analysis:
Mean: 101.18 days
Median: 40.00 days
Min: 0.00 days
Max: 724.00 days

Unique values: [np.float64(0.0), np.float64(13.0), np.float64(79.0), np.float64(89.0), np.float64(122.0), np.float64(135.0), np.float64(245.0), np.float64(281.0), np.float64(292.0), np.float64(293.0)]...

Relationship with has_next_visit:
For patients WITHOUT scheduled follow-up (has_next_visit = 0):
  days_until_next_visit values: [np.float64(0.0)]
  Always 0 days: True

For patients WITH scheduled follow-up (has_next_visit = 1):
  days_until_next_visit stats:
    Mean: 162.80 days
    Min: 0.00 days
    Max: 724.00 days

=== CREATING FOLLOW-UP TIMING FEATURES ===
Follow-up timing distribution:
followup_timing
no_followup    45418
3+_months      45349
1_3_months     17858
2_4_weeks       5702
1_2_weeks       2608
within_week     2266
same_day         431
Name: count, dtype: int64

Readmission rates by follow-up timing:
  1_2_weeks: 

In [20]:
print("=== INVESTIGATING DATA LEAKAGE ===")

# Check if days_until_next_visit is future-dated relative to discharge
print("Checking temporal relationship:")

# Sample check: For patients readmitted, what's their next visit timing?
readmitted = readmit_df[readmit_df['readmitted_30d'] == 1]
print(f"\nReadmitted patients ({len(readmitted)}):")
print(f"  Mean days_until_next_visit: {readmitted['days_until_next_visit'].mean():.2f}")
print(f"  Min days_until_next_visit: {readmitted['days_until_next_visit'].min():.2f}")
print(f"  Max days_until_next_visit: {readmitted['days_until_next_visit'].max():.2f}")

not_readmitted = readmit_df[readmit_df['readmitted_30d'] == 0]
print(f"\nNot readmitted patients ({len(not_readmitted)}):")
print(f"  Mean days_until_next_visit: {not_readmitted['days_until_next_visit'].mean():.2f}")
print(f"  Min days_until_next_visit: {not_readmitted['days_until_next_visit'].min():.2f}")
print(f"  Max days_until_next_visit: {not_readmitted['days_until_next_visit'].max():.2f}")

# Quick fix: Drop followup_timing column (leaking target info)
readmit_df = readmit_df.drop(columns=['followup_timing'], errors='ignore')
print("\n✓ Dropped problematic followup_timing column")

# Keep only has_next_visit as safe feature
print("\nUsing only safe temporal features:")
safe_features = ['length_of_stay_hours', 'overnight_stay', 
                 'is_weekend_admission', 'late_night_admission',
                 'arrival_month', 'has_next_visit']
print(f"Features: {safe_features}")

=== INVESTIGATING DATA LEAKAGE ===
Checking temporal relationship:

Readmitted patients (11375):
  Mean days_until_next_visit: 14.73
  Min days_until_next_visit: 0.00
  Max days_until_next_visit: 30.00

Not readmitted patients (108625):
  Mean days_until_next_visit: 110.24
  Min days_until_next_visit: 0.00
  Max days_until_next_visit: 724.00

✓ Dropped problematic followup_timing column

Using only safe temporal features:
Features: ['length_of_stay_hours', 'overnight_stay', 'is_weekend_admission', 'late_night_admission', 'arrival_month', 'has_next_visit']


In [21]:
print("\n=== CREATING PATIENT HISTORY FEATURES ===")

# Count previous visits per patient
patient_visit_counts = readmit_df.groupby('patient_id').size()
readmit_df['previous_visit_count'] = readmit_df['patient_id'].map(patient_visit_counts) - 1

print(f"✓ Created previous_visit_count")
print(f"\nDistribution:")
print(readmit_df['previous_visit_count'].describe())

# Check if frequent visitors have higher readmission risk
print("\nReadmission rate by visit frequency:")
readmit_df['frequent_visitor'] = (readmit_df['previous_visit_count'] > 2).astype(int)
freq_readmit = readmit_df.groupby('frequent_visitor')['readmitted_30d'].mean()
print(f"  Non-frequent visitors: {freq_readmit[0]*100:.1f}% readmitted")
print(f"  Frequent visitors (>2 previous visits): {freq_readmit[1]*100:.1f}% readmitted")

# Time since last visit (if we had historical data, but we can't calculate without dates)
print("\n✓ Patient history features created safely")


=== CREATING PATIENT HISTORY FEATURES ===
✓ Created previous_visit_count

Distribution:
count    120000.000000
mean          2.404083
std           1.555277
min           0.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          10.000000
Name: previous_visit_count, dtype: float64

Readmission rate by visit frequency:
  Non-frequent visitors: 5.3% readmitted
  Frequent visitors (>2 previous visits): 15.0% readmitted

✓ Patient history features created safely


In [22]:
print("\n=== PREPARING FINAL FEATURE SET ===")

# Select features that don't leak target information
final_features = [
    # Demographics
    'age', 'sex',
    
    # Clinical factors
    'known_chronic_condition', 'num_chronic_diagnoses', 'num_procedures',
    
    # Visit characteristics
    'triage_category', 'visit_type',
    
    # Temporal features (safe)
    'length_of_stay_hours', 'overnight_stay',
    'is_weekend_admission', 'late_night_admission', 'arrival_month',
    'has_next_visit',
    
    # Patient history
    'previous_visit_count', 'frequent_visitor'
]

print(f"Total features for model: {len(final_features)}")
print("\nFeature categories:")
print(f"  1. Demographics (2)")
print(f"  2. Clinical factors (3)")
print(f"  3. Visit characteristics (2)")
print(f"  4. Temporal features (6)")
print(f"  5. Patient history (2)")

# Check feature availability
missing = [f for f in final_features if f not in readmit_df.columns]
if missing:
    print(f"\n⚠️ Missing features: {missing}")
else:
    print("\n✓ All features available")


=== PREPARING FINAL FEATURE SET ===
Total features for model: 15

Feature categories:
  1. Demographics (2)
  2. Clinical factors (3)
  3. Visit characteristics (2)
  4. Temporal features (6)
  5. Patient history (2)

✓ All features available


In [23]:
print("=== ENCODING CATEGORICAL VARIABLES ===")

# 1. Check categorical columns
categorical_cols = ['triage_category', 'visit_type']
print(f"Categorical columns to encode: {categorical_cols}")

for col in categorical_cols:
    print(f"\n{col}:")
    print(f"  Unique values: {readmit_df[col].unique()}")
    print(f"  Value counts:\n{readmit_df[col].value_counts()}")

# 2. Apply one-hot encoding (safer than label encoding for tree models)
print("\nApplying one-hot encoding...")
readmit_df_encoded = pd.get_dummies(readmit_df, columns=categorical_cols, drop_first=True)

print(f"Original shape: {readmit_df.shape}")
print(f"Encoded shape: {readmit_df_encoded.shape}")
print(f"Added {readmit_df_encoded.shape[1] - readmit_df.shape[1]} new columns")

# 3. Update final features list
encoded_features = [col for col in readmit_df_encoded.columns 
                    if col.startswith(tuple(categorical_cols)) and col not in categorical_cols]
print(f"\nNew encoded features: {encoded_features[:5]}... ({len(encoded_features)} total)")

print("\n✓ Categorical encoding complete")

=== ENCODING CATEGORICAL VARIABLES ===
Categorical columns to encode: ['triage_category', 'visit_type']

triage_category:
  Unique values: ['GREEN' 'RED' 'YELLOW' 'ORANGE']
  Value counts:
triage_category
YELLOW    30117
RED       30022
GREEN     29956
ORANGE    29905
Name: count, dtype: int64

visit_type:
  Unique values: ['emergency' 'outpatient' 'inpatient']
  Value counts:
visit_type
outpatient    53962
emergency     49494
inpatient     16544
Name: count, dtype: int64

Applying one-hot encoding...
Original shape: (120000, 30)
Encoded shape: (120000, 33)
Added 3 new columns

New encoded features: ['triage_category_ORANGE', 'triage_category_RED', 'triage_category_YELLOW', 'visit_type_inpatient', 'visit_type_outpatient']... (5 total)

✓ Categorical encoding complete


In [24]:
print("\n=== PREPARING DATA FOR MODELING ===")

# Define feature set (update with encoded features)
model_features = [
    'age', 'sex',
    'known_chronic_condition', 'num_chronic_diagnoses', 'num_procedures',
    'length_of_stay_hours', 'overnight_stay',
    'is_weekend_admission', 'late_night_admission', 'arrival_month',
    'has_next_visit', 'previous_visit_count', 'frequent_visitor'
] + encoded_features

print(f"Total features for modeling: {len(model_features)}")

# Prepare X and y
X = readmit_df_encoded[model_features]
y = readmit_df_encoded['readmitted_30d']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Positive class (readmitted): {y.sum()} ({y.mean()*100:.1f}%)")

# Quick check for missing values
print(f"\nMissing values in X: {X.isna().sum().sum()}")
print(f"Missing values in y: {y.isna().sum()}")


=== PREPARING DATA FOR MODELING ===
Total features for modeling: 18
X shape: (120000, 18)
y shape: (120000,)
Positive class (readmitted): 11375 (9.5%)

Missing values in X: 0
Missing values in y: 0


In [25]:
print("\n=== TIME-BASED DATA SPLIT ===")

# Sort by arrival datetime for time-based split
readmit_df_encoded = readmit_df_encoded.sort_values('arrival_datetime_dt')

# Use 80% for training, 20% for testing (chronological split)
split_idx = int(len(readmit_df_encoded) * 0.8)

X_train = X.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test = y.iloc[split_idx:]

print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"\nTraining period: {readmit_df_encoded.iloc[0]['arrival_datetime_dt']} to {readmit_df_encoded.iloc[split_idx-1]['arrival_datetime_dt']}")
print(f"Test period: {readmit_df_encoded.iloc[split_idx]['arrival_datetime_dt']} to {readmit_df_encoded.iloc[-1]['arrival_datetime_dt']}")

print("\nClass distribution:")
print(f"Train - Readmitted: {y_train.mean()*100:.1f}%")
print(f"Test - Readmitted: {y_test.mean()*100:.1f}%")


=== TIME-BASED DATA SPLIT ===
Training set: 96000 samples (80.0%)
Test set: 24000 samples (20.0%)

Training period: 2023-01-01 00:06:37.648042 to 2024-08-06 06:06:23.591299
Test period: 2024-08-06 06:08:03.833945 to 2024-12-30 23:58:38.454968

Class distribution:
Train - Readmitted: 9.5%
Test - Readmitted: 9.6%


In [27]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   --------------------- ------------------ 0.8/1.5 MB 958.5 kB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 1.0 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.2 MB/s  0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [28]:
print("=== TRAINING LIGHTGBM MODEL ===")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report

# Train LightGBM with class weighting
print("Training model...")
model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    class_weight='balanced',  # Handle imbalance
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

print("✓ Model training complete")

=== TRAINING LIGHTGBM MODEL ===
Training model...
✓ Model training complete


In [29]:
print("\n=== MODEL EVALUATION ===")

# Predictions
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Precision-Recall AUC (better for imbalanced data)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall, precision)
print(f"PR-AUC Score: {pr_auc:.4f}")

# Classification report
print("\nClassification Report (threshold=0.5):")
print(classification_report(y_test, y_pred, target_names=['Not Readmitted', 'Readmitted']))

# Find optimal threshold for readmission prediction
print("\nFinding optimal threshold for readmission prediction...")
f1_scores = []
thresholds = np.arange(0.1, 0.6, 0.05)
for thresh in thresholds:
    y_pred_thresh = (y_pred_proba > thresh).astype(int)
    report = classification_report(y_test, y_pred_thresh, output_dict=True, zero_division=0)
    f1_scores.append(report['1']['f1-score'])

optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.2f} (maximizes F1-score)")


=== MODEL EVALUATION ===
ROC-AUC Score: 0.7887
PR-AUC Score: 0.2373

Classification Report (threshold=0.5):
                precision    recall  f1-score   support

Not Readmitted       0.98      0.55      0.70     21702
    Readmitted       0.17      0.90      0.29      2298

      accuracy                           0.58     24000
     macro avg       0.58      0.72      0.50     24000
  weighted avg       0.90      0.58      0.66     24000


Finding optimal threshold for readmission prediction...
Optimal threshold: 0.55 (maximizes F1-score)


In [30]:
print("\n=== FEATURE IMPORTANCE ===")

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': model_features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 most important features for readmission prediction:")
for i, row in feature_importance.head(10).iterrows():
    print(f"  {row['feature']}: {row['importance']:.3f}")

# Visual summary
print("\nKey predictors identified:")
print("1. Clinical factors (chronic conditions, procedures)")
print("2. Patient history (previous visits)")
print("3. Visit characteristics (triage, type)")
print("4. Temporal patterns (LOS, timing)")


=== FEATURE IMPORTANCE ===
Top 10 most important features for readmission prediction:
  length_of_stay_hours: 111.000
  previous_visit_count: 103.000
  arrival_month: 89.000
  age: 72.000
  num_procedures: 16.000
  overnight_stay: 14.000
  has_next_visit: 13.000
  triage_category_YELLOW: 8.000
  visit_type_inpatient: 8.000
  late_night_admission: 6.000

Key predictors identified:
1. Clinical factors (chronic conditions, procedures)
2. Patient history (previous visits)
3. Visit characteristics (triage, type)
4. Temporal patterns (LOS, timing)


In [32]:
print("\n=== GENERATING PATIENT RISK PREDICTIONS ===")

# Predict on all data
readmit_df_encoded['readmission_risk_score'] = model.predict_proba(
    readmit_df_encoded[model_features]
)[:, 1]

# Apply optimal threshold
readmit_df_encoded['predicted_readmission'] = (
    readmit_df_encoded['readmission_risk_score'] > optimal_threshold
).astype(int)

# Identify high-risk patients
high_risk_threshold = np.percentile(readmit_df_encoded['readmission_risk_score'], 90)
high_risk_patients = readmit_df_encoded[
    readmit_df_encoded['readmission_risk_score'] > high_risk_threshold
]

print(f"High-risk patients (top 10%): {len(high_risk_patients)}")
print(f"Mean risk score: {high_risk_patients['readmission_risk_score'].mean():.3f}")
print(f"Actual readmission rate in high-risk group: {high_risk_patients['readmitted_30d'].mean()*100:.1f}%")




=== GENERATING PATIENT RISK PREDICTIONS ===
High-risk patients (top 10%): 11998
Mean risk score: 0.636
Actual readmission rate in high-risk group: 26.6%


In [33]:
print("=== MAPPING PREDICTIONS TO ORIGINAL DATA ===")

# Create a clean results dataframe with original data and predictions
results_df = readmit_df.copy()

# Add predictions from the encoded dataframe
prediction_cols = ['readmission_risk_score', 'predicted_readmission', 'has_next_visit']
for col in prediction_cols:
    if col in readmit_df_encoded.columns:
        results_df[col] = readmit_df_encoded[col].values

print(f"✓ Added predictions to original dataframe")
print(f"New columns added: {prediction_cols}")

# Verify the mapping
print(f"\nShape of original dataframe: {readmit_df.shape}")
print(f"Shape of results dataframe: {results_df.shape}")
print(f"Columns in results: {list(results_df.columns)}")

# Show sample of results with key information
print("\nSample of high-risk patients with full information:")
sample_cols = ['patient_id', 'visit_id', 'age', 'triage_category', 'visit_type',
               'length_of_stay_hours', 'previous_visit_count', 
               'readmission_risk_score', 'predicted_readmission', 'readmitted_30d']

high_risk_sample = results_df[
    results_df['readmission_risk_score'] > np.percentile(results_df['readmission_risk_score'], 90)
].head(3)[sample_cols]

print(high_risk_sample.to_string(index=False))

# Calculate performance metrics on original data
print(f"\n=== PERFORMANCE ON ORIGINAL DATA ===")
print(f"Total patients: {len(results_df)}")
print(f"High-risk patients (top 10%): {(results_df['readmission_risk_score'] > np.percentile(results_df['readmission_risk_score'], 90)).sum()}")
print(f"Predicted to be readmitted: {results_df['predicted_readmission'].sum()} (using threshold {optimal_threshold})")

# Create risk categories for operational use
print(f"\n=== CREATING RISK CATEGORIES ===")

results_df['risk_category'] = pd.cut(
    results_df['readmission_risk_score'],
    bins=[0, 0.3, 0.5, 0.7, 1.0],
    labels=['Low', 'Medium', 'High', 'Critical']
)

print("Risk category distribution:")
print(results_df['risk_category'].value_counts().sort_index())

print("\nActual readmission rates by risk category:")
for category in ['Low', 'Medium', 'High', 'Critical']:
    category_data = results_df[results_df['risk_category'] == category]
    readmission_rate = category_data['readmitted_30d'].mean() * 100
    count = len(category_data)
    print(f"  {category}: {readmission_rate:.1f}% readmitted ({count} patients)")

=== MAPPING PREDICTIONS TO ORIGINAL DATA ===
✓ Added predictions to original dataframe
New columns added: ['readmission_risk_score', 'predicted_readmission', 'has_next_visit']

Shape of original dataframe: (120000, 30)
Shape of results dataframe: (120000, 32)
Columns in results: ['patient_id', 'visit_id', 'arrival_datetime', 'discharge_datetime', 'triage_category', 'visit_type', 'next_visit', 'readmitted_30d', 'days_until_next_visit', 'age', 'sex', 'known_chronic_condition', 'num_chronic_diagnoses', 'num_procedures', 'next_visit_dt', 'arrival_datetime_dt', 'arrival_datetime_original', 'discharge_datetime_dt', 'discharge_datetime_original', 'next_visit_original', 'length_of_stay_hours', 'overnight_stay', 'arrival_day_of_week', 'is_weekend_admission', 'arrival_hour', 'late_night_admission', 'arrival_month', 'has_next_visit', 'previous_visit_count', 'frequent_visitor', 'readmission_risk_score', 'predicted_readmission']

Sample of high-risk patients with full information:
patient_id visit_

In [34]:
print("=== SAVING PREDICTIONS TO CSV ===")

# Create the dataframe from Step 24 (before additional modifications)
output_df = readmit_df.copy()

# Add only the essential prediction columns
output_df['readmission_risk_score'] = readmit_df_encoded['readmission_risk_score'].values
output_df['predicted_readmission'] = readmit_df_encoded['predicted_readmission'].values
output_df['has_next_visit'] = readmit_df_encoded['has_next_visit'].values

# Keep original column order plus new prediction columns
original_cols = list(readmit_df.columns)
new_cols = ['readmission_risk_score', 'predicted_readmission', 'has_next_visit']
output_cols = original_cols + new_cols

output_df = output_df[output_cols]

# Save to CSV
output_file = 'patient_readmission_predictions.csv'
output_df.to_csv(output_file, index=False)

print(f"✓ Saved predictions to: {output_file}")
print(f"✓ Rows: {len(output_df)}")
print(f"✓ Columns: {len(output_df.columns)}")
print(f"\nFirst 5 columns: {output_df.columns[:5].tolist()}")
print(f"Last 5 columns: {output_df.columns[-5:].tolist()}")
print(f"\nPrediction columns added:")
print(f"  1. readmission_risk_score: Probability of readmission (0-1)")
print(f"  2. predicted_readmission: Binary prediction (1=readmission likely)")
print(f"  3. has_next_visit: Binary indicator from original data")

=== SAVING PREDICTIONS TO CSV ===
✓ Saved predictions to: patient_readmission_predictions.csv
✓ Rows: 120000
✓ Columns: 33

First 5 columns: ['patient_id', 'visit_id', 'arrival_datetime', 'discharge_datetime', 'triage_category']
Last 5 columns: ['previous_visit_count', 'frequent_visitor', 'readmission_risk_score', 'predicted_readmission', 'has_next_visit']

Prediction columns added:
  1. readmission_risk_score: Probability of readmission (0-1)
  2. predicted_readmission: Binary prediction (1=readmission likely)
  3. has_next_visit: Binary indicator from original data
