In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Path to the raw CSV
los_csv = "../data/raw/kaggle-hospital-los/hospital_length_of_stay.csv"


In [19]:
# Load the data
df = pd.read_csv(los_csv)

# Show shape and columns
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Summarize missing values
missing = df.isnull().sum()
print("\nMissing values per column:\n", missing)


Shape: (100000, 28)
Columns: ['eid', 'vdate', 'rcount', 'gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum', 'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother', 'fibrosisandother', 'malnutrition', 'hemo', 'hematocrit', 'neutrophils', 'sodium', 'glucose', 'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration', 'secondarydiagnosisnonicd9', 'discharged', 'facid', 'lengthofstay']

Missing values per column:
 eid                           0
vdate                         0
rcount                        0
gender                        0
dialysisrenalendstage         0
asthma                        0
irondef                       0
pneum                         0
substancedependence           0
psychologicaldisordermajor    0
depress                       0
psychother                    0
fibrosisandother              0
malnutrition                  0
hemo                          0
hematocrit                    0
neutrophils                   0
sod

In [20]:
df['vdate'] = pd.to_datetime(df['vdate'])
df['year']  = df['vdate'].dt.year
df['month'] = df['vdate'].dt.month
df['day']   = df['vdate'].dt.day


In [21]:
# Map gender strings to binary
df['gender'] = df['gender'].map({'M': 1, 'F': 0})

# Confirm no nulls introduced
assert df['gender'].isnull().sum() == 0


In [22]:
# 1. Auto-cast numeric binary columns (excluding target and discharge flag)
bin_cols = [
    c for c in df.columns
    if df[c].dtype in ['int64', 'float64']
    and df[c].nunique() == 2
    and c not in ['lengthofstay', 'discharged']
]
for c in bin_cols:
    df[c] = df[c].astype(int)

print("Binary columns cast to int:", bin_cols)

# 2. Drop identifier and original date columns
df = df.drop(columns=['eid', 'facid', 'vdate'])

print("Remaining columns after drop:", df.columns.tolist())


Binary columns cast to int: ['gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum', 'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother', 'fibrosisandother', 'malnutrition', 'hemo']
Remaining columns after drop: ['rcount', 'gender', 'dialysisrenalendstage', 'asthma', 'irondef', 'pneum', 'substancedependence', 'psychologicaldisordermajor', 'depress', 'psychother', 'fibrosisandother', 'malnutrition', 'hemo', 'hematocrit', 'neutrophils', 'sodium', 'glucose', 'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration', 'secondarydiagnosisnonicd9', 'discharged', 'lengthofstay', 'year', 'month', 'day']


In [23]:
from sklearn.model_selection import train_test_split

# 1. Define features (X) and target (y)
X = df.drop(columns=['lengthofstay'])
y = df['lengthofstay']

# 2. Perform an 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test  shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test  shape:", y_test.shape)

# 3. Create processed data directories if they don't exist
import os
os.makedirs("../data/processed", exist_ok=True)

# 4. Save to CSV
X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

print("Processed files saved to data/processed/")


X_train shape: (80000, 27)
X_test  shape: (20000, 27)
y_train shape: (80000,)
y_test  shape: (20000,)
Processed files saved to data/processed/


In [24]:
# --- BEGIN PATCH FOR CATEGORICAL ENCODING ---

# 1. Identify object-dtype columns
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns to encode:", cat_cols)

# 2. One-hot encode train and test, dropping the first level to avoid collinearity
X_train_enc = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test_enc  = pd.get_dummies(X_test,  columns=cat_cols, drop_first=True)

# 3. Align train/test so they have the same dummy columns
X_train_enc, X_test_enc = X_train_enc.align(
    X_test_enc, join='left', axis=1, fill_value=0
)

print("After encoding, X_train shape:", X_train_enc.shape)
print("After encoding, X_test  shape:", X_test_enc.shape)

# Replace X_train/X_test with the encoded versions
X_train = X_train_enc
X_test  = X_test_enc


Categorical columns to encode: ['rcount', 'discharged']
After encoding, X_train shape: (80000, 407)
After encoding, X_test  shape: (20000, 407)
