In [10]:
# Data handling
import pandas as pd
import numpy as np
from io import StringIO

# ML pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_predict

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [4]:
df = pd.read_csv("data\hospital_readmissions.csv")
df.head()


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [11]:
df = pd.read_csv("data\hospital_readmissions.csv")
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [13]:
# Function to convert age like "[70-80)" -> midpoint (75)
def age_to_num(age_str):
    if pd.isna(age_str):
        return np.nan
    import re
    m = re.search(r"(\d+)-(\d+)", str(age_str))
    if m:
        return (int(m.group(1)) + int(m.group(2))) / 2
    return np.nan

# Clean yes/no columns
for col in ['glucose_test','A1Ctest','change','diabetes_med','readmitted']:
    df[col] = df[col].astype(str).str.strip().replace({'nan': None, '': None})

# Drop rows with missing target
df = df.dropna(subset=['readmitted']).reset_index(drop=True)

# Encode yes/no -> 1/0
binary_map = {'yes':1, 'no':0}
for col in ['glucose_test','A1Ctest','change','diabetes_med','readmitted']:
    df[col] = df[col].map(binary_map).astype(int)

# Convert age
df['age_num'] = df['age'].apply(age_to_num)

df.head()


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [6]:
# Encode categorical columns
label_enc = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':   # if column is string/categorical
        X[col] = label_enc.fit_transform(X[col].astype(str))

# Encode target also (yes/no -> 1/0)
y = label_enc.fit_transform(y.astype(str))


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [9]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.605
