In [2]:
import json

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Load the dataset CSV (path relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='Synthetic_Patient_Dataset.csv')

In [4]:

# 1. Separate the 'highRisk' label from the predictor columns
y = df['highRisk']
X = df.drop(columns=['highRisk'])

# 2. Identify categorical and numeric features by dtype.
# 'number' matches every numeric dtype (e.g. int32, float32), not only
# int64/float64, and 'category' columns are treated as categorical alongside
# 'object'. This matters because the ColumnTransformer built from these lists
# keeps its default remainder='drop': a column that lands in neither list is
# silently discarded before it reaches the classifier.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(include='number').columns.tolist()

# 3. Preprocessing: one sub-pipeline per feature family, routed by column list

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own set of columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 4. Full model: preprocessing followed by a 100-tree random forest (seed 42)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

In [5]:

# 5. Hold out 20% of the rows for testing, stratified on the target (seed 42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [6]:

# 6. Fit preprocessing and classifier together, on the training split only
clf.fit(X_train, y=y_train)


In [7]:
# 7. Score the held-out split: hard labels feed the classification report,
# the probability column at index 1 (class 1) feeds ROC AUC
y_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print("Classification Report:\n", test_report)
print("ROC AUC Score: ", test_auc)

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.74      0.67       103
           1       0.64      0.49      0.56        97

    accuracy                           0.62       200
   macro avg       0.62      0.62      0.61       200
weighted avg       0.62      0.62      0.61       200

ROC AUC Score:  0.6696526874186768


In [8]:
# Save the trained pipeline (preprocessing + classifier) to disk.
# The call is left as the cell's last expression so the list of written
# file names is displayed.
joblib.dump(clf, 'hospitalization_model.pkl')

['hospitalization_model.pkl']

In [11]:
# Load the saved pipeline back from disk.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load model files written by this notebook or a trusted source.
model = joblib.load('hospitalization_model.pkl')

# Sample one example from X_test (fixed seed, so the same row on every run)
X_example = X_test.sample(1, random_state=42)
example_dict = X_example.iloc[0].to_dict()

# Rebuild a one-row frame from the plain dict once and reuse it for both
# model calls instead of constructing it twice.
example_frame = pd.DataFrame([example_dict])

# Run inference
prediction = model.predict(example_frame)[0]
probability = model.predict_proba(example_frame)[0, 1]

# Print as JSON
json_input = json.dumps(example_dict, indent=2)
print("=== Example Input as JSON ===")
print(json_input)
print("\n=== Inference Result ===")
print(f"High Risk Prediction: {bool(prediction)}")
print(f"Probability of High Risk: {probability:.2f}")


=== Example Input as JSON ===
{
  "medicationsTaken": 0,
  "painReport": 1,
  "mood": "tired",
  "memoryIssuesNoted": 0,
  "foodIntake": "increased",
  "sleepQuality": "normal",
  "ableToLeaveHouse": 1,
  "needsFollowUp": 1,
  "appointmentMissed": 0,
  "smallTalkTopic": "Gardening",
  "enthusiasmLevel": "medium",
  "topicInterest": "low",
  "conversationFlow": "smooth",
  "nKeyInsights": 1,
  "nRecommendations": 0,
  "nRiskFactors": 2,
  "nFollowUpTopics": 1,
  "age": 67,
  "sex": "Male",
  "race": "Hispanic",
  "livingSituation": "alone",
  "socialSupportScore": 2,
  "admissions6m": 1,
  "edVisits6m": 0,
  "pcVisits1y": 2,
  "daysSinceLastDischarge": 137,
  "priorFall": 0,
  "systolicBP": 104.0,
  "diastolicBP": 80.0,
  "heartRate": 61.0,
  "respRate": 16.0,
  "temperature": 36.8,
  "spo2": 95.0,
  "weight": 73.3,
  "height": 1.79,
  "BMI": 22.9,
  "hemoglobin": 15.9,
  "wbc": 4.2,
  "bun": 18.2,
  "creatinine": 1.15,
  "sodium": 142.4,
  "potassium": 4.6,
  "hba1c": 6.6,
  "ntprobnp"