In [2]:
"""
train_model.py

- Loads donors.csv
- Creates target label 'is_rare' (1 if blood_type == 'Bombay(Oh)')
- Trains RandomForest to predict rare blood
- Saves model as raktsetu_rf.joblib
- Outputs metrics summary and feature importances
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# Config
RANDOM_STATE = 42
MODEL_OUT = "raktsetu_rf.joblib"

def load_data(path="donors.csv"):
    df = pd.read_csv(path)
    # target: is rare (Bombay)
    df["is_rare"] = (df["blood_type"] == "Bombay(Oh)").astype(int)
    return df

def make_features(df):
    # features: surname, location, age, sex
    X = df[["surname","location","age","sex"]].copy()
    y = df["is_rare"]
    return X, y

if __name__ == "__main__":
    df = load_data("donors.csv")
    X, y = make_features(df)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    cat_cols = ["surname","location","sex"]
    num_cols = ["age"]

    # Fixed OneHotEncoder for scikit-learn >=1.2
    preproc = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols)
    ])

    model = Pipeline([
        ("pre", preproc),
        ("clf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1))
    ])

    print("Training model...")
    model.fit(X_train, y_train)

    # Evaluate
    y_proba = model.predict_proba(X_test)[:,1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_proba) if len(np.unique(y_test))>1 else float("nan")
    acc = accuracy_score(y_test, y_pred)

    print(f"AUC: {auc:.4f} | ACC: {acc:.4f}")

    # Show some sample predictions
    sample = X_test.sample(5, random_state=RANDOM_STATE)
    print("\nSample predictions:")
    for idx, row in sample.iterrows():
        # Convert Series to 1-row DataFrame
        p = model.predict_proba(row.to_frame().T)[0][1]
        print(row.to_dict(), "-> prob_rare:", round(p,3))

    # Feature importance (rough)
    clf = model.named_steps["clf"]
    try:
        ohe = model.named_steps["pre"].named_transformers_["cat"]
        ohe_cols = ohe.get_feature_names_out(cat_cols).tolist()
        feature_names = ohe_cols + num_cols
        importances = clf.feature_importances_
        imp_df = pd.DataFrame({"feature":feature_names, "importance":importances})\
                     .sort_values("importance", ascending=False).head(10)
        print("\nTop feature importances:")
        print(imp_df.to_string(index=False))
    except Exception as e:
        print("Could not compute human-readable importances:", e)

    # Save model
    joblib.dump(model, MODEL_OUT)
    print("\nSaved model to", MODEL_OUT)

    # Add model probabilities to donors.csv as a column
    df["pred_prob"] = model.predict_proba(df[["surname","location","age","sex"]])[:,1]
    df.to_csv("donors_with_pred.csv", index=False)
    print("Saved donors_with_pred.csv with predicted probabilities.")


Training model...
AUC: 0.4730 | ACC: 0.8250

Sample predictions:
{'surname': 'Khan', 'location': 'Goa', 'age': 43, 'sex': 'M'} -> prob_rare: 0.0
{'surname': 'Fernandes', 'location': 'Bengaluru', 'age': 20, 'sex': 'F'} -> prob_rare: 0.015
{'surname': 'Sawant', 'location': 'Goa', 'age': 44, 'sex': 'M'} -> prob_rare: 0.111
{'surname': 'Khan', 'location': 'Bengaluru', 'age': 18, 'sex': 'M'} -> prob_rare: 0.012
{'surname': 'Sharma', 'location': 'Mumbai', 'age': 31, 'sex': 'F'} -> prob_rare: 0.68

Top feature importances:
          feature  importance
              age    0.788082
            sex_F    0.015099
            sex_M    0.014565
    location_Pune    0.014399
     location_Goa    0.014061
  location_Nagpur    0.013986
  location_Mumbai    0.012918
surname_Fernandes    0.012741
   surname_Bhagat    0.012480
   surname_Sawant    0.012394

Saved model to raktsetu_rf.joblib
Saved donors_with_pred.csv with predicted probabilities.
