# In [None]:
# Colab-only step: ask the user to upload files into the notebook session.
# NOTE(review): presumably the two CSV files read further down are the ones
# uploaded here — confirm. `uploaded` is not referenced again in the visible code.
from google.colab import files
uploaded = files.upload()

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
import joblib


def process_newborn_data(
    newborn_path: str,
    vaccine_schedule_path: str,
    drop_name: bool = True
) -> pd.DataFrame:
    """
    Load, clean and feature-engineer the newborn monitoring records.

    Steps:
      1) Data cleaning: required-column check, numeric coercion, removal of
         rows that miss a critical field
      2) Data integration: load the vaccine schedule
      3) Data reduction: optionally drop the ``name`` column
      4) Data transformation: derived metrics and vital-sign flags
      5) Data discretization: age groups
      6) Data augmentation: vaccine due / past-due features

    Every derived column is computed with vectorized operations rather than
    row-wise ``DataFrame.apply``, so the function stays fast on large files
    and still returns a well-formed (empty) frame when cleaning removes
    every row.

    Args:
        newborn_path: CSV file holding the newborn monitoring records.
        vaccine_schedule_path: CSV file holding the vaccine schedule; its
            ``In_days`` and ``Vaccine`` columns are read.
        drop_name: If True, remove the ``name`` column from the result.

    Returns:
        The cleaned records (index reset) with these columns appended, in
        this order: ``age_months``, ``bmi_like``, ``low_birth_weight``,
        ``premature``, ``fever_flag``, ``oxygen_low_flag``,
        ``tachycardia_flag``, ``tachypnea_flag``, ``age_group``,
        ``due_vaccines_count``, ``past_due_vaccines_count``, ``mr1_due``,
        ``mr2_due``, ``mr1_overdue``, ``mr2_overdue``.

    Raises:
        ValueError: If the newborn file lacks any of the expected columns.
    """
    # Ages (in days) used for the two MR-specific feature pairs.
    MR1_DAY = 270
    MR2_DAY = 450

    # -----------------------------
    # 1. Load and basic data cleaning
    # -----------------------------
    df = pd.read_csv(newborn_path)
    vacc = pd.read_csv(vaccine_schedule_path)

    # Ensure expected columns exist
    required_cols = [
        "babyid","name","gender","gestationalageweeks",
        "birthweightkg","birthlengthcm","birthheadcircumferencecm",
        "date","agedays","weightkg","lengthcm","headcircumferencecm",
        "temperature","heartratebpm","respiratoryratebpm","oxygensaturation",
        "feedingtype","feedingfrequencyperday","urineoutputcount","stoolcount",
        "jaundicelevelmgdl","apgarscore","immunizationsdone","reflexesnormal","risklevel"
    ]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns in newborn file: {missing}")

    # Convert types; anything unparseable becomes NaN
    numeric_cols = [
        "gestationalageweeks","birthweightkg","birthlengthcm","birthheadcircumferencecm",
        "agedays","weightkg","lengthcm","headcircumferencecm",
        "temperature","heartratebpm","respiratoryratebpm","oxygensaturation",
        "feedingfrequencyperday","urineoutputcount","stoolcount",
        "jaundicelevelmgdl","apgarscore"
    ]
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Drop rows with missing critical fields. The other numeric columns may
    # still contain NaN after this step.
    critical_cols = ["babyid","gender","agedays","weightkg","lengthcm","apgarscore"]
    df = df.dropna(subset=critical_cols).reset_index(drop=True)

    # -----------------------------
    # 2. Data integration: Vaccine schedule
    # -----------------------------
    vacc["In_days"] = pd.to_numeric(vacc["In_days"], errors="coerce")

    # -----------------------------
    # 3. Data reduction: drop non-essential columns
    # -----------------------------
    if drop_name and "name" in df.columns:
        df = df.drop(columns=["name"])

    # -----------------------------
    # 4. Data transformation: derived features
    # -----------------------------
    age = df["agedays"]  # never NaN here: it is one of the critical columns

    # Age in months
    df["age_months"] = age / 30.4375

    # BMI-like metric (length converted from cm to m)
    df["bmi_like"] = df["weightkg"] / (df["lengthcm"] / 100.0) ** 2

    # Low birth weight & prematurity
    df["low_birth_weight"] = (df["birthweightkg"] < 2.5).astype(int)
    df["premature"] = (df["gestationalageweeks"] < 37).astype(int)

    # Fever
    df["fever_flag"] = (df["temperature"] >= 37.5).astype(int)

    # Simple global threshold for oxygen saturation
    df["oxygen_low_flag"] = (df["oxygensaturation"] < 95).astype(int)

    # Age-dependent limits: up to 30 days, up to 365 days, older.
    # A NaN vital sign compares as False and therefore yields flag 0.
    heart_rate_limit = np.where(age <= 30, 180, np.where(age <= 365, 170, 160))
    resp_rate_limit = np.where(age <= 30, 60, np.where(age <= 365, 50, 40))
    df["tachycardia_flag"] = (df["heartratebpm"] > heart_rate_limit).astype(int)
    df["tachypnea_flag"] = (df["respiratoryratebpm"] > resp_rate_limit).astype(int)

    # -----------------------------
    # 5. Data discretization: age groups
    # -----------------------------
    # np.select takes the first matching condition, like an if/elif chain.
    df["age_group"] = np.select(
        [(age <= 28).to_numpy(), (age <= 365).to_numpy(), (age <= 3 * 365).to_numpy()],
        ["Neonate", "Infant", "Toddler"],
        default="Preschool",
    )

    # -----------------------------
    # 6. Vaccine-derived features
    # -----------------------------
    # Schedule milestones, ignoring pre-birth entries (NaN days drop out too)
    schedule = vacc[vacc["In_days"] >= 0][["In_days", "Vaccine"]].copy()

    # Number of milestones with In_days <= age: on a sorted array that is the
    # right-hand insertion position of the age.
    milestone_days = np.sort(schedule["In_days"].to_numpy())
    due = np.searchsorted(milestone_days, age.to_numpy(), side="right")

    status = df["immunizationsdone"]
    is_done = (status == "Yes").to_numpy()
    is_not_done = (status == "No").to_numpy()
    # Any status other than "Yes"/"No" (e.g. "Partial" or missing) counts half
    # of the due vaccines as past due; np.rint rounds halves to the nearest
    # even integer, the same rule as Python's built-in round().
    half_of_due = np.rint(due * 0.5).astype(due.dtype)

    df["due_vaccines_count"] = due
    df["past_due_vaccines_count"] = np.select(
        [is_done, is_not_done],
        [np.zeros_like(due), due],
        default=half_of_due,
    )

    # MR-specific features: "due" from the milestone day onwards, "overdue"
    # strictly after it unless the immunization status is "Yes".
    not_done = status != "Yes"
    df["mr1_due"] = (age >= MR1_DAY).astype(int)
    df["mr2_due"] = (age >= MR2_DAY).astype(int)
    df["mr1_overdue"] = ((age > MR1_DAY) & not_done).astype(int)
    df["mr2_overdue"] = ((age > MR2_DAY) & not_done).astype(int)

    return df


# ============================
# Run preprocessing
# ============================
NEWBORN_CSV = "newborn_health_monitoring_100k_rows.csv"
VACCINE_SCHEDULE_CSV = "Rwanda_Vaccine_Schedule.csv"

# Build the cleaned, feature-enriched table that both models start from.
processed_df = process_newborn_data(
    newborn_path=NEWBORN_CSV,
    vaccine_schedule_path=VACCINE_SCHEDULE_CSV,
)

# Quick look at the result: first rows, then the column index.
for preview in (processed_df.head(), processed_df.columns):
    print(preview)

# ============================
# 1. Define features and target
# ============================
# Encode the target first and keep only rows whose label is one of the three
# known classes. A label outside this mapping would otherwise turn into NaN
# inside y_health and break the stratified split below.
risk_level_codes = {"Low": 0, "Medium": 1, "High": 2}
encoded_risk = processed_df["risklevel"].map(risk_level_codes)
has_known_label = encoded_risk.notna()

n_unusable = int((~has_known_label).sum())
if n_unusable:
    print(f"Dropping {n_unusable} rows with a missing or unrecognised risklevel")

df = processed_df.loc[has_known_label].copy()
y_health = encoded_risk.loc[has_known_label].astype(int)

numeric_features = [
    "gestationalageweeks","agedays","age_months",
    "birthweightkg","birthlengthcm","birthheadcircumferencecm",
    "weightkg","lengthcm","headcircumferencecm","bmi_like",
    "temperature","heartratebpm","respiratoryratebpm","oxygensaturation",
    "feedingfrequencyperday","urineoutputcount","stoolcount",
    "jaundicelevelmgdl","apgarscore",
    "due_vaccines_count","past_due_vaccines_count"
]

# 0/1 indicator columns created by process_newborn_data()
binary_features = [
    "low_birth_weight","premature",
    "fever_flag","tachycardia_flag","tachypnea_flag","oxygen_low_flag",
    "mr1_due","mr1_overdue","mr2_due","mr2_overdue"
]

categorical_features = [
    "gender","feedingtype","reflexesnormal","immunizationsdone","age_group"
]

feature_cols = numeric_features + binary_features + categorical_features
X_health = df[feature_cols]

# ============================
# 2. Train/validation split
# ============================
# 80/20 split, stratified so each risk class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_health, y_health, test_size=0.2, random_state=42, stratify=y_health
)

# ============================
# 3. Preprocessing
# ============================
from sklearn.impute import SimpleImputer

# process_newborn_data() turns unparseable numbers into NaN and only drops
# rows on its critical columns, so the remaining numeric features can still
# contain NaN. StandardScaler passes NaN through, which would then make SMOTE
# and the estimators fail; fill the gaps with the training-set median first.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# The 0/1 flag columns go through the numeric branch together with the
# continuous features.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features + binary_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessor on the training part only, then oversample that part.
# The test part is transformed but never resampled.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# NOTE(review): SMOTE runs on the one-hot encoded matrix here, so synthetic
# rows may get fractional values in the category columns — confirm that this
# is acceptable for these models.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

print("Class counts before SMOTE:", np.bincount(y_train))
print("Class counts after SMOTE:", np.bincount(y_train_balanced))

# ============================
# 4. Define the 5 candidate models
# ============================
# Settings shared by several candidates. KNN and gradient boosting are built
# without class_weight.
_health_balanced = {"class_weight": "balanced"}
_health_seed = {"random_state": 42}

models = {
    "log_reg": LogisticRegression(max_iter=1000, n_jobs=-1, **_health_balanced),
    "knn": KNeighborsClassifier(n_neighbors=7),
    "decision_tree": DecisionTreeClassifier(
        max_depth=None, **_health_seed, **_health_balanced
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=200, max_depth=None, n_jobs=-1, **_health_seed, **_health_balanced
    ),
    "gbm": GradientBoostingClassifier(**_health_seed),
}

# ============================
# 5. Train and evaluate models
# ============================
# Every candidate is fitted on the SMOTE-balanced training matrix and scored
# on the untouched test matrix; the highest macro F1 wins (first one on ties).
results = {}
best_model_name, best_macro_f1, best_clf = None, -1.0, None

for candidate_name, candidate in models.items():
    print(f"\n=== Training {candidate_name} ===")
    candidate.fit(X_train_balanced, y_train_balanced)
    test_predictions = candidate.predict(X_test_processed)

    print("Classification report:")
    print(classification_report(y_test, test_predictions, digits=3))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, test_predictions))

    candidate_f1 = f1_score(y_test, test_predictions, average="macro")
    results[candidate_name] = candidate_f1
    print(f"Macro F1: {candidate_f1:.3f}")

    # Only a strictly better score replaces the current leader.
    if not candidate_f1 > best_macro_f1:
        continue
    best_model_name, best_macro_f1, best_clf = candidate_name, candidate_f1, candidate

print("\nModel macro F1 scores:", results)
print("Best model:", best_model_name, "with macro F1 =", best_macro_f1)

# ============================
# 6. Save full HealthRiskModel pipeline
# ============================
from sklearn.pipeline import Pipeline as SkPipeline

HEALTH_MODEL_PATH = "/content/drive/MyDrive/HealthRiskModel_best_smote.joblib"

# Bundle the already-fitted preprocessor with the winning classifier so the
# saved object can be applied to raw feature columns in one call.
health_risk_steps = [
    ("preprocess", preprocessor),
    ("clf", best_clf),
]
health_risk_pipeline = SkPipeline(steps=health_risk_steps)

joblib.dump(health_risk_pipeline, HEALTH_MODEL_PATH)
print("Saved best HealthRiskModel to " + HEALTH_MODEL_PATH)
 
################################################################################################
# Model 2: vaccination risk prediction
################################################################################################

# Work on an independent copy so the label column added to df_v below does
# not end up on processed_df.
df_v = processed_df.copy()

def compute_vaccination_risk(row):
    """Return the rule-based vaccination risk class for one record.

    Args:
        row: Mapping-like record providing ``past_due_vaccines_count`` and
            ``immunizationsdone``.

    Returns:
        0 (good) when nothing is past due and the status is "Yes",
        2 (high) when something is past due and the status is "No",
        1 (medium) in every other case.
    """
    overdue = row["past_due_vaccines_count"]

    if overdue == 0:
        # Nothing past due: good only when the status confirms it.
        return 0 if row["immunizationsdone"] == "Yes" else 1

    if overdue > 0 and row["immunizationsdone"] == "No":
        return 2  # High

    return 1  # Medium

# Label every record with its rule-based risk class, then take the label
# column as the integer target.
# NOTE(review): the label is derived from past_due_vaccines_count and
# immunizationsdone, and both are also listed as model inputs below, so the
# classifiers can reconstruct the rule directly — confirm this is intended.
risk_labels = df_v.apply(compute_vaccination_risk, axis=1)
df_v = df_v.assign(vaccination_risk=risk_labels).dropna(subset=["vaccination_risk"])
y_vacc = df_v["vaccination_risk"].astype(int)


numeric_vacc = [
    "agedays",
    "age_months",
    "gestationalageweeks",
    "due_vaccines_count",
    "past_due_vaccines_count",
]

binary_vacc = [
    "premature",
    "low_birth_weight",
    "mr1_due",
    "mr2_due",
    "mr1_overdue",
    "mr2_overdue",
]

categorical_vacc = [
    "gender",
    "age_group",
    "immunizationsdone",
    "feedingtype",
]

# Column order: numeric, then binary, then categorical.
vacc_feature_cols = [*numeric_vacc, *binary_vacc, *categorical_vacc]
X_vacc = df_v.loc[:, vacc_feature_cols]

from sklearn.impute import SimpleImputer

# 80/20 split, stratified on the vaccination risk class.
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(
    X_vacc, y_vacc, test_size=0.2, random_state=42, stratify=y_vacc
)

# gestationalageweeks is coerced to numeric by process_newborn_data() but is
# not one of its critical columns, so it can still be NaN here. Fill such gaps
# with the training-set median before scaling so the estimators do not fail.
num_trans_v = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categories unseen during fit are encoded as all zeros instead of raising.
cat_trans_v = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

# The 0/1 flag columns go through the numeric branch.
preproc_v = ColumnTransformer(
    transformers=[
        ("num", num_trans_v, numeric_vacc + binary_vacc),
        ("cat", cat_trans_v, categorical_vacc),
    ]
)

# Candidate classifiers for the vaccination risk model. All except KNN use
# balanced class weights.
# NOTE(review): scikit-learn documents SVC fit time as growing at least
# quadratically with the number of samples; the input file name suggests about
# 100k rows, so the "svm_rbf" candidate may take very long — confirm.
_vacc_balanced = {"class_weight": "balanced"}
_vacc_seed = {"random_state": 42}

models_v = {
    "log_reg": LogisticRegression(max_iter=1000, n_jobs=-1, **_vacc_balanced),
    "knn": KNeighborsClassifier(n_neighbors=7),
    "decision_tree": DecisionTreeClassifier(
        max_depth=None, **_vacc_seed, **_vacc_balanced
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=200, max_depth=None, n_jobs=-1, **_vacc_seed, **_vacc_balanced
    ),
    "svm_rbf": SVC(kernel="rbf", probability=True, **_vacc_seed, **_vacc_balanced),
}

from sklearn.base import clone

# Fit one preprocessing + classifier pipeline per candidate, score it on the
# held-out part and keep the pipeline with the highest macro F1 (first one on
# ties).
results_v = {}
best_name_v, best_f1_v, best_pipe_v = None, -1.0, None

for name, clf in models_v.items():
    print(f"\n=== Vaccination model: {name} ===")
    # Pipeline does not copy its steps. Give each candidate its own unfitted
    # clone of the preprocessor so that fitting a later candidate cannot refit
    # the transformer inside the pipeline already stored as the best one.
    pipe = Pipeline(steps=[("preprocess", clone(preproc_v)), ("clf", clf)])
    pipe.fit(X_train_v, y_train_v)
    y_pred_v = pipe.predict(X_test_v)

    print(classification_report(y_test_v, y_pred_v, digits=3))
    print(confusion_matrix(y_test_v, y_pred_v))

    mf1 = f1_score(y_test_v, y_pred_v, average="macro")
    results_v[name] = mf1
    print(f"Macro F1: {mf1:.3f}")

    if mf1 > best_f1_v:
        best_f1_v, best_name_v, best_pipe_v = mf1, name, pipe

print("\nVaccination model macro F1:", results_v)
print("Best vaccination model:", best_name_v, "with macro F1 =", best_f1_v)

# Persist the complete fitted pipeline (preprocessing + classifier).
joblib.dump(best_pipe_v, "/content/drive/MyDrive/VaccinationRiskModel_best.joblib")
print("Saved to /content/drive/MyDrive/VaccinationRiskModel_best.joblib")

