In [2]:
# ===================== group10_streamlitApp.py =====================
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import sys, sklearn
import os
import gdown   # <-- Added for Google Drive model download

# ----------------------------------------------------------
# Streamlit Page Setup
# ----------------------------------------------------------
# Page configuration: wide layout with a custom browser-tab title.
st.set_page_config(layout="wide", page_title="Insurance Renewal Predictor")

# ----------------------------------------------------------
# Sidebar info
# ----------------------------------------------------------
# Show the interpreter path and the scikit-learn version in the sidebar,
# one markdown label/value pair per line.
for _label, _value in (
    ("**Python:**", sys.executable),
    ("**scikit-learn version:**", sklearn.__version__),
):
    st.sidebar.write(_label, _value)

# ----------------------------------------------------------
# Download model & preprocessor from Google Drive if not found
# ----------------------------------------------------------
def download_from_drive(file_id, output):
    """Fetch a Google Drive file by ID and store it at ``output``.

    Args:
        file_id: The Google Drive file ID, inserted into the ``uc?id=`` URL.
        output: Destination path handed to ``gdown.download``.

    The transfer itself is delegated to ``gdown.download`` with progress
    output enabled (``quiet=False``).
    """
    drive_url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(url=drive_url, output=output, quiet=False)

# Replace the below IDs with your own Drive file IDs
MODEL_FILE_ID = "1opOSebBGWWbjQ5vC8UFJBsKmsXbzGi9g"        # <-- put your model ID here
PREPROCESSOR_FILE_ID = "13uBo1zebpZNM78xpvoskLIRwTN83kJKV"  # <-- (optional) preprocessor ID

# (label shown in the UI, Drive file ID, local path) for every artifact the app needs.
_REQUIRED_ARTIFACTS = (
    ("model", MODEL_FILE_ID, "rf_model.pkl"),
    ("preprocessor", PREPROCESSOR_FILE_ID, "preprocessor.pkl"),
)

# Download files if missing
for _label, _file_id, _path in _REQUIRED_ARTIFACTS:
    if os.path.exists(_path):
        continue

    with st.spinner(f"Downloading {_label} from Google Drive..."):
        try:
            download_from_drive(_file_id, _path)
        except Exception as e:  # UI boundary: report any download error instead of a traceback
            st.error(f"Failed to download {_label} from Google Drive: {e}")
            st.stop()

    # Only claim success when the file actually landed on disk; a download
    # that returned without raising may still have produced nothing.
    if os.path.exists(_path):
        st.success(f"{_label.capitalize()} downloaded successfully!")
    else:
        st.error(
            f"Failed to download {_label} from Google Drive: "
            f"'{_path}' was not created. Check the file ID and sharing permissions."
        )
        st.stop()

# ----------------------------------------------------------
# Load Model and Preprocessor
# ----------------------------------------------------------
@st.cache_resource
def _load_artifacts_cached():
    """Load the preprocessor and model from disk; the result is cached by Streamlit.

    Errors are deliberately NOT handled here: an exception propagates out of
    the cached function, so nothing is stored and the next rerun tries again.

    Returns:
        A ``(preprocessor, model)`` tuple as returned by ``joblib.load``.
    """
    # NOTE(security): joblib.load unpickles the files, which can execute
    # arbitrary code. These files are downloaded from Google Drive, so only
    # point the app at Drive IDs you control and trust.
    preprocessor = joblib.load("preprocessor.pkl")
    model = joblib.load("rf_model.pkl")
    return preprocessor, model


def load_artifacts():
    """Return ``(preprocessor, model)``, or ``(None, None)`` if loading fails.

    Successful loads are served from the Streamlit resource cache. A failure
    is reported with ``st.error`` and is not cached, so a transient problem
    does not become a permanent ``(None, None)`` result.
    """
    try:
        return _load_artifacts_cached()
    except Exception as e:  # UI boundary: any load error becomes an on-page message
        st.error(f"Error loading model/preprocessor: {e}")
        return None, None


# Keep the cache-clearing hook that the decorated function used to expose.
load_artifacts.clear = _load_artifacts_cached.clear

# Load both artifacts once; halt the script run if either one is unavailable.
preprocessor, model = load_artifacts()
if any(artifact is None for artifact in (preprocessor, model)):
    st.stop()

st.sidebar.success("Model and Preprocessor loaded successfully!")

# ----------------------------------------------------------
# Helper: Derived Feature Computation
# ----------------------------------------------------------
def _series_or_default(df, column, default):
    """Return ``df[column]``, or a Series of ``default`` aligned to ``df.index`` if absent."""
    if column in df.columns:
        return df[column]
    return pd.Series(default, index=df.index)


def _safe_ratio(numerator, denominator):
    """Divide element-wise, yielding NaN instead of +/-inf where the denominator is 0."""
    return numerator / denominator.where(denominator != 0)


def add_derived_features(df):
    """Add the derived feature columns to ``df`` and return it.

    Columns written (``df`` is modified in place and also returned):

    * ``age_in_years``: ``age_in_days / 365`` truncated to int, only when
      ``age_in_days`` is present.
    * ``total_late_payments`` / ``late_payment_score``: plain and weighted
      (1, 2, 3) sums of the three ``Count_*_late`` columns, or 0 when any of
      them is missing.
    * ``premium_to_income``: ``premium / Income`` (defaults 0 and 1 when a
      column is missing).
    * ``late_ratio``: ``late_payment_score / no_of_premiums_paid`` (default
      denominator 1 when the column is missing).
    * ``age_bucket_tree``: string age-band label derived from ``age_in_days``
      (default age 0 when the column is missing).

    A zero denominator produces NaN rather than +/-inf for the two ratios.

    Args:
        df: DataFrame holding whatever base columns are available.

    Returns:
        The same DataFrame with the derived columns added.
    """
    # Always derive from base inputs if they exist
    if "age_in_days" in df.columns:
        df["age_in_years"] = (df["age_in_days"] / 365).astype(int)

    late_columns = [
        "Count_3-6_months_late",
        "Count_6-12_months_late",
        "Count_more_than_12_months_late",
    ]
    if all(c in df.columns for c in late_columns):
        df["total_late_payments"] = (
            df["Count_3-6_months_late"]
            + df["Count_6-12_months_late"]
            + df["Count_more_than_12_months_late"]
        )
        # Longer delays weigh more: 1x, 2x and 3x respectively.
        df["late_payment_score"] = (
            df["Count_3-6_months_late"] * 1
            + df["Count_6-12_months_late"] * 2
            + df["Count_more_than_12_months_late"] * 3
        )
    else:
        df["total_late_payments"] = 0
        df["late_payment_score"] = 0

    # Derived fields for compatibility (even if not in original CSV).
    # Missing columns are replaced by index-aligned Series so every operation
    # below sees a 1-D input instead of a bare scalar.
    df["premium_to_income"] = _safe_ratio(
        _series_or_default(df, "premium", 0),
        _series_or_default(df, "Income", 1),
    )
    df["late_ratio"] = _safe_ratio(
        df["late_payment_score"],
        _series_or_default(df, "no_of_premiums_paid", 1),
    )

    # pd.cut needs a 1-D input; a scalar default would raise
    # "Input array must be 1 dimensional" when age_in_days is missing.
    age_years = _series_or_default(df, "age_in_days", 0) / 365
    df["age_bucket_tree"] = pd.cut(
        age_years,
        bins=[0, 30, 40, 50, 60, 70, 120],
        labels=["<30", "31–40", "41–50", "51–60", "61–70", "70+"],
        include_lowest=True
    ).astype(str)

    return df

# ----------------------------------------------------------
# Main Title
# ----------------------------------------------------------
st.title("Insurance Renewal Prediction App")

# Short introduction rendered as markdown directly under the title.
_intro_markdown = (
    "Predict whether a customer will renew their insurance policy using a trained **Random Forest** model. "
    "You can enter data manually or upload a CSV file for batch predictions."
)
st.markdown(_intro_markdown)

# ----------------------------------------------------------
# Input Mode
# ----------------------------------------------------------
# The sidebar radio decides which of the two sections below is rendered.
_input_mode_choices = ["Manual Entry", "Upload CSV File"]
st.sidebar.header("Select Input Mode")
input_mode = st.sidebar.radio("Choose input method:", _input_mode_choices)

# ----------------------------------------------------------
# Manual Input Section
# ----------------------------------------------------------
def _whatif_slider_spec(feature, current_value):
    """Return ``(upper_bound, start_value, step)`` for a What-If slider starting at 0.0.

    The upper bound is always strictly positive and the start value always
    lies inside ``[0, upper_bound]``, so the slider stays valid when the
    current value is 0, non-finite, or larger than the nominal range.
    """
    if not np.isfinite(current_value) or current_value < 0:
        current_value = 0.0

    if feature in ("perc_premium_paid_by_cash_credit", "application_underwriting_score"):
        upper, step = 100.0, 1.0
    elif feature == "late_ratio":
        # Widen the nominal 0-5 range when the current ratio already exceeds it.
        upper, step = max(5.0, current_value), 0.1
    elif feature in ("premium", "Income"):
        step = 100.0
        upper = current_value * 2 if current_value > 0 else 10 * step
    else:
        step = 1.0
        upper = current_value * 3 if current_value > 0 else 10 * step

    return upper, min(current_value, upper), step


if input_mode == "Manual Entry":
    st.subheader("Enter Customer Details")

    col1, col2 = st.columns(2)

    with col1:
        income = st.number_input("Income", min_value=0.0, step=1000.0, value=84140.0)
        premium = st.number_input("Premium", min_value=0.0, step=100.0, value=3300.0)
        application_underwriting_score = st.number_input(
            "Application Underwriting Score", min_value=0.0, max_value=100.0, step=0.1, value=99.0
        )
        # age_in_years is derived later by add_derived_features.
        age_in_days = st.number_input("Age (in days)", min_value=0, step=1, value=17531)

    with col2:
        no_of_premiums_paid = st.number_input("Number of Premiums Paid", min_value=0, step=1, value=7)
        count_3_6_months_late = st.number_input("Count 3–6 months late", min_value=0, step=1, value=2)
        count_6_12_months_late = st.number_input("Count 6–12 months late", min_value=0, step=1, value=3)
        count_more_than_12_months_late = st.number_input("Count >12 months late", min_value=0, step=1, value=1)

    # --- Additional inputs (payment mix plus the two categorical fields) ---
    st.subheader("Additional Details")
    perc_premium_paid_by_cash_credit = st.number_input(
        "Percentage of Premium Paid by Cash/Credit (%)",
        min_value=0.0, max_value=100.0, step=0.1, value=75.0
    )
    residence_area_type = st.selectbox("Residence Area Type", options=["Urban", "Rural", "Semi-Urban"], index=0)
    sourcing_channel = st.selectbox("Sourcing Channel", options=["A", "B", "C", "D", "E"], index=0)

    # --- Build a single-row DataFrame from the base inputs ---
    input_data = pd.DataFrame({
        "Income": [income],
        "premium": [premium],
        "application_underwriting_score": [application_underwriting_score],
        "no_of_premiums_paid": [no_of_premiums_paid],
        "age_in_days": [age_in_days],
        "Count_3-6_months_late": [count_3_6_months_late],
        "Count_6-12_months_late": [count_6_12_months_late],
        "Count_more_than_12_months_late": [count_more_than_12_months_late],
        "perc_premium_paid_by_cash_credit": [perc_premium_paid_by_cash_credit],
        "residence_area_type": [residence_area_type],
        "sourcing_channel": [sourcing_channel]
    })

    # Add derived features
    input_data = add_derived_features(input_data)

    st.subheader("Entered Data")
    st.dataframe(input_data)

    # --- Predict ---
    if st.button("Predict Renewal"):
        try:
            X_transformed = preprocessor.transform(input_data)
            y_pred = model.predict(X_transformed)
            y_prob = model.predict_proba(X_transformed)[:, 1]

            # Persist the prediction so the What-If section survives reruns
            # triggered by its own widgets.
            st.session_state["last_input"] = input_data.copy()
            st.session_state["last_prob"] = y_prob[0]
            st.session_state["last_pred"] = y_pred[0]

            result = " Customer will **Renew** the policy." if y_pred[0] == 1 else " Customer will **Not Renew** the policy."
            st.success(result)
            st.metric(label="Renewal Probability", value=f"{y_prob[0]*100:.2f}%")
        except Exception as e:
            st.error(f"Prediction failed: {e}")

    # ----------------------------------------------------------
    # What-If Analysis: Sensitivity Simulation
    # ----------------------------------------------------------
    st.markdown("---")
    st.subheader(" What-If Analysis — Explore Feature Impact")

    with st.expander("Try adjusting key features to see impact"):
        if "last_prob" not in st.session_state:
            # Nothing to compare against until a prediction has been made.
            st.warning(" Please run a prediction first using the 'Predict Renewal' button above.")
        else:
            baseline_prob = st.session_state["last_prob"]
            # Work on a copy so the stored baseline input is never modified.
            whatif_data = st.session_state["last_input"].copy()

            feature_to_change = st.selectbox(
                "Select feature to modify:",
                options=[
                    "perc_premium_paid_by_cash_credit",
                    "no_of_premiums_paid",
                    "late_ratio",
                    "premium",
                    "late_payment_score",
                    "application_underwriting_score"
                ],
                index=0
            )

            # The slider starts at the value used for the baseline prediction,
            # so an untouched slider yields a delta of zero.
            current_value = float(whatif_data[feature_to_change].iloc[0])
            upper_bound, start_value, step = _whatif_slider_spec(feature_to_change, current_value)
            new_value = st.slider(
                f"Adjust {feature_to_change}", 0.0, upper_bound, start_value, step=step
            )

            whatif_data[feature_to_change] = new_value

            # Keep dependent features consistent with the definitions in
            # add_derived_features, which produced the baseline input.
            if feature_to_change == "late_payment_score":
                # The score is overridden directly; only its ratio follows.
                whatif_data["late_ratio"] = (
                    whatif_data["late_payment_score"] / whatif_data["no_of_premiums_paid"]
                )
            elif feature_to_change != "late_ratio":
                # A base input changed: rebuild every derived column from it
                # (e.g. premium_to_income after premium, late_ratio after
                # no_of_premiums_paid).
                whatif_data = add_derived_features(whatif_data)
            # A direct late_ratio override is left exactly as set.

            # Re-predict with modified input
            try:
                X_transformed_whatif = preprocessor.transform(whatif_data)
                new_prob = model.predict_proba(X_transformed_whatif)[:, 1][0]
                st.metric(
                    label=f"Renewal Probability after changing '{feature_to_change}'",
                    value=f"{new_prob*100:.2f}%",
                    delta=f"{(new_prob - baseline_prob) * 100:+.2f}%"
                )
            except Exception as e:
                st.warning(f"Could not compute What-If impact: {e}")


# ----------------------------------------------------------
# CSV Upload Mode
# ----------------------------------------------------------
else:
    uploaded_file = st.file_uploader("Upload CSV file for batch predictions", type=["csv"])
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        st.write("### Uploaded Data Preview")
        st.dataframe(df.head())

        # Add derived features
        df = add_derived_features(df)

        if st.button("Predict for All Records"):
            try:
                X_transformed = preprocessor.transform(df)
                preds = model.predict(X_transformed)
                probs = model.predict_proba(X_transformed)[:, 1]

                df["Predicted_Renewal"] = np.where(preds == 1, "Renew", "Not Renew")
                df["Renewal_Probability"] = np.round(probs * 100, 2)

                st.write("### Prediction Results")
                st.dataframe(df.head())

                # Optional: Download button
                st.download_button(
                    label="Download Results as CSV",
                    data=df.to_csv(index=False),
                    file_name="insurance_predictions.csv",
                    mime="text/csv"
                )

            except Exception as e:
                st.error(f"Prediction failed: {e}")


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


2025-10-19 17:12:23.865 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
Downloading...
From (original): https://drive.google.com/uc?id=1RDymPSGEf4VcmxrWj5HHV5F0mPdmb2it
From (redirected): https://drive.google.com/uc?id=1RDymPSGEf4VcmxrWj5HHV5F0mPdmb2it&confirm=t&uuid=53ae9f15-2476-4e39-ad59-f60b5b75af6c
To: /content/rf_model.pkl
100%|██████████| 189M/189M [00:03<00:00, 60.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1A0wY35G3Pqjxc8owRiJGhEhV-DP2zHYI
To: /content/preprocessor.pkl
100%|██████████| 4.82k/4.82k [00:00<00:00, 13.3MB/s]
2025-10-19 17:12:36.724 Session state does not function when running a script without `streamlit run`
