In [2]:
# ==========================================================
# pipeline_model.py — for Streamlit deployment
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ==========================================================
# 1. Load Data
# ==========================================================
# The training CSV is read from a Google Drive path, so the drive is mounted
# under /content/drive before pandas touches it.
from google.colab import drive

# Name of the label column; later sections separate it from the features.
TARGET_COL = "renewal"

drive.mount('/content/drive')

df = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/train_ZoGVYWq.csv"
)

# Quick look at what was loaded: dimensions, then the first rows.
# NOTE(review): `display` is not imported in this file — presumably supplied
# by the notebook environment; confirm before running as a plain script.
print(f"Data shape: {df.shape}")
display(df.head())

# ==========================================================
# 2. Feature Engineering
# ==========================================================
print("Applying feature engineering...")

# Age expressed in whole years (day count divided by 365, cast to int).
df["age_in_years"] = (df["age_in_days"] / 365).astype(int)

# Short aliases for the three lateness counters, ordered by delay length.
late_3_6 = df["Count_3-6_months_late"]
late_6_12 = df["Count_6-12_months_late"]
late_12_plus = df["Count_more_than_12_months_late"]

# Unweighted number of late payments across all three windows.
df["total_late_payments"] = late_3_6 + late_6_12 + late_12_plus

# Weighted variant: the longer the delay window, the larger the weight.
df["late_payment_score"] = late_3_6 * 1 + late_6_12 * 2 + late_12_plus * 3

# Ratio features. The denominators are offset by one so that a value of
# zero (income, premiums paid) does not cause a division by zero.
income_plus_one = df["Income"] + 1
premiums_paid_plus_one = df["no_of_premiums_paid"] + 1
df["premium_to_income"] = df["premium"] / income_plus_one
df["late_ratio"] = df["total_late_payments"] / premiums_paid_plus_one

# The row identifier is not a feature; drop it when the column exists.
df = df.drop(columns=["id"], errors="ignore")

# ==========================================================
# 3. Split Features / Target
# ==========================================================
# Label vector first, then every remaining column as the feature frame.
y = df[TARGET_COL]
X = df.drop(labels=[TARGET_COL], axis="columns")

# ==========================================================
# 4. Decision-tree-based Age Buckets
# ==========================================================
# A small tree trained on age alone proposes the bucket boundaries; capping
# it at five leaves caps the number of split points as well.
# NOTE(review): the tree is fitted on the whole dataset and its target,
# before the cross-validation and train/test split below — confirm that
# this is acceptable for the reported metrics.
tree = DecisionTreeClassifier(max_leaf_nodes=5, random_state=42)
tree.fit(df[["age_in_years"]], y)

# Only strictly positive entries of the threshold array are kept as cut
# points (presumably non-split nodes carry a negative placeholder there —
# verify against the scikit-learn tree internals).
node_thresholds = tree.tree_.threshold
thresholds = sorted(node_thresholds[node_thresholds > 0])

# Bin edges: observed minimum age, the tree's cut points, observed maximum.
youngest = df["age_in_years"].min()
oldest = df["age_in_years"].max()
bins = [youngest] + thresholds + [oldest]

# One label per interval: "lo-hi" for every bin except the last, which is
# open-ended and written as "lo+". Edges are truncated to ints for display.
last_bin = len(bins) - 2
labels = [
    f"{int(lo)}+" if idx == last_bin else f"{int(lo)}-{int(hi)}"
    for idx, (lo, hi) in enumerate(zip(bins[:-1], bins[1:]))
]

# include_lowest=True makes the first interval closed on the left so the
# minimum age falls inside a bucket.
age_bucket = pd.cut(
    df["age_in_years"], bins=bins, labels=labels, include_lowest=True
)
df["age_bucket_tree"] = age_bucket


# Recompute X, y so the feature frame reflects every column now in df.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Identify column types
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
print(f"Numeric columns: {len(num_cols)}, Categorical columns: {len(cat_cols)}")

# A column whose dtype is neither numeric nor object (for example the pandas
# `category` dtype produced by `pd.cut`) lands in neither list above. Any
# transformer built only from num_cols / cat_cols then never sees it, and
# nothing would say so. Report such columns so the omission is a visible,
# deliberate decision rather than a silent one.
# To actually use one of them as a feature, add its dtype to the `cat_cols`
# selector — keeping in mind that this changes the set of input columns the
# saved preprocessor expects at prediction time.
selected_cols = set(num_cols) | set(cat_cols)
unused_cols = [col for col in X.columns if col not in selected_cols]
if unused_cols:
    unused_dtypes = X[unused_cols].dtypes.astype(str).to_dict()
    print(f"WARNING: columns present in X but NOT used as model features: {unused_dtypes}")

# ==========================================================
# 5. Preprocessing Pipelines
# ==========================================================
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fitting are ignored instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list to its branch. The step names ("num", "cat",
# "encoder") are looked up again by the feature-importance section.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# ==========================================================
# 6. 5-Fold Stratified Cross Validation (SMOTE inside pipeline)
# ==========================================================
print("Performing 5-Fold Stratified Cross-Validation for ROC-AUC...")

# Stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Forest used for the cross-validated estimate.
rf_cv = RandomForestClassifier(
    n_estimators=800,
    max_depth=30,
    min_samples_leaf=3,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

# Preprocessing and SMOTE are steps of the (imblearn) pipeline, so they are
# part of the estimator that gets fitted for each fold.
cv_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", rf_cv),
])

auc_scores = cross_val_score(
    estimator=cv_pipeline,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=cv,
)

# Summary first, then the individual fold values.
print(f"Mean CV ROC-AUC : {auc_scores.mean():.4f}")
print(f"Std Dev ROC-AUC : {auc_scores.std():.4f}")
print(f"All Fold Scores : {auc_scores}")

# ==========================================================
# 7. Train/Test Split
# ==========================================================
# 70/30 hold-out split, stratified on the target and seeded so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# ==========================================================
# 8. Apply Preprocessing + SMOTE + Train Final Model
# ==========================================================
# Final forest; same hyper-parameters as the cross-validated one.
rf_model = RandomForestClassifier(
    n_estimators=800,
    max_depth=30,
    min_samples_leaf=3,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

print("Fitting preprocessor and applying SMOTE...")
# The preprocessor is fitted on the training rows only; the test rows are
# merely transformed with the fitted state.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Oversampling is applied to the training matrix only — the test matrix is
# left as it is.
sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X_train_prep, y_train)

print("Training final Random Forest...")
rf_model.fit(X_bal, y_bal)

# ==========================================================
# 9. Evaluate Final Model
# ==========================================================
print("Evaluating model...")

# Hard labels for the threshold-based metrics; the second probability
# column is kept as the score for ROC-AUC.
y_pred = rf_model.predict(X_test_prep)
class_probabilities = rf_model.predict_proba(X_test_prep)
y_proba = class_probabilities[:, 1]

# The four label-based metrics share one call signature, so compute them
# in a single pass.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the probability score, not the hard labels.
auc = roc_auc_score(y_test, y_proba)

print("\n=== Model Metrics ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {auc:.4f}")

# ==========================================================
#  Optional: Save Predictions with Clean Encoding
# ==========================================================
# Test-set features plus three extra columns: the true label, a readable
# predicted label, and the predicted probability as a percentage rounded
# to two decimals. `assign` returns a new frame, so X_test is not modified.
output_df = X_test.assign(
    Actual=y_test.values,
    Predicted_Renewal=np.where(y_pred == 1, "Renew", "Not Renew"),
    Renewal_Probability=(y_proba * 100).round(2),
)

# Written as UTF-8 with a BOM ("utf-8-sig") so Excel does not garble the
# file when opening it.
output_df.to_csv("predictions.csv", encoding="utf-8-sig", index=False)
print("\nPredictions saved successfully as 'predictions.csv'")


# ==========================================================
# 10. Save Preprocessor and Model
# ==========================================================
# Two separate artifacts: the fitted preprocessor and the fitted forest.
# Only the forest is written with compression (level 3).
joblib.dump(value=preprocessor, filename="preprocessor.pkl")
joblib.dump(value=rf_model, filename="rf_model.pkl", compress=3)

print("\n Saved 'preprocessor.pkl' and 'rf_model.pkl' successfully!")

# ==========================================================
# 11. Feature Importance (optional)
# ==========================================================
# Rebuild the model's column names: numeric columns first, then the one-hot
# expanded categorical columns, following the order of the transformers in
# the preprocessor.
ohe = preprocessor.named_transformers_["cat"]["encoder"]
encoded_cat_cols = ohe.get_feature_names_out(cat_cols).tolist()
all_feature_names = num_cols + encoded_cat_cols

importances = rf_model.feature_importances_

# One row per model input column, most important first.
feat_imp = pd.DataFrame(
    {"Feature": all_feature_names, "Importance": importances}
)
feat_imp = feat_imp.sort_values(by="Importance", ascending=False)

feat_imp.to_csv("feature_importances.csv", index=False)
print("\nTop 10 Important Features:")
print(feat_imp.head(10))


Mounted at /content/drive
Data shape: (79853, 13)


Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,renewal
0,110936,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,3300,1
1,41492,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,18000,1
2,31300,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,3300,0
3,19415,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,9600,1
4,99379,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,9600,1


Applying feature engineering...
Numeric columns: 14, Categorical columns: 2
Performing 5-Fold Stratified Cross-Validation for ROC-AUC...
Mean CV ROC-AUC : 0.8195
Std Dev ROC-AUC : 0.0081
All Fold Scores : [0.81674845 0.81021315 0.81250598 0.83101484 0.82677461]
Fitting preprocessor and applying SMOTE...
Training final Random Forest...
Evaluating model...

=== Model Metrics ===
Accuracy : 0.9168
Precision: 0.9556
Recall   : 0.9556
F1-score : 0.9556
ROC-AUC  : 0.8196

Predictions saved successfully as 'predictions.csv'

 Saved 'preprocessor.pkl' and 'rf_model.pkl' successfully!

Top 10 Important Features:
                             Feature  Importance
0   perc_premium_paid_by_cash_credit    0.141211
7                no_of_premiums_paid    0.092148
13                        late_ratio    0.089000
8                            premium    0.088964
11                late_payment_score    0.070690
10               total_late_payments    0.067730
1                        age_in_days    0.0669