In [26]:
import pandas as pd

# Load the raw EMI dataset (path is relative to the notebook's working directory).
data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")

# Ordered label lists: a label's position in its list is the integer code it is encoded to.
edu_categories = ['High School', 'Graduate', 'Post Graduate', 'Professional']
company_categories = ['Startup', 'Small', 'Mid-size', 'Large Indian', 'MNC']
house_type_categories = ["Rented", "Family", "Own"]
emi_eligibility_categories = ['Not_Eligible', 'High_Risk', 'Eligible']

# Keep only rows whose credit score parses as a number and does not exceed 900.
data['credit_score'] = pd.to_numeric(data['credit_score'], errors='coerce')
data = data[(data['credit_score'] <= 900) & data['credit_score'].notna()].reset_index(drop=True)

# Drop rented households with no recorded rent.
# Must run before `house_type` is replaced by integer codes further down.
data = data[~((data["house_type"] == "Rented") & data["monthly_rent"].isna())].reset_index(drop=True)

# Numeric columns that arrive as dirty text: strip everything except digits and dots,
# keep only the first decimal part, then coerce. Unparseable or blank cells become NaN
# instead of raising (the previous `.astype(float)` failed on empty strings).
for column in ("age", "monthly_salary", "bank_balance"):
    digits_only = (
        data[column].astype(str)
        .str.strip()                                      # remove spaces
        .str.replace(r'[^0-9.]', '', regex=True)          # keep only digits and dots
        .str.replace(r'(\.\d*)\..*', r'\1', regex=True)   # keep only first decimal part
    )
    data[column] = pd.to_numeric(digits_only, errors="coerce").astype(float)

# Two-valued text columns -> 0/1. Labels are matched after trimming and lower-casing;
# any other value (including blanks) maps to NaN.
binary_encodings = {
    "gender": {"male": 0, "m": 0, "female": 1, "f": 1},
    "marital_status": {"single": 0, "married": 1},
    "existing_loans": {"no": 0, "yes": 1},
}
for column, encoding in binary_encodings.items():
    data[column] = data[column].astype(str).str.strip().str.lower().map(encoding)

# One-hot encode the employment type (dummy columns are appended at the end of the frame).
data = pd.get_dummies(data, columns=["employment_type"], dtype="int")

# Ordinal columns -> integer codes following the category lists above.
# Labels not in the list get code -1 from pandas, which is turned into a missing value.
ordinal_encodings = {
    "education": edu_categories,
    "company_type": company_categories,
    "house_type": house_type_categories,
    "emi_eligibility": emi_eligibility_categories,
}
for column, categories in ordinal_encodings.items():
    codes = pd.Categorical(data[column], categories=categories, ordered=True).codes
    # Build the Series on data's own index so the assignment can never misalign rows.
    data[column] = pd.Series(codes, index=data.index).replace({-1: pd.NA})

  data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")


In [5]:
# Preview the first rows of `data` as a quick sanity check of the current frame.
data.head()

Unnamed: 0,age,gender,marital_status,education,monthly_salary,years_of_employment,company_type,house_type,monthly_rent,family_size,...,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi,employment_type_Government,employment_type_Private,employment_type_Self-employed
0,38.0,1,1,3,82600.0,0.9,2,Rented,20000.0,3,...,303200.0,70200.0,Personal Loan EMI,850000.0,15,Not_Eligible,500.0,0,1,0
1,38.0,1,1,1,21500.0,7.0,4,Family,0.0,2,...,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0,0,1,0
2,38.0,0,1,3,86100.0,5.8,0,Own,0.0,4,...,672100.0,324200.0,Education EMI,306000.0,16,Eligible,27775.0,0,1,0
3,58.0,1,1,0,66800.0,2.2,2,Own,0.0,5,...,440900.0,178100.0,Vehicle EMI,304000.0,83,Eligible,16170.0,0,1,0
4,48.0,1,1,3,57300.0,3.4,2,Family,0.0,4,...,97300.0,28200.0,Home Appliances EMI,252000.0,7,Not_Eligible,500.0,0,1,0


In [27]:
# Count missing values per column of `data`.
data.isna().sum()

age                                 0
gender                              0
marital_status                      0
education                        2370
monthly_salary                      0
years_of_employment                 0
company_type                        0
house_type                          0
monthly_rent                     1471
family_size                         0
dependents                          0
school_fees                         0
college_fees                        0
travel_expenses                     0
groceries_utilities                 0
other_monthly_expenses              0
existing_loans                      0
current_emi_amount                  0
credit_score                        0
bank_balance                     2384
emergency_fund                   2321
emi_scenario                        0
requested_amount                    0
requested_tenure                    0
emi_eligibility                     0
max_monthly_emi                     0
employment_t

In [28]:
# Number of rows currently held in `data`.
len(data)

398245

In [29]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the two prediction targets.
target_columns = ["emi_eligibility", "max_monthly_emi"]
x = data.drop(columns=target_columns)
emi_eligibility = data["emi_eligibility"]
max_monthly_emi = data["max_monthly_emi"]

# Both splits share the same settings (80/20, fixed seed).
split_options = {"test_size": 0.2, "random_state": 42}

# classification model train_test_split for emi eligibility
(
    x_train_classification,
    x_test_classification,
    y_train_classification,
    y_test_classification,
) = train_test_split(x, emi_eligibility, **split_options)

# regression model train test split for max monthly emi
(
    x_train_regression,
    x_test_regression,
    y_train_regression,
    y_test_regression,
) = train_test_split(x, max_monthly_emi, **split_options)



In [None]:
"""
preprocess_train_mlflow.py
Usage:
    python preprocess_train_mlflow.py [--data PATH] [--quick] [--outdir OUTDIR]

This script:
 - Loads CSV dataset
 - Cleans data, imputes, adds derived features
 - Runs EDA summary outputs (value counts, missingness, correlations)
 - Builds preprocessing pipeline
 - Trains 3 classification models and 3 regression models
 - Logs experiments to MLflow (local by default)
 - Saves preprocessor and best models to OUTDIR
"""

import argparse, os, sys, json
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, mean_squared_error, mean_absolute_error, r2_score
import joblib
import mlflow, mlflow.sklearn
import warnings
warnings.filterwarnings("ignore")

def load_data(path):
    """Read the CSV at ``path`` (anything ``pd.read_csv`` accepts) into a DataFrame."""
    return pd.read_csv(path)

def basic_clean(df):
    """Trim text columns and coerce the known money/score columns to numbers.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame with:
          * every object/category column converted to stripped strings, while
            missing entries stay missing (they are NOT turned into the text
            "nan"/"None", so ``dropna`` and imputers still see them);
          * every column listed in ``numeric_guess`` that exists in ``df``
            converted with ``pd.to_numeric(errors="coerce")`` (unparseable -> NaN).
    """
    for c in df.select_dtypes(include=["object","category"]).columns:
        present = df[c].notna()
        # astype(str) would stringify missing values; restore them afterwards.
        df[c] = df[c].astype(str).str.strip().where(present, np.nan)
    numeric_guess = ["monthly_salary","current_emi_amount","requested_amount","bank_balance","emergency_fund",
                     "monthly_rent","school_fees","college_fees","travel_expenses","groceries_utilities","other_monthly_expenses","credit_score"]
    for c in numeric_guess:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def add_derived(df):
    """Append ratio features to ``df`` in place and return it.

    Each feature is only created when all of its source columns exist:
      * ``debt_to_income``  = current_emi_amount / monthly_salary
      * ``expense_to_income`` = other_monthly_expenses / monthly_salary
      * ``requested_monthly_no_interest`` = requested_amount / requested_tenure
      * ``affordability_ratio`` = requested_monthly_no_interest / monthly_salary

    A denominator of 0 is replaced by NaN first, so the ratio is NaN rather than inf.
    """
    available = set(df.columns)

    def safe_denominator(column):
        # Zero denominators become NaN to avoid infinite ratios.
        return df[column].replace(0, np.nan)

    if available >= {"monthly_salary", "current_emi_amount"}:
        df["debt_to_income"] = df["current_emi_amount"] / safe_denominator("monthly_salary")

    if available >= {"monthly_salary", "other_monthly_expenses"}:
        df["expense_to_income"] = df["other_monthly_expenses"] / safe_denominator("monthly_salary")

    if available >= {"monthly_salary", "requested_amount", "requested_tenure"}:
        monthly_principal = df["requested_amount"] / safe_denominator("requested_tenure")
        df["requested_monthly_no_interest"] = monthly_principal
        df["affordability_ratio"] = monthly_principal / safe_denominator("monthly_salary")

    return df

def build_preprocessor(df, max_ohe_cardinality=50):
    """Build an (unfitted) ColumnTransformer for the numeric and categorical columns of ``df``.

    Side effect: categorical columns with more than ``max_ohe_cardinality`` distinct
    values are rewritten in ``df`` so that only the most frequent
    ``max_ohe_cardinality`` values are kept and everything else becomes "OTHER".

    Args:
        df: Cleaned DataFrame that still contains the target columns.
        max_ohe_cardinality: Maximum number of distinct values kept per categorical column.

    Returns:
        (preprocessor, numeric_features, cat_features) where
          * numeric_features are the int64/float64 columns except ``max_monthly_emi``
            (median-imputed, then standard-scaled);
          * cat_features are the object/category columns except ``emi_eligibility``
            (filled with "missing", then one-hot encoded as a dense array).
    """
    numeric_features = [
        c for c in df.select_dtypes(include=["int64","float64"]).columns.tolist()
        if c not in ["max_monthly_emi"]
    ]
    cat_features = [
        c for c in df.select_dtypes(include=["object","category"]).columns.tolist()
        if c not in ["emi_eligibility"]
    ]

    # collapse high-cardinality categories to top-K + OTHER
    for c in cat_features:
        if df[c].nunique() > max_ohe_cardinality:
            # A set gives O(1) membership tests for the per-row check below.
            top = set(df[c].value_counts().index[:max_ohe_cardinality].astype(str))
            df[c] = df[c].apply(lambda x, keep=top: x if x in keep else "OTHER").astype(str)

    # scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`;
    # newer releases reject the old name, older ones reject the new one.
    try:
        one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
    cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value="missing")), ("ohe", one_hot)])
    preprocessor = ColumnTransformer([("num", num_pipe, numeric_features), ("cat", cat_pipe, cat_features)], remainder="drop")
    return preprocessor, numeric_features, cat_features

def train_and_evaluate(X_train, X_test, y_train_cl, y_test_cl, y_train_rg, y_test_rg, preprocessor, outdir, quick=False):
    """Fit the preprocessor, train all candidate models, log them to MLflow and save the best ones.

    Args:
        X_train, X_test: Raw (untransformed) feature frames.
        y_train_cl, y_test_cl: Classification target (``emi_eligibility`` labels).
        y_train_rg, y_test_rg: Regression target (``max_monthly_emi``).
        preprocessor: Unfitted transformer; it is fitted on ``X_train`` only.
        outdir: Directory that receives ``preprocessor.joblib``,
            ``best_classifier.joblib`` and ``best_regressor.joblib`` (plus
            ``label_encoder.joblib`` when the best classifier is XGBoost, whose
            predictions are integer class indices).
        quick: Use smaller ensembles for a fast smoke run.

    Returns:
        None. The best classifier (by accuracy) and regressor (by RMSE) are printed.
    """
    # Local import: LabelEncoder is only needed for the XGBoost classifier.
    from sklearn.preprocessing import LabelEncoder

    # Transform (fit on the training split only to avoid leakage)
    preprocessor.fit(X_train)
    X_train_t = preprocessor.transform(X_train)
    X_test_t = preprocessor.transform(X_test)

    cl_models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForest": RandomForestClassifier(n_estimators=100 if not quick else 30, n_jobs=-1, random_state=42),
    }
    rg_models = {
        "LinearRegression": LinearRegression(),
        "RandomForestRegressor": RandomForestRegressor(n_estimators=100 if not quick else 30, random_state=42, n_jobs=-1),
    }

    # XGBoost is optional: use it when it can be imported, otherwise say so and carry on.
    try:
        from xgboost import XGBClassifier, XGBRegressor
    except Exception as exc:  # best effort: a broken install may raise more than ImportError
        print(f"XGBoost unavailable, skipping XGBoost models: {exc}")
    else:
        cl_models["XGBoost"] = XGBClassifier(n_estimators=200 if not quick else 50, eval_metric="mlogloss", tree_method="hist")
        rg_models["XGBoostRegressor"] = XGBRegressor(n_estimators=200 if not quick else 50)

    # XGBClassifier needs integer class indices 0..n-1, not the raw string labels.
    label_encoder = LabelEncoder().fit(y_train_cl)

    # MLflow experiment
    mlflow.set_experiment("EMI_eligibility_and_max_emi")
    best_cl = (None, -1.0)          # (name, accuracy)
    best_rg = (None, float("inf"))  # (name, rmse)

    for name, model in cl_models.items():
        with mlflow.start_run(run_name=f"cl_{name}"):
            mlflow.log_param("model", name)
            if name == "XGBoost":
                model.fit(X_train_t, label_encoder.transform(y_train_cl))
                # Map indices back to labels so accuracy is comparable across models.
                preds = label_encoder.inverse_transform(model.predict(X_test_t))
            else:
                model.fit(X_train_t, y_train_cl)
                preds = model.predict(X_test_t)
            acc = accuracy_score(y_test_cl, preds)
            mlflow.log_metric("accuracy", float(acc))
            # save model artifact
            mlflow.sklearn.log_model(model, artifact_path=f"models/{name}")
            if acc > best_cl[1]:
                best_cl = (name, acc)

    for name, model in rg_models.items():
        with mlflow.start_run(run_name=f"rg_{name}"):
            mlflow.log_param("model", name)
            model.fit(X_train_t, y_train_rg)
            preds = model.predict(X_test_t)
            # sqrt(MSE) instead of mean_squared_error(squared=False), which newer
            # scikit-learn releases no longer accept.
            rmse = float(np.sqrt(mean_squared_error(y_test_rg, preds)))
            mae = mean_absolute_error(y_test_rg, preds)
            r2 = r2_score(y_test_rg, preds)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", float(mae))
            mlflow.log_metric("r2", float(r2))
            mlflow.sklearn.log_model(model, artifact_path=f"models/{name}")
            if rmse < best_rg[1]:
                best_rg = (name, rmse)

    # Save preprocessor & best models locally (models are already fitted on the training split)
    os.makedirs(outdir, exist_ok=True)
    joblib.dump(preprocessor, os.path.join(outdir, "preprocessor.joblib"))
    if best_cl[0] is not None:
        joblib.dump(cl_models[best_cl[0]], os.path.join(outdir, "best_classifier.joblib"))
        if best_cl[0] == "XGBoost":
            joblib.dump(label_encoder, os.path.join(outdir, "label_encoder.joblib"))
    if best_rg[0] is not None:
        joblib.dump(rg_models[best_rg[0]], os.path.join(outdir, "best_regressor.joblib"))

    print("Best classification model:", best_cl)
    print("Best regression model:", best_rg)
    return

def main(args):
    """Run the whole script: load, clean, derive features, split, train and evaluate.

    Args:
        args: Namespace with ``data`` (CSV path), ``outdir`` (artifact directory)
            and ``quick`` (bool; cap the sample at 2000 train / 500 test rows per class).
    """
    df = load_data(args.data)
    df = basic_clean(df)
    df = add_derived(df)
    # drop rows missing targets
    df = df.dropna(subset=["emi_eligibility","max_monthly_emi"])
    print("Data shape after cleaning:", df.shape)

    preprocessor, numeric_features, cat_features = build_preprocessor(df)

    # The stratified 80/20 split is the same in both modes.
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["emi_eligibility"], random_state=42)

    def cap_per_class(frame, limit):
        # Shuffle, then keep at most `limit` rows of each class. Unlike
        # groupby(...).apply(sample), this keeps the `emi_eligibility` column on
        # pandas versions that exclude grouping columns from apply().
        shuffled = frame.sample(frac=1, random_state=42)
        return shuffled.groupby("emi_eligibility", sort=False).head(limit)

    # quick mode: small per-class sample
    if args.quick:
        train_df = cap_per_class(train_df, 2000)
        test_df = cap_per_class(test_df, 500)

    X_train = train_df[numeric_features + cat_features]
    X_test = test_df[numeric_features + cat_features]
    y_train_cl = train_df["emi_eligibility"]
    y_test_cl = test_df["emi_eligibility"]
    y_train_rg = train_df["max_monthly_emi"].astype(float)
    y_test_rg = test_df["max_monthly_emi"].astype(float)

    train_and_evaluate(X_train, X_test, y_train_cl, y_test_cl, y_train_rg, y_test_rg, preprocessor, args.outdir, quick=args.quick)
    print("Done.")

if __name__ == "__main__":
    # Command-line entry point; also reached when this cell is run in a notebook,
    # where __name__ is "__main__" as well.
    parser = argparse.ArgumentParser()
    # Default matches the dataset location used by the other cells of this notebook
    # (was an absolute /mnt/data path).
    parser.add_argument("--data", default="../dataset/emi_prediction_dataset.csv", help="path to CSV")
    parser.add_argument("--outdir", default="./artifacts", help="where to save preprocessor and models")
    parser.add_argument("--quick", action="store_true", help="use small sample for fast run")
    # parse_known_args ignores the extra arguments a Jupyter kernel injects
    # (e.g. "-f <connection file>"), which make parse_args() exit with an error.
    args, _unknown = parser.parse_known_args()
    main(args)


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
import mlflow
import mlflow.sklearn
import warnings

warnings.filterwarnings("ignore")

# ============ Step 1: Data Loading & Preprocessing ============
file_path = "../dataset/emi_prediction_dataset.csv"  # change if needed
data = pd.read_csv(file_path)
print(f"Loaded dataset shape: {data.shape}")

# Drop duplicates & handle missing
data = data.drop_duplicates()
print(f"After cleaning: {data.shape}")

# Ordered label lists: a label's position in its list is the integer code it is encoded to.
edu_categories = ['High School', 'Graduate', 'Post Graduate', 'Professional']
company_categories = ['Startup', 'Small', 'Mid-size', 'Large Indian', 'MNC']
house_type_categories = ["Rented", "Family", "Own"]
emi_eligibility_categories = ['Not_Eligible', 'High_Risk', 'Eligible']

# Keep only rows whose credit score parses as a number and does not exceed 900.
data['credit_score'] = pd.to_numeric(data['credit_score'], errors='coerce')
data = data[(data['credit_score'] <= 900) & data['credit_score'].notna()].reset_index(drop=True)

# Drop rented households with no recorded rent (must precede the house_type encoding).
data = data[~((data["house_type"] == "Rented") & data["monthly_rent"].isna())].reset_index(drop=True)

# Numeric columns stored as dirty text: unparseable or blank cells become NaN
# instead of raising (the previous `.astype(float)` failed on empty strings).
for column in ("age", "monthly_salary", "bank_balance"):
    digits_only = (
        data[column].astype(str)
        .str.strip()                                      # remove spaces
        .str.replace(r'[^0-9.]', '', regex=True)          # keep only digits and dots
        .str.replace(r'(\.\d*)\..*', r'\1', regex=True)   # keep only first decimal part
    )
    data[column] = pd.to_numeric(digits_only, errors="coerce").astype(float)

# Two-valued text columns -> 0/1 (matched after trimming and lower-casing; anything else -> NaN).
binary_encodings = {
    "gender": {"male": 0, "m": 0, "female": 1, "f": 1},
    "marital_status": {"single": 0, "married": 1},
    "existing_loans": {"no": 0, "yes": 1},
}
for column, encoding in binary_encodings.items():
    data[column] = data[column].astype(str).str.strip().str.lower().map(encoding)

# Ordinal columns -> integer codes; labels outside the list (code -1) become missing.
ordinal_encodings = {
    "education": edu_categories,
    "company_type": company_categories,
    "house_type": house_type_categories,
    "emi_eligibility": emi_eligibility_categories,
}
for column, categories in ordinal_encodings.items():
    codes = pd.Categorical(data[column], categories=categories, ordered=True).codes
    data[column] = pd.Series(codes, index=data.index).replace({-1: pd.NA})

# One-hot encode the remaining text columns. `emi_scenario` used to be left as a
# string, which StandardScaler cannot convert to float.
data = pd.get_dummies(data, columns=["employment_type", "emi_scenario"], dtype="int")

# The models below cannot handle missing values, and stratification needs a complete
# target, so keep only complete rows (same policy as load_and_clean_data further down).
data = data.dropna().reset_index(drop=True)
print(f"After encoding and dropping incomplete rows: {data.shape}")

# Split targets
X = data.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)
y_class = data['emi_eligibility']
y_reg = data['max_monthly_emi']

# Split train/test/validation (70 / 15 / 15, stratified on the class label)
X_train, X_temp, y_class_train, y_class_temp, y_reg_train, y_reg_temp = train_test_split(
    X, y_class, y_reg, test_size=0.3, random_state=42, stratify=y_class
)
X_val, X_test, y_class_val, y_class_test, y_reg_val, y_reg_test = train_test_split(
    X_temp, y_class_temp, y_reg_temp, test_size=0.5, random_state=42, stratify=y_class_temp
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Scale numeric features (statistics come from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# ============ Step 2: MLflow Setup ============
mlflow.set_experiment("EMI_Prediction_Experiment")

def log_classification_results(model_name, model, X_train, y_train, X_test, y_test):
    """Fit a classifier, evaluate it on the test split and log everything to MLflow.

    Args:
        model_name: Label used for the MLflow run name and the logged model artifact.
        model: Unfitted estimator with ``fit``/``predict``/``get_params``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.

    Logs the estimator parameters, accuracy and weighted precision/recall/F1, and the
    fitted model, then prints the metrics.
    """
    # The context manager closes the run even if fitting fails or is interrupted;
    # otherwise the next start_run() fails because a run is still active.
    with mlflow.start_run(run_name=f"{model_name}_classification"):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        metrics = {
            "accuracy": accuracy_score(y_test, preds),
            "precision": precision_score(y_test, preds, average='weighted'),
            "recall": recall_score(y_test, preds, average='weighted'),
            "f1_score": f1_score(y_test, preds, average='weighted')
        }
        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, model_name)

    print(f"\n[{model_name} Classification Results]")
    for k,v in metrics.items():
        print(f"{k}: {v:.4f}")

def log_regression_results(model_name, model, X_train, y_train, X_test, y_test):
    """Fit a regressor, evaluate it on the test split and log everything to MLflow.

    Args:
        model_name: Label used for the MLflow run name and the logged model artifact.
        model: Unfitted estimator with ``fit``/``predict``/``get_params``.
        X_train, y_train: Training features and target.
        X_test, y_test: Evaluation features and target.

    Logs RMSE, MAE, R2 and MAPE. MAPE is computed over non-zero targets only (a zero
    target would make it infinite) and is omitted when every target is zero.
    """
    # The context manager closes the run even if fitting fails or is interrupted.
    with mlflow.start_run(run_name=f"{model_name}_regression"):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        metrics = {
            "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
            "MAE": mean_absolute_error(y_test, preds),
            "R2": r2_score(y_test, preds),
        }
        actual = np.asarray(y_test, dtype=float)
        predicted = np.asarray(preds, dtype=float)
        nonzero = actual != 0
        if nonzero.any():
            metrics["MAPE"] = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100

        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, model_name)

    print(f"\n[{model_name} Regression Results]")
    for k,v in metrics.items():
        print(f"{k}: {v:.4f}")

# ============ Step 3: Classification Models ============
print("\n=== Classification Models ===")
classification_candidates = [
    ("LogisticRegression", LogisticRegression(max_iter=1000)),
    ("RandomForestClassifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("XGBoostClassifier", XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)),
]
# Every classifier is trained on the training split and scored on the test split.
for label, estimator in classification_candidates:
    log_classification_results(label, estimator, X_train, y_class_train, X_test, y_class_test)

# ============ Step 4: Regression Models ============
print("\n=== Regression Models ===")
regression_candidates = [
    ("LinearRegression", LinearRegression()),
    ("RandomForestRegressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("XGBoostRegressor", XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)),
]
# Every regressor is trained on the training split and scored on the test split.
for label, estimator in regression_candidates:
    log_regression_results(label, estimator, X_train, y_reg_train, X_test, y_reg_test)


KeyboardInterrupt: 

# complete code

In [None]:
# emi_full_pipeline_allinone.py
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import streamlit as st

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import f1_score, r2_score
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

warnings.filterwarnings("ignore")

# ================== 1️⃣ Data Loading & Cleaning ==================
def load_and_clean_data(file_path):
    """Load the EMI CSV and return a fully numeric, complete-case DataFrame.

    Steps, in order:
      1. read the CSV and drop duplicate rows;
      2. keep rows whose ``credit_score`` is numeric and <= 900;
      3. drop rows with ``house_type == "Rented"`` and no ``monthly_rent``;
      4. parse ``age``, ``monthly_salary`` and ``bank_balance`` from dirty text
         (unparseable or blank cells become NaN rather than raising);
      5. encode ``gender``, ``marital_status`` and ``existing_loans`` as 0/1;
      6. encode ``education``, ``company_type``, ``house_type`` and
         ``emi_eligibility`` as ordinal integer codes (unknown labels -> missing);
      7. one-hot encode ``employment_type`` and ``emi_scenario``;
      8. drop every row that still has a missing value.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The cleaned DataFrame. Its index is not reset after the final ``dropna``.
    """
    # Ordered label lists: position in the list == integer code.
    ordinal_encodings = {
        "education": ['High School', 'Graduate', 'Post Graduate', 'Professional'],
        "company_type": ['Startup', 'Small', 'Mid-size', 'Large Indian', 'MNC'],
        "house_type": ["Rented", "Family", "Own"],
        "emi_eligibility": ['Not_Eligible', 'High_Risk', 'Eligible'],
    }
    # Labels are matched after trimming and lower-casing; anything else maps to NaN.
    binary_encodings = {
        "gender": {"male": 0, "m": 0, "female": 1, "f": 1},
        "marital_status": {"single": 0, "married": 1},
        "existing_loans": {"no": 0, "yes": 1},
    }

    data = pd.read_csv(file_path)
    data = data.drop_duplicates()

    # Row filters (the rent filter needs the raw house_type labels, so it runs before encoding).
    data['credit_score'] = pd.to_numeric(data['credit_score'], errors='coerce')
    data = data[(data['credit_score'] <= 900) & data['credit_score'].notna()].reset_index(drop=True)
    data = data[~((data["house_type"] == "Rented") & data["monthly_rent"].isna())].reset_index(drop=True)

    # Cleaning numeric columns: keep digits and dots, keep only the first decimal part, coerce.
    for column in ("age", "monthly_salary", "bank_balance"):
        digits_only = (data[column].astype(str).str.strip()
                       .str.replace(r'[^0-9.]', '', regex=True)
                       .str.replace(r'(\.\d*)\..*', r'\1', regex=True))
        data[column] = pd.to_numeric(digits_only, errors="coerce").astype(float)

    for column, encoding in binary_encodings.items():
        data[column] = data[column].astype(str).str.strip().str.lower().map(encoding)

    for column, categories in ordinal_encodings.items():
        codes = pd.Categorical(data[column], categories=categories, ordered=True).codes
        # Explicit index keeps the assignment aligned with data's rows.
        data[column] = pd.Series(codes, index=data.index).replace({-1: pd.NA})

    data = pd.get_dummies(data, columns=["employment_type", "emi_scenario"], dtype="int")
    data = data.dropna()
    return data

# ================== 2️⃣ Feature Engineering ==================
def feature_engineering(data):
    """Add three ratio features to ``data`` in place and return it.

    * ``debt_to_income``: current EMI divided by monthly salary.
    * ``expense_to_income``: school + college + travel + groceries/utilities + other
      monthly expenses, divided by monthly salary.
    * ``affordability_ratio``: salary left after current EMI, rent and other monthly
      expenses, divided by the requested amount.
    """
    salary = data['monthly_salary']
    current_emi = data['current_emi_amount']

    data['debt_to_income'] = current_emi / salary

    # Accumulate left to right so the addition order matches the column order below.
    expense_columns = ['college_fees', 'travel_expenses', 'groceries_utilities', 'other_monthly_expenses']
    total_expenses = data['school_fees']
    for column in expense_columns:
        total_expenses = total_expenses + data[column]
    data['expense_to_income'] = total_expenses / salary

    disposable = salary - current_emi - data['monthly_rent'] - data['other_monthly_expenses']
    data['affordability_ratio'] = disposable / data['requested_amount']
    return data

# ================== 3️⃣ Detect Features ==================
def detect_features(data, target_cols=['emi_eligibility','max_monthly_emi']):
    """Return the column names of ``data``, in order, that are not listed in ``target_cols``."""
    feature_columns = []
    for column in data.columns:
        if column in target_cols:
            continue
        feature_columns.append(column)
    return feature_columns

# ================== 4️⃣ Split & Scale ==================
def split_and_scale_data(data):
    """Split ``data`` 70/15/15 into train/validation/test and standard-scale the features.

    Both splits are stratified on ``emi_eligibility`` and use ``random_state=42``.
    The scaler is fitted on the training features only.

    Returns:
        (X_train, X_val, X_test,
         y_class_train, y_class_val, y_class_test,
         y_reg_train, y_reg_val, y_reg_test, scaler)
        where the X_* values are the scaler's transformed outputs.
    """
    features = data.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)
    class_target = data['emi_eligibility']
    reg_target = data['max_monthly_emi']

    # First cut: 70% train, 30% held out.
    first_cut = train_test_split(
        features, class_target, reg_target,
        test_size=0.3, random_state=42, stratify=class_target)
    feat_train, feat_hold, cls_train, cls_hold, reg_train, reg_hold = first_cut

    # Second cut: the held-out part is halved into validation and test.
    second_cut = train_test_split(
        feat_hold, cls_hold, reg_hold,
        test_size=0.5, random_state=42, stratify=cls_hold)
    feat_val, feat_test, cls_val, cls_test, reg_val, reg_test = second_cut

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(feat_train)
    scaled_val = scaler.transform(feat_val)
    scaled_test = scaler.transform(feat_test)

    return (scaled_train, scaled_val, scaled_test,
            cls_train, cls_val, cls_test,
            reg_train, reg_val, reg_test,
            scaler)

# ================== 5️⃣ EDA ==================
def exploratory_data_analysis(data):
    """Render a short EDA report for ``data`` in the Streamlit page.

    Shows the first rows, the transposed ``describe()`` summary, a bar chart of the
    ``emi_eligibility`` value counts and a correlation heatmap.

    Args:
        data: DataFrame to summarise; it must contain an ``emi_eligibility`` column.
    """
    st.subheader("Dataset Overview")
    st.dataframe(data.head())
    st.subheader("Statistical Summary")
    st.dataframe(data.describe().T)
    st.subheader("EMI Eligibility Distribution")
    st.bar_chart(data['emi_eligibility'].value_counts())
    st.subheader("Correlation Heatmap")
    # Correlation only makes sense for numeric data: coerce every column and drop those
    # that hold no numbers at all, so text columns cannot make corr() raise.
    numeric_data = data.apply(pd.to_numeric, errors="coerce").dropna(axis=1, how="all")
    fig, ax = plt.subplots(figsize=(10,8))
    sns.heatmap(numeric_data.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    st.pyplot(fig)
    # Release the figure; Streamlit re-executes the script on every interaction.
    plt.close(fig)

# ================== 6️⃣ MLflow Logging ==================
def log_classification_mlflow(model_name, model, X_train, X_val, y_train, y_val):
    """Train ``model`` inside an MLflow run and return its weighted F1 on the validation split.

    The run logs the estimator parameters, the ``F1_weighted`` metric and the fitted
    model (with a two-row input example and a signature inferred from the validation
    inputs and predictions).
    """
    run_label = f"{model_name}_Classification"
    with mlflow.start_run(run_name=run_label):
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        weighted_f1 = f1_score(y_val, val_predictions, average='weighted')

        mlflow.log_params(model.get_params())
        mlflow.log_metric("F1_weighted", weighted_f1)

        model_signature = infer_signature(X_val, val_predictions)
        mlflow.sklearn.log_model(
            model,
            artifact_path=model_name,
            input_example=X_val[:2],
            signature=model_signature,
        )
    return weighted_f1

def log_regression_mlflow(model_name, model, X_train, X_val, y_train, y_val):
    """Train ``model`` inside an MLflow run and return its R2 on the validation split.

    The run logs the estimator parameters, the ``R2`` metric and the fitted model
    (with a two-row input example and a signature inferred from the validation
    inputs and predictions).
    """
    run_label = f"{model_name}_Regression"
    with mlflow.start_run(run_name=run_label):
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        val_r2 = r2_score(y_val, val_predictions)

        mlflow.log_params(model.get_params())
        mlflow.log_metric("R2", val_r2)

        model_signature = infer_signature(X_val, val_predictions)
        mlflow.sklearn.log_model(
            model,
            artifact_path=model_name,
            input_example=X_val[:2],
            signature=model_signature,
        )
    return val_r2

# ================== 7️⃣ Full Pipeline Wrapper ==================
def run_full_emi_pipeline(file_path):
    """Train, select and persist the best models, then start the Streamlit dashboard.

    Steps: clean the CSV, add ratio features, split and scale, train three classifiers
    (selected by weighted F1 on validation) and three regressors (selected by R2 on
    validation), dump the two winners and the fitted scaler with joblib, and hand the
    artifact paths to the dashboard.

    Args:
        file_path: Path of the training CSV.

    Note:
        Everything here runs each time the function is called, including retraining.
    """
    # Load & clean
    data = load_and_clean_data(file_path)
    data = feature_engineering(data)
    # The ratio features divide by salary / requested amount; a zero there yields inf
    # (or NaN), which the scaler and the linear models reject. Keep finite rows only.
    data = data.replace([np.inf, -np.inf], np.nan).dropna()
    feature_names = detect_features(data)

    # ML Prep
    X_train,X_val,X_test,y_class_train,y_class_val,y_class_test,y_reg_train,y_reg_val,y_reg_test,scaler = split_and_scale_data(data)
    mlflow.set_experiment("EMI_Prediction_Experiment")

    # Classification
    classifiers = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForestClassifier": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoostClassifier": XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
    }
    best_clf, best_clf_score, best_clf_name = None, -1, ""
    for name, model in classifiers.items():
        score = log_classification_mlflow(name, model, X_train, X_val, y_class_train, y_class_val)
        if score > best_clf_score:
            best_clf_score = score
            best_clf = model
            best_clf_name = name

    # Regression
    regressors = {
        "LinearRegression": LinearRegression(),
        "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoostRegressor": XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
    }
    best_reg, best_reg_score, best_reg_name = None, -float('inf'), ""
    for name, model in regressors.items():
        score = log_regression_mlflow(name, model, X_train, X_val, y_reg_train, y_reg_val)
        if score > best_reg_score:
            best_reg_score = score
            best_reg = model
            best_reg_name = name

    # Save models
    clf_model_path = f"best_classifier_{best_clf_name}.pkl"
    reg_model_path = f"best_regressor_{best_reg_name}.pkl"
    joblib.dump(best_clf, clf_model_path)
    joblib.dump(best_reg, reg_model_path)
    # The models were trained on scaled features, so the scaler must be available at
    # prediction time as well.
    scaler_path = "feature_scaler.pkl"
    joblib.dump(scaler, scaler_path)

    # Launch Streamlit
    run_emi_streamlit_app_interactive(clf_model_path, reg_model_path, feature_names, scaler_path=scaler_path)

# ================== 8️⃣ Interactive Streamlit ==================
def run_emi_streamlit_app_interactive(classifier_model_path, regressor_model_path, feature_names, scaler_path=None):
    """Streamlit dashboard: upload a CSV, show EDA and predict eligibility and max EMI.

    The uploaded file goes through the same cleaning and feature engineering as the
    training data, is aligned to ``feature_names`` and, when a scaler is supplied, is
    scaled before prediction.

    Args:
        classifier_model_path: joblib file of the fitted classifier.
        regressor_model_path: joblib file of the fitted regressor.
        feature_names: Ordered feature columns the models were trained on.
        scaler_path: Optional joblib file of the fitted scaler; when None the features
            are passed to the models unscaled.
    """
    st.set_page_config(page_title="EMI Prediction Dashboard", layout="wide")
    # NOTE(review): joblib.load unpickles the files — only load artifacts produced by
    # this pipeline, never files from an untrusted source.
    clf_model = joblib.load(classifier_model_path)
    reg_model = joblib.load(regressor_model_path)
    scaler = joblib.load(scaler_path) if scaler_path else None
    st.title("EMI Prediction Platform")

    uploaded_file = st.file_uploader("Upload EMI dataset CSV", type="csv")
    if uploaded_file:
        # Same preparation as training: raw text columns must be encoded before the
        # models (which only know `feature_names`) can consume them.
        data = load_and_clean_data(uploaded_file)
        data = feature_engineering(data)
        data = data.replace([np.inf, -np.inf], np.nan).dropna()
        exploratory_data_analysis(data)
        st.subheader("Predictions")
        # A dummy column absent from this upload (category not present) is filled with 0.
        features = data.reindex(columns=feature_names, fill_value=0)
        model_input = scaler.transform(features) if scaler is not None else features
        preds_class = clf_model.predict(model_input)
        preds_reg = reg_model.predict(model_input)
        data["EMI_Eligibility_Pred"] = preds_class
        data["Max_EMI_Pred"] = preds_reg
        st.dataframe(data[['EMI_Eligibility_Pred','Max_EMI_Pred']].head())
        st.success("Predictions completed successfully!")

# ================== 9️⃣ Main ==================
if __name__ == "__main__":
    # Script entry point: runs the whole pipeline on the CSV below.
    # The path is relative to the current working directory.
    file_path = "emi_prediction_dataset.csv"  # change to your dataset path
    run_full_emi_pipeline(file_path)


In [None]:
streamlit run emi_full_pipeline_allinone.py