In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn

In [9]:
df = pd.read_csv(r'C:\Users\hameedbf\Documents\Python\emi_prediction\cleaned_emi_dataset.csv')
# df = pd.read_csv(r'C:\emi_prediction\cleaned_emi_dataset.csv')
df

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan EMI,850000.0,15,Not_Eligible,500.0
1,38,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0
2,38,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education EMI,306000.0,16,Eligible,27775.0
3,58,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle EMI,304000.0,83,Eligible,16170.0
4,48,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances EMI,252000.0,7,Not_Eligible,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392894,27,Male,Married,Graduate,32400.0,Private,5.0,Large Indian,Rented,10400.0,...,No,0.0,649.0,62000.0,32600.0,Personal Loan EMI,506000.0,47,Not_Eligible,500.0
392895,38,Male,Married,Post Graduate,49200.0,Private,1.9,MNC,Own,0.0,...,No,0.0,712.0,142200.0,38100.0,Personal Loan EMI,708000.0,33,Not_Eligible,5200.0
392896,32,Male,Single,Graduate,25700.0,Private,3.2,MNC,Rented,6300.0,...,No,0.0,676.0,191600.0,39700.0,Home Appliances EMI,93000.0,21,High_Risk,5665.0
392897,48,Male,Married,Graduate,47200.0,Private,3.0,MNC,Own,0.0,...,No,0.0,784.0,170400.0,45600.0,Home Appliances EMI,144000.0,36,Eligible,14460.0


In [11]:
# Encode object columns
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("✅ Encoded columns:", list(label_encoders.keys()))

✅ Encoded columns: ['gender', 'marital_status', 'education', 'employment_type', 'company_type', 'house_type', 'existing_loans', 'emi_scenario', 'emi_eligibility']


In [5]:
# Linear Regression Model
# --- Features and target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)  # drop both targets
y = df['max_monthly_emi']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Set MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="Linear_Regression_Model"):

    # Initialize and train Linear Regression
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Predict
    y_pred = lr.predict(X_test)

    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Log parameters and metrics
    mlflow.log_param("model_type", "Linear Regression")
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)

    # Log model
    mlflow.sklearn.log_model(lr, "linear_regression_model")

    # Display metrics
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})
print("\nSample Predictions:")
print(comparison.head(10))

2025/10/20 10:56:24 INFO mlflow.tracking.fluent: Experiment with name 'EMI_Prediction' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



MSE   : 17285507.58
RMSE  : 4157.58
R²    : 0.710
MAPE  : 1.932

Sample Predictions:
    Actual     Predicted
0    500.0   1411.427191
1  37422.0  24941.917461
2   4080.0   7418.362991
3   2485.0   3622.169202
4  22500.0  19484.301376
5    500.0   5669.376359
6   7459.2   9798.986564
7    500.0    808.428989
8    500.0   3250.681760
9   2240.0   2571.559795


In [7]:
#Linear Regression with HyperParameter Tuning
# --- Features and target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)
y = df['max_monthly_emi']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Choose model ---
# model = Ridge(alpha=1.0)  # L2 regularization
model = Lasso(alpha=0.001)  # L1 regularization

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="Linear_Regression_L1_L2"):

    # Train model
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Log hyperparameters
    mlflow.log_param("model_type", f"{type(model).__name__}")
    mlflow.log_param("alpha", model.alpha)

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)

    # Log model
    mlflow.sklearn.log_model(model, "linear_regression_regularized_model")

    # Print results
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Predictions:")
print(comparison.head(10))



MSE   : 17285507.30
RMSE  : 4157.58
R²    : 0.710
MAPE  : 1.932

Sample Predictions:
    Actual     Predicted
0    500.0   1411.425701
1  37422.0  24941.906784
2   4080.0   7418.362345
3   2485.0   3622.165669
4  22500.0  19484.300840
5    500.0   5669.379987
6   7459.2   9798.985228
7    500.0    808.429507
8    500.0   3250.676502
9   2240.0   2571.562059


In [8]:
# Random Forest Regressor Model
# --- Features and target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)
y = df['max_monthly_emi']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="RandomForest_Regression"):

    # Initialize model with default parameters
    rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
    
    # Train model
    rf_model.fit(X_train, y_train)
    
    # Predict
    y_pred = rf_model.predict(X_test)
    
    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    # Log hyperparameters
    mlflow.log_param("model_type", "RandomForestRegressor")
    
    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)
    
    # Log model
    mlflow.sklearn.log_model(rf_model, "random_forest_model")
    
    # Print results
    print("Random Forest Results:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Predictions:")
print(comparison.head(10))



Random Forest Results:
MSE   : 1805278.49
RMSE  : 1343.61
R²    : 0.970
MAPE  : 0.226

Sample Predictions:
    Actual   Predicted
0    500.0    502.0560
1  37422.0  39660.3500
2   4080.0   5815.2660
3   2485.0   2062.0184
4  22500.0  22965.2300
5    500.0    512.4240
6   7459.2   8339.1644
7    500.0    510.1800
8    500.0    513.0000
9   2240.0   2308.5720


In [10]:
# XGBoost Regressor Model
# --- Features and target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)
y = df['max_monthly_emi']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

with mlflow.start_run(run_name="XGBoost_Regression"):

    # Initialize XGBoost Regressor with default parameters
    xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
    
    # Train model
    xgb_model.fit(X_train, y_train)
    
    # Predict
    y_pred = xgb_model.predict(X_test)
    
    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    # Log parameters
    mlflow.log_param("model_type", "XGBoostRegressor")
    
    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)
    
    # Log the model
    mlflow.sklearn.log_model(xgb_model, "xgboost_model")
    
    # Print results
    print("XGBoost Regressor Results:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Predictions:")
print(comparison.head(10))



XGBoost Regressor Results:
MSE   : 1305867.70
RMSE  : 1142.75
R²    : 0.978
MAPE  : 0.449

Sample Predictions:
    Actual     Predicted
0    500.0    340.452911
1  37422.0  38931.449219
2   4080.0   4368.336426
3   2485.0   3486.114746
4  22500.0  23246.728516
5    500.0   1884.916382
6   7459.2   7575.224609
7    500.0    536.273987
8    500.0   -142.545471
9   2240.0   2212.819092


In [11]:
# Random Forest Regressor with HyperParameter Tuning
# --- Features and target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)
y = df['max_monthly_emi']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction")

# --- Best hyperparameters ---
best_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1
}

with mlflow.start_run(run_name="RandomForest_Tuning"):

    # Initialize Random Forest with best hyperparameters
    rf_best = RandomForestRegressor(random_state=42, **best_params, n_jobs=-1)
    
    # Train model
    rf_best.fit(X_train, y_train)
    
    # Predict
    y_pred = rf_best.predict(X_test)
    
    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    # Log parameters
    mlflow.log_params(best_params)
    mlflow.log_param("model_type", "RandomForestRegressor_Tuning")
    
    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape", mape)
    
    # Log the model
    mlflow.sklearn.log_model(rf_best, "random_forest_tuning_model")
    
    # Print results
    print("Random Forest Regressor with Best Hyperparameters:")
    print(f"MSE   : {mse:.2f}")
    print(f"RMSE  : {rmse:.2f}")
    print(f"R²    : {r2:.3f}")
    print(f"MAPE  : {mape:.3f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print("\nSample Predictions:")
print(comparison.head(10))



Random Forest Regressor with Best Hyperparameters:
MSE   : 1779669.13
RMSE  : 1334.04
R²    : 0.970
MAPE  : 0.225

Sample Predictions:
    Actual   Predicted
0    500.0    504.5974
1  37422.0  39619.0220
2   4080.0   5755.2480
3   2485.0   1914.4511
4  22500.0  23071.4950
5    500.0    506.2120
6   7459.2   8416.6152
7    500.0    512.4800
8    500.0    506.5000
9   2240.0   2329.4796


In [9]:
#Logistic Regression Model
# --- Features and target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)  # Features
y = df['emi_eligibility']                                     # Classification target

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Set new MLflow experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

with mlflow.start_run(run_name="LogisticRegression_Model"):

    # Initialize and train Logistic Regression
    log_clf = LogisticRegression(
        solver='saga', 
        penalty='l2', 
        class_weight='balanced', 
        max_iter=1000
    )
    log_clf.fit(X_train, y_train)

    # Predict
    y_pred = log_clf.predict(X_test)

    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # Log parameters and metrics
    mlflow.log_param("model_type", "Logistic Regression Model")
    mlflow.log_param("solver", "saga")
    mlflow.log_param("penalty", "l2")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(log_clf, "logistic_regression_model")

    # Print metrics
    print("Logistic Regression Metrics:")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)



Logistic Regression Metrics:
Accuracy : 0.7388
Precision: 0.8808
Recall   : 0.7388
F1-score : 0.7937

Confusion Matrix:
[[10361  3200   722]
 [ 1196  1693   565]
 [ 3762 11079 46002]]


In [12]:
# Logistic Regression with SMOTE
# --- Features & Target ---
X = df.drop(['max_monthly_emi', 'emi_eligibility'], axis=1)  # Drop regression target
y = df['emi_eligibility']  # Classification target (already label encoded)

# --- Train/Test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Apply SMOTE to balance classes ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# --- MLflow experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

with mlflow.start_run(run_name="LogisticRegression_with_SMOTE"):

    # --- Train Logistic Regression ---
    log_clf = LogisticRegression(
        solver='saga',
        penalty='l2',
        C=1.0,
        max_iter=1000,
        class_weight='balanced'
    )
    log_clf.fit(X_train_res, y_train_res)

    # --- Predict ---
    y_pred = log_clf.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log parameters and metrics ---
    mlflow.log_param("model_type", "Logistic Regression")
    mlflow.log_param("solver", "saga")
    mlflow.log_param("penalty", "l2")
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log model ---
    mlflow.sklearn.log_model(log_clf, "logistic_regression_smote_model")

    # --- Print metrics ---
    print("Logistic Regression Metrics (with SMOTE):")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

[WinError 2] The system cannot find the file specified
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\admin\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

Logistic Regression Metrics (with SMOTE):
Accuracy : 0.7435
Precision: 0.8808
Recall   : 0.7435
F1-score : 0.7981

Confusion Matrix:
[[ 9990  3481   978]
 [ 1122  1664   606]
 [ 3161 10810 46768]]


In [6]:
#Random Forest Classifier with SMOTE and Tuning
# --- Features and Target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1) 
y = df['emi_eligibility']

# --- Train/Test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Apply SMOTE ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# --- Set MLflow experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

# --- Train Random Forest with MLflow ---
with mlflow.start_run(run_name="RandomForest_Classification with Tuning and SMOTE"):

    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )
    rf_clf.fit(X_train_res, y_train_res)

    # --- Predict ---
    y_pred = rf_clf.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log params & metrics ---
    mlflow.log_param("model_type", "Random Forest Classifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("class_weight", "balanced")
    
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log model ---
    mlflow.sklearn.log_model(rf_clf, "rf_classification_model")

    # --- Display results ---
    print("Random Forest Classification Metrics:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print(classification_report(y_test, y_pred))



Random Forest Classification Metrics:
Accuracy : 0.9157
Precision: 0.9101
Recall   : 0.9157
F1-score : 0.9127

Confusion Matrix:
[[12665   652  1132]
 [ 1154   795  1443]
 [ 1032  1215 58492]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.86     14449
           1       0.30      0.23      0.26      3392
           2       0.96      0.96      0.96     60739

    accuracy                           0.92     78580
   macro avg       0.70      0.69      0.70     78580
weighted avg       0.91      0.92      0.91     78580



In [14]:
# Random Forest
# --- Features and Target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)
y = df['emi_eligibility']

# --- Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Set MLflow Experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

# --- Train Normal Random Forest ---
with mlflow.start_run(run_name="RandomForest_Classification_Normal"):
    
    rf = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)

    # --- Predictions ---
    y_pred = rf.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log Parameters & Metrics ---
    mlflow.log_param("model_type", "Random Forest Classifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("tuning", "None")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log Model ---
    mlflow.sklearn.log_model(rf, "rf_classification_model_normal")

    # --- Print Results ---
    print("📊 Random Forest Classification (Normal) Results:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)



📊 Random Forest Classification (Normal) Results:
Accuracy : 0.9266
Precision: 0.9199
Recall   : 0.9266
F1-score : 0.9071

Confusion Matrix:
[[12644    14  1791]
 [ 1023    67  2302]
 [  629     5 60105]]


In [16]:
# XGBoost Classifier Model
# --- Features and Target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)
y = df['emi_eligibility']

# --- Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Set MLflow Experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

# --- Train Normal XGBoost Model ---
with mlflow.start_run(run_name="XGBoost_Classification_Normal"):
    
    xgb_model = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='mlogloss'
    )
    xgb_model.fit(X_train, y_train)

    # --- Predictions ---
    y_pred = xgb_model.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log to MLflow ---
    mlflow.log_param("model_type", "XGBoost Classifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("learning_rate", 0.1)
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("tuning", "None")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log Model ---
    mlflow.xgboost.log_model(xgb_model, "xgb_classification_model_normal")

    # --- Print Results ---
    print("📊 XGBoost Classification (Normal) Results:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


📊 XGBoost Classification (Normal) Results:
Accuracy : 0.9341
Precision: 0.9080
Recall   : 0.9341
F1-score : 0.9134

Confusion Matrix:
[[13184     7  1258]
 [ 1055     4  2333]
 [  528     1 60210]]


In [8]:
#Random Forest Classifier with SMOTE and Tuning
# --- Features and Target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1) 
y = df['emi_eligibility']

# --- Train/Test split ---
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Apply SMOTE ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size=0.2, random_state=42, stratify=y_train_res)

# --- Set MLflow experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

# --- Train Random Forest with MLflow ---
with mlflow.start_run(run_name="RandomForest_Classification with Tuning and SMOTE"):

    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )
    rf_clf.fit(X_train, y_train)

    # --- Predict ---
    y_pred = rf_clf.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log params & metrics ---
    mlflow.log_param("model_type", "Random Forest Classifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("class_weight", "balanced")
    
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log model ---
    mlflow.sklearn.log_model(rf_clf, "rf_classification_model")

    # --- Display results ---
    print("Random Forest Classification Metrics:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print(classification_report(y_test, y_pred))



Random Forest Classification Metrics:
Accuracy : 0.9610
Precision: 0.9614
Recall   : 0.9610
F1-score : 0.9610

Confusion Matrix:
[[57434  2475   829]
 [  752 59328   659]
 [ 1001  1399 58339]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     60738
           1       0.94      0.98      0.96     60739
           2       0.98      0.96      0.97     60739

    accuracy                           0.96    182216
   macro avg       0.96      0.96      0.96    182216
weighted avg       0.96      0.96      0.96    182216



In [None]:
# --- Features and Target ---
X = df.drop(['emi_eligibility', 'max_monthly_emi'], axis=1) 
y = df['emi_eligibility']

# --- Apply SMOTE ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X_train_res, y_train_res, test_size=0.2, random_state=42, stratify=y_train_res
)

# --- Set MLflow experiment ---
mlflow.set_experiment("EMI_Prediction_Classification")

# --- Train Random Forest with MLflow ---
with mlflow.start_run(run_name="Final RandomForest_Classification with Tuning and SMOTE"):

    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )
    rf_clf.fit(X_train, y_train)

    # --- Predict ---
    y_pred = rf_clf.predict(X_test)

    # --- Metrics ---
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # --- Log params & metrics ---
    mlflow.log_param("model_type", "Random Forest Classifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("class_weight", "balanced")
    
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # --- Log model to MLflow ---
    mlflow.sklearn.log_model(rf_clf, "rf_classification_model")

    # --- Save model locally as pickle ---
    import joblib
    joblib.dump(rf_clf, "FinalClassification.pkl")
    print("✅ Model saved locally as Classification.pkl")

    # --- Display results ---
    print("Random Forest Classification Metrics:")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print(classification_report(y_test, y_pred))




✅ Model saved locally as Classification.pkl
Random Forest Classification Metrics:
Accuracy : 0.9610
Precision: 0.9614
Recall   : 0.9610
F1-score : 0.9610

Confusion Matrix:
[[57434  2475   829]
 [  752 59328   659]
 [ 1001  1399 58339]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     60738
           1       0.94      0.98      0.96     60739
           2       0.98      0.96      0.97     60739

    accuracy                           0.96    182216
   macro avg       0.96      0.96      0.96    182216
weighted avg       0.96      0.96      0.96    182216

