In [11]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import math

import mlflow

import mlflow.sklearn

from sklearn.ensemble import VotingClassifier, RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from imblearn.over_sampling import SMOTE

import os

 

In [12]:
import pandas as pd

# Load the Excel file
df = pd.read_csv(r"C:\Users\Minfy.DESKTOP-81ME0ME\Downloads\dataset loan.csv")

# Strip leading/trailing spaces from column names (important!)
df.columns = df.columns.str.strip()

# View shape and first 5 rows
print("Shape of dataset:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Basic info
print("\nData types and null values:")
print(df.info())



Shape of dataset: (5000, 14)

First 5 rows:
   ID  Age  Experience  Income  ZIP Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0           0  
3              0                   0           0       0           0  
4              0                   0           0       0           1  

Data types and null values:
<class 'pandas.core.

In [13]:
# Strip column names (to avoid issues with spaces)
df.columns = df.columns.str.strip()

# Drop 'ID' and 'ZIP Code' if they exist
df.drop(columns=['ID', 'ZIP Code'], inplace=True, errors='ignore')

# Confirm the shape and columns
print("Shape after dropping:", df.shape)
print("Remaining columns:", df.columns.tolist())


Shape after dropping: (5000, 12)
Remaining columns: ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']


In [25]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1. Prepare data
X = df.drop(columns=['Personal Loan'])
y = df['Personal Loan']
X = pd.get_dummies(X, drop_first=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# 2. Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# 3. MLflow experiment (optional: set a name)
mlflow.set_experiment("Personal_Loan_Classification")

# 4. Train and log each model
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Log parameters and metrics
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metrics({f"{label}_precision": report[label]["precision"]
                            for label in report if label in ['0', '1']})
        mlflow.log_metrics({f"{label}_recall": report[label]["recall"]
                            for label in report if label in ['0', '1']})

        # Log model
        mlflow.sklearn.log_model(model, "model")

        print(f"✅ Logged {model_name} to MLflow with accuracy: {acc:.4f}")


2025/07/01 15:15:45 INFO mlflow.tracking.fluent: Experiment with name 'Personal_Loan_Classification' does not exist. Creating a new experiment.




✅ Logged Logistic Regression to MLflow with accuracy: 0.8960




✅ Logged Random Forest to MLflow with accuracy: 0.9930




✅ Logged XGBoost to MLflow with accuracy: 0.9940


In [28]:
mlflow.set_tracking_uri("http://localhost:5000")


In [29]:
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment_name = "Personal_Loan_Classification"
experiment = client.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

# 🔍 Find best run (based on accuracy)
best_run = None
best_accuracy = -1

for run in client.search_runs(experiment_ids=experiment_id):
    acc = run.data.metrics.get("accuracy", 0)
    if acc > best_accuracy:
        best_accuracy = acc
        best_run = run

# 🏷 Register model from best run
model_uri = f"runs:/{best_run.info.run_id}/model"

model_name = "Best_Personal_Loan_Model"  # You can give any name
mlflow.register_model(model_uri=model_uri, name=model_name)

print(f"✅ Registered best model from run {best_run.info.run_id} with accuracy {best_accuracy:.4f}")


Successfully registered model 'Best_Personal_Loan_Model'.
2025/07/01 15:53:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best_Personal_Loan_Model, version 1


✅ Registered best model from run c10454b288a14561b2f5ef5976586f6a with accuracy 0.9940


Created version '1' of model 'Best_Personal_Loan_Model'.


In [36]:
import mlflow
from mlflow.sklearn import load_model
import pandas as pd

# Load model from Model Registry
model = load_model("models:/Best_Personal_Loan_Model/1")


In [39]:
new_data = pd.DataFrame({
    'Age': [35],
    'Experience': [10],
    'Income': [150],            # 🔼 High income
    'Family': [4],
    'CCAvg': [5.0],             # 🔼 High credit card spending
    'Education': [3],           # 🔼 Highest education level
    'Mortgage': [100],          # 🔼 Has mortgage
    'Securities Account': [1],  # 🔼 Investment-minded
    'CD Account': [1],          # 🔼 CD Account active
    'Online': [1],
    'CreditCard': [1]
})
# If your original data had categorical columns encoded:
new_data_encoded = pd.get_dummies(new_data, drop_first=True)

# Match columns (padding missing ones if needed)
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for col in missing_cols:
    new_data_encoded[col] = 0
new_data_encoded = new_data_encoded[X.columns]  # Ensure order

# Scale
new_scaled = scaler.transform(new_data_encoded)

# Predict
predicted_class = model.predict(new_scaled)
predicted_proba = model.predict_proba(new_scaled)

print(f"Predicted Class: {predicted_class[0]}")
print(f"Predicted Probabilities: {predicted_proba[0]}")


Predicted Class: 1
Predicted Probabilities: [2.169609e-05 9.999783e-01]
