# Import all required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from joblib import dump
import matplotlib.pyplot as plt

# 1. Load and preprocess data

In [2]:
df = pd.read_csv('customer.csv')

# Drop unnecessary columns

In [3]:
df = df.drop(['Client_Num', 'Zipcode'], axis=1)


# Convert binary columns

In [4]:
binary_cols = ['Car_Owner', 'House_Owner', 'Personal_loan']
df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0})


  df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0})


# One-hot encode categorical variables

In [5]:
cat_cols = ['Gender', 'Education_Level', 'Marital_Status', 'state_cd', 'contact', 'Customer_Job']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


# Handle missing values

In [6]:
imputer = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Separate features and target

In [7]:
X = df.drop('Personal_loan', axis=1)
y = df['Personal_loan']


# 2. Split and scale data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify numerical columns

In [9]:
num_cols = ['Customer_Age', 'Dependent_Count', 'Income', 'Cust_Satisfaction_Score']

# Initialize and fit scaler

In [10]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 3. Handle class imbalance

In [11]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 4. Train model with GridSearch

In [12]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

In [13]:
model = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
model.fit(X_train_smote, y_train_smote)


# 5. Evaluate model

In [14]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Best Parameters:", model.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_prob))

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

Confusion Matrix:
 [[2486  157]
 [ 364   26]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.94      0.91      2643
           1       0.14      0.07      0.09       390

    accuracy                           0.83      3033
   macro avg       0.51      0.50      0.50      3033
weighted avg       0.78      0.83      0.80      3033


ROC AUC Score: 0.5094696197987911


# 6. Save model and scaler

In [15]:
dump(model.best_estimator_, 'loan_model.joblib')
dump(scaler, 'scaler.joblib')

print("\nModel and scaler saved successfully!")


Model and scaler saved successfully!
