In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset
df = pd.read_csv('credit_data.csv')

In [3]:
# Display basic info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                100 non-null    int64 
 1   Gender             100 non-null    object
 2   Marital Status     100 non-null    object
 3   Education          100 non-null    object
 4   Annual Income      100 non-null    int64 
 5   Credit Score       100 non-null    int64 
 6   Previous Defaults  100 non-null    int64 
 7   Loan Purpose       100 non-null    object
 8   Default            100 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 7.2+ KB


In [4]:
# Display the dataset

df.head()

Unnamed: 0,Age,Gender,Marital Status,Education,Annual Income,Credit Score,Previous Defaults,Loan Purpose,Default
0,59,Male,Single,Bachelor,31093,575,1,Business,0
1,49,Female,Single,Master,38070,508,0,Business,0
2,35,Female,Married,PhD,55777,573,0,Personal,1
3,28,Female,Single,High School,76958,752,0,Car,0
4,41,Male,Married,High School,102074,729,3,Business,0


In [5]:
# Handling missing values (if any)
df = df.dropna()

In [6]:
# Encoding categorical variables using pd.get_dummies()
categorical_cols = ['Gender', 'Marital Status', 'Education', 'Loan Purpose']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [7]:
df_encoded.head()

Unnamed: 0,Age,Annual Income,Credit Score,Previous Defaults,Default,Gender_Male,Marital Status_Married,Marital Status_Single,Education_High School,Education_Master,Education_PhD,Loan Purpose_Car,Loan Purpose_Home,Loan Purpose_Personal
0,59,31093,575,1,0,True,False,True,False,False,False,False,False,False
1,49,38070,508,0,0,False,False,True,False,True,False,False,False,False
2,35,55777,573,0,1,False,True,False,False,False,True,False,False,True
3,28,76958,752,0,0,False,False,True,True,False,False,True,False,False
4,41,102074,729,3,0,True,True,False,True,False,False,False,False,False


In [8]:
# Splitting dataset into features and labels; here axis = 1 column wise operation
X = df_encoded.drop('Default', axis=1)
y = df_encoded['Default']

In [9]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Scaling the numerical features
scaler = StandardScaler()
numerical_cols = ['Age', 'Annual Income', 'Credit Score', 'Previous Defaults']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

### Logistic Regression

In [11]:
# Training a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [12]:
# Making predictions on the test set
y_pred = logreg.predict(X_test)


In [13]:
# Evaluating the model
print("Accuracy:", round(accuracy_score(y_test, y_pred)*100,2), "%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 73.33 %

Confusion Matrix:
 [[20  4]
 [ 4  2]]


In [14]:
# Function to input new customer data and make predictions
def get_new_customer_data():
    # Gather input from user for each feature
    age = float(input("Enter Age: "))
    gender = input("Enter Gender (Male/Female): ")
    marital_status = input("Enter Marital Status (Single/Married/Divorced): ")
    education = input("Enter Education (High School/Bachelor/Master/PhD): ")
    annual_income = float(input("Enter Annual Income: "))
    credit_score = float(input("Enter Credit Score: "))
    previous_defaults = int(input("Enter Previous Defaults: "))
    loan_purpose = input("Enter Loan Purpose (Home/Car/Business/Personal): ")

    # Convert inputs into a DataFrame
    new_data = pd.DataFrame({
        'Age': [age],
        'Annual Income': [annual_income],
        'Credit Score': [credit_score],
        'Previous Defaults': [previous_defaults],
        'Gender': [gender],
        'Marital Status': [marital_status],
        'Education': [education],
        'Loan Purpose': [loan_purpose]
    })

    # Apply get_dummies only to categorical columns
    categorical_cols = ['Gender', 'Marital Status', 'Education', 'Loan Purpose']
    new_data_encoded = pd.get_dummies(new_data, columns=categorical_cols, drop_first=True)

    # Reindex to ensure all columns align with the trained model
    missing_cols = set(X.columns) - set(new_data_encoded.columns)
    for col in missing_cols:
        new_data_encoded[col] = 0
    new_data_encoded = new_data_encoded[X.columns]  # Ensure order of columns

    # Scale the continuous variables
    new_data_encoded[numerical_cols] = scaler.transform(new_data_encoded[numerical_cols])

    return new_data_encoded

In [15]:
# Function to predict risk and determine if credit should be granted
def predict_credit_risk():
    # Get new customer data
    new_customer_data = get_new_customer_data()
    
    # Predict the risk
    prediction = logreg.predict(new_customer_data)
    risk_score = logreg.predict_proba(new_customer_data)[0][1]  # Probability of default
    
    # Output the results
    print(f"Prediction: {'Default' if prediction == 1 else 'No Default'}")
    print(f"Risk Score (Probability of Default): {100*risk_score:.2f} %")
    
    if prediction == 1:
        print("Credit should NOT be granted due to high risk.")
    else:
        print("Credit can be granted with low risk.")


In [16]:
# Test the function with new customer input
#predict_credit_risk()

In [17]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid_log_reg = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'lbfgs']
}

# Create the GridSearchCV object
log_reg_cv = GridSearchCV(LogisticRegression(max_iter=500), param_grid_log_reg, cv=5, n_jobs=-1)
log_reg_cv.fit(X_train, y_train)

# Best parameters
print(f"Best Hyperparameters for Logistic Regression: {log_reg_cv.best_params_}")

# Predict and evaluate
y_pred_log_reg_cv = log_reg_cv.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_log_reg_cv))} %")

Best Hyperparameters for Logistic Regression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 80.0 %


### Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier with default hyperparameters
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

# Predict and evaluate
y_pred_tree = tree.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_tree))} %")

Accuracy: 56.666666666666664 %


In [19]:
# Define the parameter grid for Decision Tree
param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, 20, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# Create the GridSearchCV object
tree_cv = GridSearchCV(DecisionTreeClassifier(), param_grid_tree, cv=5, n_jobs=-1)
tree_cv.fit(X_train, y_train)

# Best parameters
print(f"Best Hyperparameters for Decision Tree: {tree_cv.best_params_}")

# Predict and evaluate
y_pred_tree_cv = tree_cv.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_tree_cv))} %")

Best Hyperparameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 20}
Accuracy: 60.0 %


### Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with default hyperparameters
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_rf))} %")

Accuracy: 63.33333333333333 %


In [22]:
# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200], # no of trees -> n_estimators
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
rf_cv = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, n_jobs=-1)
rf_cv.fit(X_train, y_train)

# Best parameters
print(f"Best Hyperparameters for Random Forest: {rf_cv.best_params_}")

# Predict and evaluate
y_pred_rf_cv = rf_cv.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_cv)}")
print(classification_report(y_test, y_pred_rf_cv))


Best Hyperparameters for Random Forest: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
Accuracy: 0.5666666666666667
              precision    recall  f1-score   support

           0       0.76      0.67      0.71        24
           1       0.11      0.17      0.13         6

    accuracy                           0.57        30
   macro avg       0.44      0.42      0.42        30
weighted avg       0.63      0.57      0.60        30



### Support Vector Machine

In [23]:
from sklearn.svm import SVC

# SVM with default hyperparameters
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_svm = svm_model.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_svm))}")

Accuracy: 76.66666666666667


In [24]:
# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Create the GridSearchCV object
svm_cv = GridSearchCV(SVC(), param_grid_svm, cv=5, n_jobs=-1)
svm_cv.fit(X_train, y_train)

# Best parameters
print(f"Best Hyperparameters for SVM: {svm_cv.best_params_}")

# Predict and evaluate
y_pred_svm_cv = svm_cv.predict(X_test)
print(f"Accuracy: {100*(accuracy_score(y_test, y_pred_svm))}")


Best Hyperparameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
Accuracy: 76.66666666666667
