## Importing Packages

In [1]:
import pandas as pd

## Data Loading

In [2]:
df = pd.read_csv("dataset/loan.csv") # loading the dataset "loan.csv"

In [3]:
df.head() # Checking contents from the loaded dataset. The "default_status" column was already converted into numerical values 
          # where FALSE = 0 and TRUE = 1

Unnamed: 0,loan_type,loan_amount,interest_rate,loan_term,employment_type,income_level,credit_score,gender,marital_status,education_level,default_status
0,Car Loan,16795,0.051852,15,Self-employed,Medium,833,Male,Single,Master,0
1,Personal Loan,1860,0.089296,56,Full-time,Medium,776,Female,Married,Bachelor,0
2,Personal Loan,77820,0.07047,51,Full-time,Low,697,Male,Divorced,High School,0
3,Car Loan,55886,0.062155,30,Full-time,Low,795,Female,Married,PhD,0
4,Home Loan,7265,0.070635,48,Part-time,Low,519,Female,Married,High School,0


In [4]:
df.info() # Checking for null values and data type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loan_type        5000 non-null   object 
 1   loan_amount      5000 non-null   int64  
 2   interest_rate    5000 non-null   float64
 3   loan_term        5000 non-null   int64  
 4   employment_type  5000 non-null   object 
 5   income_level     5000 non-null   object 
 6   credit_score     5000 non-null   int64  
 7   gender           5000 non-null   object 
 8   marital_status   5000 non-null   object 
 9   education_level  5000 non-null   object 
 10  default_status   5000 non-null   int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 429.8+ KB


## Data Transformation

In [5]:
df['default_status'].unique() # Checking for data type of each column and converted data

array([0, 1], dtype=int64)

In [6]:
# Conversion of data from categorical to numerical
df['loan_type'] = df['loan_type'].map({'Car Loan':0, 'Personal Loan':1, 'Home Loan':2, 'Education Loan':3}).astype('int')
df['employment_type'] = df['employment_type'].map({'Self-employed':2, 'Full-time':1, 'Part-time':0}).astype('int')
df['income_level'] = df['income_level'].map({'Medium':1, 'Low':0, 'High':2}).astype('int')
df['gender'] = df['gender'].map({'Male':1, 'Female':0}).astype('int')
df['marital_status'] = df['marital_status'].map({'Single':0, 'Married':1, 'Divorced':2}).astype('int')
df['education_level'] = df['education_level'].map({'Master':2, 'Bachelor':1, 'High School':0, 'PhD':3}).astype('int')

In [7]:
df.info() # Checking for converted data type. To proceed with modeling, all data type should be numerical.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loan_type        5000 non-null   int32  
 1   loan_amount      5000 non-null   int64  
 2   interest_rate    5000 non-null   float64
 3   loan_term        5000 non-null   int64  
 4   employment_type  5000 non-null   int32  
 5   income_level     5000 non-null   int32  
 6   credit_score     5000 non-null   int64  
 7   gender           5000 non-null   int32  
 8   marital_status   5000 non-null   int32  
 9   education_level  5000 non-null   int32  
 10  default_status   5000 non-null   int64  
dtypes: float64(1), int32(6), int64(4)
memory usage: 312.6 KB


In [8]:
df.head() # Rreviewing data content. Data are on different ranges. Applying feature scaling will improve performance of algorithm

Unnamed: 0,loan_type,loan_amount,interest_rate,loan_term,employment_type,income_level,credit_score,gender,marital_status,education_level,default_status
0,0,16795,0.051852,15,2,1,833,1,0,2,0
1,1,1860,0.089296,56,1,1,776,0,1,1,0
2,1,77820,0.07047,51,1,0,697,1,2,0,0
3,0,55886,0.062155,30,1,0,795,0,1,3,0
4,2,7265,0.070635,48,0,0,519,0,1,0,0


In [9]:
# Setting data for X and y
X = df.drop('default_status', axis=1)
y = df['default_status']

In [10]:
X.shape

(5000, 10)

In [11]:
y.shape

(5000,)

#### Feature Scaling

In [12]:
# Setting cols as variable for feature scaling
cols = ['loan_amount', 'interest_rate', 'loan_term', 'credit_score']

In [13]:
# Using RobustScaler for scaling
from sklearn.preprocessing import RobustScaler
st = RobustScaler()
X[cols] = st.fit_transform(X[cols])

In [14]:
# Cheking for the scaled value
X

Unnamed: 0,loan_type,loan_amount,interest_rate,loan_term,employment_type,income_level,credit_score,gender,marital_status,education_level
0,0,-0.656992,-1.334365,-0.869565,2,1,0.945848,1,0,2
1,1,-0.954917,0.470603,0.913043,1,1,0.740072,0,1,1
2,1,0.560343,-0.436900,0.695652,1,0,0.454874,1,2,0
3,0,0.122801,-0.837700,-0.217391,1,0,0.808664,0,1,3
4,2,-0.847098,-0.428934,0.565217,0,0,-0.187726,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4995,0,-0.235089,-0.455318,0.956522,2,2,-0.216606,1,1,3
4996,1,-0.015819,-1.114868,0.652174,0,1,-0.249097,1,0,3
4997,2,-0.842888,-0.738553,1.000000,1,2,-0.429603,0,0,0
4998,0,0.060363,0.741455,-1.000000,2,1,0.566787,1,1,3


### Model Training using Random Forest Classifier

#### Random Forest Classifier

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [17]:
clf.fit(X_train, y_train)

In [18]:
y_pred = clf.predict(X_test)

#### Feature importance

In [19]:
pd.DataFrame(clf.feature_importances_, 
             index = X_train.columns, 
             columns=['Importance']).sort_values('Importance', ascending=False)

Unnamed: 0,Importance
interest_rate,0.200989
loan_amount,0.200648
credit_score,0.193229
loan_term,0.152956
education_level,0.053153
loan_type,0.050723
income_level,0.044471
marital_status,0.041698
employment_type,0.04009
gender,0.022044


The feature importance indicates the significance of each of the features in a model. Interest rate and Loan amount has the most influence in these features which has the highest importance value at approximately 19.9%. The second most important feature is the credit score, with an importance value just a little behind the top 2 around 19.2%. Larger loan amounts impact approval chances. Next is the loan term which has importance value of 15.4%. The rest are below 6% which means they have lesser impact when it comes to predicting if a borrower will default.

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7976

The model’s accuracy on the test data is around 80.08% which is considerably good. Testing with cross validation score for more comparison.

In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_train, y_train, cv=10)

array([0.80266667, 0.79733333, 0.8       , 0.80266667, 0.80266667,
       0.79733333, 0.8       , 0.8       , 0.79733333, 0.8       ])

Cross-validation results indicate consistent performance across different folds.