In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loading the bank loan data

dataset = pd.read_excel('Bank_Personal_Loan_Modelling.xlsx',sheet_name=1)

dataset.shape

(5000, 14)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [6]:
dataset.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [9]:
dataset.drop_duplicates().count()

ID                    5000
Age                   5000
Experience            5000
Income                5000
ZIP Code              5000
Family                5000
CCAvg                 5000
Education             5000
Mortgage              5000
Personal Loan         5000
Securities Account    5000
CD Account            5000
Online                5000
CreditCard            5000
dtype: int64

In [11]:
dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [43]:
#removing columns which are not required 

df2  = dataset.drop(columns=['ID','ZIP Code'],axis=1)

In [44]:
df2.columns

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [45]:
df2.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [56]:
#assigning the values to the features variable

features = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Securities Account', 'CD Account',
       'Online', 'CreditCard']

x = df2[features]
y = df2['Personal Loan']

In [57]:
x.shape

(5000, 11)

In [58]:
#initializing RF algorithm

rf_model = RandomForestClassifier(max_features=2,n_estimators=1000,oob_score=True)

rf_model

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [60]:
rf_model.fit(x,y)

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [61]:
print("OOB_Accuracy: ",rf_model.oob_score_)

OOB_Accuracy:  0.988


In [62]:
#identifying the important features

for feature,imp in zip(features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.04489888137736484
Experience 0.04387745908590352
Income 0.34721227508214814
Family 0.0992495036111691
CCAvg 0.1785241541919169
Education 0.1651375462037364
Mortgage 0.04318147736969826
Securities Account 0.005387921086864017
CD Account 0.05430726763666971
Online 0.008254951085444513
CreditCard 0.009968563269084572


In [65]:
predict = pd.DataFrame([x['Income'],x['CCAvg'],x['Education']]).T

predict.head()

Unnamed: 0,Income,CCAvg,Education
0,49.0,1.6,1.0
1,34.0,1.5,1.0
2,11.0,1.0,1.0
3,100.0,2.7,2.0
4,45.0,1.0,2.0


In [66]:
dc_tree = tree.DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)

In [67]:
dc_tree.fit(predict,y)

DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10)

In [68]:
#generating output to a file

with open("DctreeBank.dot",'w') as f:
    f = tree.export_graphviz(dc_tree,feature_names=['Income','CCAvg','Education'],out_file=f)

In [69]:
dc_tree.score(predict,y)

0.9718