In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/credit-card-defaulters/credit.csv


In [2]:
# Load the credit data set from the Kaggle input directory and preview its first rows.
data = pd.read_csv('../input/credit-card-defaulters/credit.csv')
data.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


The majority of the data is qualitative (categorical).

In [3]:
# We cannot one-hot encode every categorical column directly: it would create
# a separate indicator column per category level and add too many new
# independent attributes, which should be avoided.

data.shape

(1000, 17)

In [4]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

In [5]:
# Convert every string ('object') column to pandas' categorical dtype;
# columns of any other dtype are left untouched.
for column in data.columns:
    if data[column].dtype != 'object':
        continue
    data[column] = pd.Categorical(data[column])

In [6]:
# Re-check the dtypes: the former 'object' columns should now be 'category'.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   checking_balance      1000 non-null   category
 1   months_loan_duration  1000 non-null   int64   
 2   credit_history        1000 non-null   category
 3   purpose               1000 non-null   category
 4   amount                1000 non-null   int64   
 5   savings_balance       1000 non-null   category
 6   employment_duration   1000 non-null   category
 7   percent_of_income     1000 non-null   int64   
 8   years_at_residence    1000 non-null   int64   
 9   age                   1000 non-null   int64   
 10  other_credit          1000 non-null   category
 11  housing               1000 non-null   category
 12  existing_loans_count  1000 non-null   int64   
 13  job                   1000 non-null   category
 14  dependents            1000 non-null   int64   
 15  phone

In [7]:
# Count how often each level of the categorical attributes occurs,
# starting with checking_balance.

print (data.checking_balance.value_counts())

unknown       394
< 0 DM        274
1 - 200 DM    269
> 200 DM       63
Name: checking_balance, dtype: int64


In [8]:
# Level frequencies of credit_history.
print(data.credit_history.value_counts())

good         530
critical     293
poor          88
very good     49
perfect       40
Name: credit_history, dtype: int64


In [9]:
# Level frequencies of several other categorical columns, printed in order.
for column in ['purpose', 'employment_duration', 'housing', 'job', 'phone']:
    print(data[column].value_counts())


furniture/appliances    473
car                     337
business                 97
education                59
renovations              22
car0                     12
Name: purpose, dtype: int64
1 - 4 years    339
> 7 years      253
4 - 7 years    174
< 1 year       172
unemployed      62
Name: employment_duration, dtype: int64
own      713
rent     179
other    108
Name: housing, dtype: int64
skilled       630
unskilled     200
management    148
unemployed     22
Name: job, dtype: int64
no     596
yes    404
Name: phone, dtype: int64


In [10]:
# Integer codes used to replace the levels of the ordinal / binary columns.
# Levels are listed in ascending order and numbered from 1; the special level
# 'unknown' is coded as -1 and the target 'default' as no=0 / yes=1.
replace_data = {
    column: {level: code for code, level in enumerate(levels, start=1)}
    for column, levels in [
        ("checking_balance", ["< 0 DM", "1 - 200 DM", "> 200 DM"]),
        ("credit_history", ["critical", "poor", "good", "very good", "perfect"]),
        ("savings_balance", ["< 100 DM", "100 - 500 DM", "500 - 1000 DM", "> 1000 DM"]),
        ("employment_duration", ["unemployed", "< 1 year", "1 - 4 years", "4 - 7 years", "> 7 years"]),
        ("phone", ["no", "yes"]),
    ]
}
replace_data["checking_balance"]["unknown"] = -1
replace_data["savings_balance"]["unknown"] = -1
replace_data["default"] = {"no": 0, "yes": 1}

In [11]:
# Columns that are one-hot encoded with pd.get_dummies instead of being mapped to integer codes.
cols = ['purpose', 'housing', 'other_credit', 'job']

In [12]:
# Encode the ordinal / binary columns with the integer codes in `replace_data`.
# Each column is mapped explicitly rather than through DataFrame.replace,
# because replace on categorical columns is deprecated in recent pandas
# releases. The int64 cast fails loudly if a level has no code in
# `replace_data` instead of silently leaving a string behind.
encoded = data.copy()
for column, mapping in replace_data.items():
    encoded[column] = encoded[column].astype(object).map(mapping).astype('int64')

# One-hot encode the remaining columns listed in `cols`; dtype=int keeps the
# indicator columns as 0/1 integers regardless of the pandas default dtype.
data = pd.get_dummies(encoded, columns=cols, dtype=int)
data.head(10)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,existing_loans_count,...,housing_other,housing_own,housing_rent,other_credit_bank,other_credit_none,other_credit_store,job_management,job_skilled,job_unemployed,job_unskilled
0,1,6,1,1169,-1,5,4,4,67,2,...,0,1,0,0,1,0,0,1,0,0
1,2,48,3,5951,1,3,2,2,22,1,...,0,1,0,0,1,0,0,1,0,0
2,-1,12,1,2096,1,4,2,3,49,1,...,0,1,0,0,1,0,0,0,0,1
3,1,42,3,7882,1,4,2,4,45,1,...,1,0,0,0,1,0,0,1,0,0
4,1,24,2,4870,1,3,3,4,53,2,...,1,0,0,0,1,0,0,1,0,0
5,-1,36,3,9055,-1,3,2,4,35,1,...,1,0,0,0,1,0,0,0,0,1
6,-1,24,3,2835,3,5,3,4,53,1,...,0,1,0,0,1,0,0,1,0,0
7,2,36,3,6948,1,3,2,2,35,1,...,0,0,1,0,1,0,1,0,0,0
8,-1,12,3,3059,4,4,2,4,61,1,...,0,1,0,0,1,0,0,0,0,1
9,2,30,1,5234,1,1,4,2,28,2,...,0,1,0,0,1,0,1,0,0,0


In [13]:
# Confirm that every column is numeric after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   checking_balance              1000 non-null   int64
 1   months_loan_duration          1000 non-null   int64
 2   credit_history                1000 non-null   int64
 3   amount                        1000 non-null   int64
 4   savings_balance               1000 non-null   int64
 5   employment_duration           1000 non-null   int64
 6   percent_of_income             1000 non-null   int64
 7   years_at_residence            1000 non-null   int64
 8   age                           1000 non-null   int64
 9   existing_loans_count          1000 non-null   int64
 10  dependents                    1000 non-null   int64
 11  phone                         1000 non-null   int64
 12  default                       1000 non-null   int64
 13  purpose_business              1000

In [14]:
# Splitting the data into the feature matrix `x` and the target vector `y`.
# The target is selected by label instead of popped, so `data` keeps its
# 'default' column and this cell can be re-run without raising a KeyError.

x = data.drop('default', axis=1)
y = data['default']

In [15]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is reproducible.
x_train,x_test,y_train,y_test= train_test_split(x,y, test_size= 0.30, random_state= 1)

In [16]:
# Decision tree with the Gini impurity criterion and no depth limit.
# random_state seeds the estimator's internal randomness so results are
# reproducible; it does not split ("chop") the data.
dtree= DecisionTreeClassifier(criterion= 'gini', random_state= 1)


In [17]:
# Fit the unconstrained tree on the training split.
dtree.fit(x_train, y_train)

DecisionTreeClassifier(random_state=1)

In [18]:
# Predicted classes of the unconstrained tree for the held-out test rows.
y_pred = dtree.predict(x_test)

In [19]:
# Inspect the raw predictions (1 = default, 0 = no default, per replace_data).
print(y_pred)

[1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1 0
 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0
 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 1
 1 0 1 0 0 1 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1
 0 0 0 0]


In [20]:
# Accuracy of the unconstrained tree on the test split.
print(dtree.score(x_test, y_test))

0.6933333333333334


In [21]:
# Accuracy of the unconstrained tree on the training split, for comparison with the test score.
print(dtree.score(x_train, y_train))

1.0


No model can be perfect enough to give 100% accurate predictions, so a training score of 1.0 means the tree is overfitting the data.

In [22]:
# Pruning the tree: limit how deep it may grow to reduce overfitting.
dtree1= DecisionTreeClassifier(criterion='gini', max_depth=3, random_state= 1) 
# max_depth=3 means every branch may have at most 3 splits from root to leaf;
# 3 is the threshold at which the tree stops growing.

In [23]:
# Fit the depth-limited tree on the training split.
dtree1.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=1)

In [24]:
# Train accuracy first, then test accuracy, of the depth-limited tree.
print(dtree1.score(x_train, y_train))
print(dtree1.score(x_test, y_test))

0.7528571428571429
0.7433333333333333


The difference between the train and test scores should not be extreme;<br>
it should be as small as possible.

In [25]:
# Confusion matrix on the test split (rows = actual class, columns = predicted class).
# NOTE(review): y_pred holds the predictions of the unconstrained tree `dtree`,
# not of the depth-limited `dtree1` — confirm that this is the intended model.
cm = metrics.confusion_matrix(y_test, y_pred)

In [26]:
# Show the confusion matrix computed from y_pred.
print(cm)

[[168  46]
 [ 46  40]]


In [27]:
# Ensemble technique: bagging
from sklearn.ensemble import BaggingClassifier


In [28]:
# Bagging ensemble built from the decision tree `dtree`.
# n_estimators is the number of trees in the ensemble (not a number of rows).
# The tree is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both old and new versions.
bagging = BaggingClassifier(dtree, n_estimators=50, random_state=1)

In [29]:
# Fit the bagging ensemble on the training split.
bagging.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1),
                  n_estimators=50, random_state=1)

In [31]:
# Test-set predictions and accuracy of the bagging ensemble.
y_pred1= bagging.predict(x_test)
print(bagging.score(x_test, y_test))

0.7733333333333333


In [32]:
# Confusion matrix of the bagging predictions on the test split.
print(metrics.confusion_matrix(y_test, y_pred1))

[[192  22]
 [ 46  40]]


Bagging improved the test score of the decision tree from 0.6933
<br>
to 0.7733

In [33]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [34]:
# AdaBoost with 50 estimators. No base estimator is passed here, so
# scikit-learn's default weak learner is used; the class does accept a
# custom base estimator if one is needed.
adaboost = AdaBoostClassifier(n_estimators= 50, random_state= 1)


In [35]:
# Fit AdaBoost on the training split.
adaboost.fit(x_train, y_train)

AdaBoostClassifier(random_state=1)

In [36]:
# Test-set predictions and accuracy of AdaBoost.
y_pred2= adaboost.predict(x_test)
print(adaboost.score(x_test, y_test))

0.7366666666666667


This score is lower than the bagging score,

so we will prefer bagging over AdaBoost.

In [37]:
# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
# Gradient boosting with 50 boosting stages and a fixed seed.
gradient= GradientBoostingClassifier(n_estimators= 50, random_state= 1)

In [39]:
# Fit gradient boosting on the training split.
gradient.fit(x_train, y_train)

GradientBoostingClassifier(n_estimators=50, random_state=1)

In [40]:
# Test-set predictions and accuracy of gradient boosting.
y_pred3 = gradient.predict(x_test)
print(gradient.score(x_test, y_test))

0.74


Similar to bagging, although slightly lower (0.74 vs. 0.7733).

In [41]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

In [44]:
# Random forest of 50 trees with a fixed seed.
random_forest= RandomForestClassifier(n_estimators= 50, random_state= 1, max_features= 12)
# max_features=12 is the number of feature columns considered when looking for the best split

In [45]:
# Fit the random forest on the training split.
random_forest.fit(x_train, y_train)

RandomForestClassifier(max_features=12, n_estimators=50, random_state=1)

In [47]:
# Test-set predictions and accuracy of the random forest.
y_pred4= random_forest.predict(x_test)
print(random_forest.score(x_test, y_test))

0.7766666666666666


The score is a little better than bagging: <br>
bagging is 0.7733, <br>
random_forest is 0.7767

In [48]:
# Ensemble techniques do not improve results on every data set;
# in those cases a single (solo) model can be used instead.
# They are mostly used with decision trees to reduce their overfitting problem.