## Ensemble Technique on credit dataset

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [52]:
credits_df = pd.read_csv("credit.csv")

In [53]:
credits_df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [54]:
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


In [55]:
#The following code loops through each column and checks if the column type is object then converts those columns
# into categorical with each distinct value becoming a category or code.

for col in credits_df.columns:
    
    if credits_df[col].dtype == 'object':
        print (col)
        credits_df[col] = pd.Categorical(credits_df[col]).codes

checking_balance
credit_history
purpose
savings_balance
employment_duration
other_credit
housing
job
phone
default


In [56]:
x = credits_df.iloc[:,:-1]
y = credits_df.iloc[:,-1]

In [57]:
credits_df.shape

(1000, 17)

### Split the dataset

In [58]:
# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

In [59]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 300, random_state = 0)

In [60]:
x_train.shape

(700, 16)

### Decision Tree


In [61]:
classifier = DecisionTreeClassifier(criterion='entropy')

In [62]:
classifier.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [63]:
classifier.score(x_test,y_test)

0.67

In [64]:
classifier.score(x_train,y_train)

1.0

### Regularisation on DT

In [72]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
dt_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [73]:
print(dt_model.score(x_train,y_train))

print(dt_model.score(x_test , y_test))

0.7671428571428571
0.6833333333333333


### Ensemble - Bagging

In [65]:
from sklearn.ensemble import BaggingClassifier

In [66]:
bg_clf = BaggingClassifier(n_estimators=50,max_samples=0.8,oob_score=True)

In [67]:
bg_clf.fit(x,y)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=0.8,
         n_estimators=50, n_jobs=1, oob_score=True, random_state=None,
         verbose=0, warm_start=False)

In [70]:
print (bg_clf.oob_score_)

0.744


In [71]:
bg_clf.fit(x_train,y_train)
print (bg_clf.score(x_test,y_test))

0.8
