In [1]:
#Last amended: 30th June, 2020
#My folder:    /home/ashok/Documents/5.decisiontree
#Data folder:  /home/ashok/datasets/german_credit
#Data source: UCI repository
# Ref: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

# Objectives:
#    i)    Read and explore data
#    ii)   Process data using sklearn
#    iii)  Learn to build decision tree model
#    iv)   Vary decision tree parameters and check
#          how accuracy is affected
#    v)    Feature importance

In [2]:
## PERFORM FIRST
#  https://github.com/harnalashok/general/blob/master/Pipeline%20%26%20ColumnTransformer/data%20pipelining%20template.ipynb####
# 1.0 Reset memory
%reset -f
# 1.1 Call libraries
import numpy as np
import pandas as pd
# 1.2 For OS operatios
import os
# 1.3 For plotting
import matplotlib.pyplot as plt

# 1.4 Class for applying multiple data transformation jobs
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1.5 Scale numeric data
from sklearn.preprocessing import StandardScaler as ss
from sklearn.preprocessing import Normalizer as nz

# 1.6 One hot encode data--Convert to dummy
from sklearn.preprocessing import OneHotEncoder as ohe

# 1.7 for data splitting
from sklearn.model_selection import train_test_split

# 1.8 Modeler
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier as dt

###############

In [3]:
# 2.0 Display outputs from all commands from a cell not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# 2.1 Kill warnings
import warnings
warnings.filterwarnings("ignore")


# 2.2 Change your working folder
#     and check files therein
path = "C:\\Users\\ashok\\Desktop\\cbi\\5.decisiontree"
#path = "/home/ashok/datasets/german_credit"
os.chdir(path)
os.listdir()

# 2.3 Change ipython options to display all data columns
pd.options.display.max_columns = 300
np.set_printoptions(
                    threshold=np.inf,
                    precision=3
                    )


['.ipynb_checkpoints',
 '1.iris_graphviz.py',
 '1.toy.py',
 '2.german_credit_verysimple.py',
 '3.german_credit_simple.py',
 '4..german_credit.py',
 '5.german_credit_randomForest ver1.py',
 'abc',
 'class_exercise_data',
 'german_credit.csv.zip',
 'iris',
 'iris.pdf',
 'iris_wheader.csv',
 'Untitled.ipynb']

In [5]:
# 3.0 Read data from zip file
german = pd.read_csv("german_credit.csv.zip")

# 3.1 Get to know data:
german.shape             # 1000 X 20
german.columns
german.dtypes            # All are int64
german.dtypes.value_counts() 

# 3.2 Look at data
german.head()             # Target: creditability. All 1's are at top
german.tail()             #  and 0s at the bottom

(1000, 20)

Index(['creditability', 'account_balance', 'previous_credit_payment_status',
       'credit_duration_month', 'purpose_of_earlier_credit', 'credit_amount',
       'installment_percent', 'current_emploment_length',
       'sex_and_marital_status', 'guarantors', 'duration_at_current_address',
       'most_valuable_available_asset', 'age', 'concurrent_credits',
       'apartmenttype', 'howmanycreditsat_this_bank', 'occupation',
       'dependents', 'telephone', 'foreign_worker'],
      dtype='object')

creditability                     int64
account_balance                   int64
previous_credit_payment_status    int64
credit_duration_month             int64
purpose_of_earlier_credit         int64
credit_amount                     int64
installment_percent               int64
current_emploment_length          int64
sex_and_marital_status            int64
guarantors                        int64
duration_at_current_address       int64
most_valuable_available_asset     int64
age                               int64
concurrent_credits                int64
apartmenttype                     int64
howmanycreditsat_this_bank        int64
occupation                        int64
dependents                        int64
telephone                         int64
foreign_worker                    int64
dtype: object

int64    20
dtype: int64

Unnamed: 0,creditability,account_balance,previous_credit_payment_status,credit_duration_month,purpose_of_earlier_credit,credit_amount,installment_percent,current_emploment_length,sex_and_marital_status,guarantors,duration_at_current_address,most_valuable_available_asset,age,concurrent_credits,apartmenttype,howmanycreditsat_this_bank,occupation,dependents,telephone,foreign_worker
0,1,1,4,18,2,1049,4,2,2,1,4,2,21,3,1,1,3,1,1,1
1,1,1,4,9,0,2799,2,3,3,1,2,1,36,3,1,2,3,2,1,1
2,1,2,2,12,9,841,2,4,2,1,4,1,23,3,1,1,2,1,1,1
3,1,1,4,12,0,2122,3,3,3,1,2,1,39,3,1,2,2,2,1,2
4,1,1,4,12,0,2171,4,3,3,1,4,2,38,1,2,2,2,1,1,2


Unnamed: 0,creditability,account_balance,previous_credit_payment_status,credit_duration_month,purpose_of_earlier_credit,credit_amount,installment_percent,current_emploment_length,sex_and_marital_status,guarantors,duration_at_current_address,most_valuable_available_asset,age,concurrent_credits,apartmenttype,howmanycreditsat_this_bank,occupation,dependents,telephone,foreign_worker
995,0,1,2,24,3,1987,2,3,3,1,4,1,21,3,1,1,2,2,1,1
996,0,1,2,24,0,2303,4,5,3,2,1,1,45,3,2,1,3,1,1,1
997,0,4,4,21,0,12680,4,5,3,1,4,4,30,3,3,1,4,1,2,1
998,0,2,2,12,3,6468,2,1,3,1,1,4,52,3,2,1,4,1,2,1
999,0,1,2,30,2,6350,4,5,3,1,4,2,31,3,2,1,3,1,1,1


In [6]:
# 3.3 Shuffle data
german = german.sample(frac = 1)
 
# 3.4 Missing data?
german.isnull().sum()    # None

# 3.5 Is target balanced?
german['creditability'].value_counts()  # Ratio of 700:300

creditability                     0
account_balance                   0
previous_credit_payment_status    0
credit_duration_month             0
purpose_of_earlier_credit         0
credit_amount                     0
installment_percent               0
current_emploment_length          0
sex_and_marital_status            0
guarantors                        0
duration_at_current_address       0
most_valuable_available_asset     0
age                               0
concurrent_credits                0
apartmenttype                     0
howmanycreditsat_this_bank        0
occupation                        0
dependents                        0
telephone                         0
foreign_worker                    0
dtype: int64

1    700
0    300
Name: creditability, dtype: int64

In [7]:
# 4.1 Separate predictors and target
# 4.1.1 Popup target
y = german.pop('creditability')
y[:3]                 # Pandas Series

# 4.1.2 Create an alias of german
X = german

943    0
126    1
564    1
Name: creditability, dtype: int64

In [8]:
###########################
## Split and model
###########################

# 5 Split into train and test datasets AS ALSO GET INDICIES
X_train,X_test, y_train, y_test = train_test_split(
                                                    X,               # Predictors
                                                    y,               # Target
                                                    test_size = 0.3  # split-ratio
                                                    )



# 5.1 Check the splits
X_train.shape       # 700 X 19
X_test.shape        # 300 X 19
y_train.shape       # (700,)
y_test.shape        # (300,)

(700, 19)

(300, 19)

(700,)

(300,)

In [9]:
# 6.1 Create some features:
# 6.1.1 Get an idea of 'age' variation
X_train['age'].min()         # 20
X_train['age'].max()         # 75

19

75

In [10]:
# 6.2 Based upon above statistics from X_train,
#     decide intervals for bins:

X_train['age_cat'] = pd.cut(
                             X_train['age'],
                             [5,35,50,100],
                             labels=["0","1","2"]
                          )                      # Equal range cut

X_train[['age','age_cat']].head()


# 6.3 Transform X_test in similar manner:
X_test['age_cat'] = pd.cut(
                             X_test['age'],
                             [5,35,50,100],
                             labels=["0","1","2"]
                          )                      # Equal range cut

X_test[['age','age_cat']].head()


Unnamed: 0,age,age_cat
226,35,0
331,26,0
257,23,0
656,38,1
758,24,0


Unnamed: 0,age,age_cat
721,40,1
740,31,0
760,28,0
995,21,0
956,61,2


In [11]:
# 6.4 Get min and max values for credit_amount
#     so that we can create bins:

X_train['credit_amount'].min()    # 250
X_train['credit_amount'].max()    # 18424

276

15945

In [12]:
# 6.5 Decide intervals and bin credit amount
#     We take min as 200 and max as 20000

X_train['credit_amount_cat'] = pd.cut(
                                      X_train['credit_amount'],
                                      [200,6000,12000,20000],
                                      labels=["0","1","2"]
                                     )

# 6.5.1 Check
X_train[['credit_amount','credit_amount_cat']].head()

# 6.6 Similarly for X_test
X_test['credit_amount_cat'] = pd.cut(
                                      X_test['credit_amount'],
                                      [200,6000,12000,20000],
                                      labels=["0","1","2"]
                                     )
# 6.6.1 Check
X_test[['credit_amount','credit_amount_cat']].head()

Unnamed: 0,credit_amount,credit_amount_cat
226,4380,0
331,2404,0
257,2146,0
656,1216,0
758,2150,0


Unnamed: 0,credit_amount,credit_amount_cat
721,1905,0
740,1736,0
760,3060,0
995,1987,0
956,2767,0


In [13]:
### 7.0
###    We now want to know which of the columns are categorical
###    but disguised as integers

# 7.1 How many unique values per column.
#     Check every column
#     We will assume that if unique values are 4 or less
#     it is categorical column else numeric
X_train.nunique()
X_train.nunique() < 5    # All True are categorical
dg = (X_train.nunique() < 5)

# 7.2 List of cat/num columns
categorical_columns = dg[dg].index.tolist()
numerical_columns  = dg[dg == False].index.tolist()

# 7.3 Here is our list
categorical_columns    # 15
numerical_columns      # 6


account_balance                     4
previous_credit_payment_status      5
credit_duration_month              32
purpose_of_earlier_credit          10
credit_amount                     665
installment_percent                 4
current_emploment_length            5
sex_and_marital_status              4
guarantors                          3
duration_at_current_address         4
most_valuable_available_asset       4
age                                53
concurrent_credits                  3
apartmenttype                       3
howmanycreditsat_this_bank          4
occupation                          4
dependents                          2
telephone                           2
foreign_worker                      2
age_cat                             3
credit_amount_cat                   3
dtype: int64

account_balance                    True
previous_credit_payment_status    False
credit_duration_month             False
purpose_of_earlier_credit         False
credit_amount                     False
installment_percent                True
current_emploment_length          False
sex_and_marital_status             True
guarantors                         True
duration_at_current_address        True
most_valuable_available_asset      True
age                               False
concurrent_credits                 True
apartmenttype                      True
howmanycreditsat_this_bank         True
occupation                         True
dependents                         True
telephone                          True
foreign_worker                     True
age_cat                            True
credit_amount_cat                  True
dtype: bool

['account_balance',
 'installment_percent',
 'sex_and_marital_status',
 'guarantors',
 'duration_at_current_address',
 'most_valuable_available_asset',
 'concurrent_credits',
 'apartmenttype',
 'howmanycreditsat_this_bank',
 'occupation',
 'dependents',
 'telephone',
 'foreign_worker',
 'age_cat',
 'credit_amount_cat']

['previous_credit_payment_status',
 'credit_duration_month',
 'purpose_of_earlier_credit',
 'credit_amount',
 'current_emploment_length',
 'age']

In [14]:
# 8.0 Preparing for inputs to ColumnTransformer()
#     Create a tuple of processing tasks, as:
#     (taskName, objectToPerformTask, columns-upon-which-to-perform)

# 8.1 OHE transformer: 
#     To one hot encode categorical columns
#     Format: (some _name   , transformer  , columns_list)
op_cat =      ('categorical', ohe()        , categorical_columns)

# 8.2 Scaling Transformer:
#     To scale numerical columns
op_num = ('numeric', ss(), numerical_columns)

# 8.3 Instantiate columnTransformer class.
#     Specify what all operations to perform in parallel
col_trans = ColumnTransformer([op_cat, op_num])


# Just checking if ColumnTransformer is OK.
#  We will be nesting ColumnTransformer in Pipeline
#   No error should be generated

# 8.4 Learn data
col_trans.fit(X_train)
# 8.5 Now transform X_train
Xt = col_trans.transform(X_train)


ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('categorical',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=True),
                                 ['account_balance', 'installment_percent',
                                  'sex_and_marital_status', 'guarantors',
                                  'duration_at_current_address',
                                  'most_valuab...
                                  'concurrent_credits', 'apartmenttype',
                                  'howmanycreditsat_this_bank', 'occupation',
                                  'dependents', 'telephone', 'foreign_worker',
                                  'age_cat', 'credit_am

In [15]:
# 8.6 Just 
Xt.shape           # 700 X 55
Xt[:5, :3]         # See dummy variables
Xt[:5,49:]         # See only numeric columns`

(700, 55)

array([[1., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

array([[-0.505, -0.233, -1.033,  0.399, -0.293, -0.03 ],
       [ 1.309, -0.233,  0.056, -0.314, -0.293, -0.824],
       [ 1.309, -0.886, -0.307, -0.407, -1.11 , -1.089],
       [ 1.309,  0.257,  0.056, -0.743, -1.11 ,  0.235],
       [-0.505,  0.747, -1.033, -0.406, -0.293, -1.   ]])

In [16]:
# 9.0 Develop Pipeline now
pipe = Pipeline(
                 [
                     ('ct', col_trans),
                     ('dt', dt())
                 ]
               )

In [17]:
# 9.1 pipe also acts as an estimator
#     Train pipe
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['account_balance',
                                                   'installment_percent',
                                                   'sex_and_marital_status',
                                                   'guarantors',
 

In [18]:
# 9.2 Make predictions
pred = pipe.predict(X_test)
pred
# 9.3 Check Accuracy
np.sum(pred == y_test)/len(y_test)    # 67%

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1], d

0.6566666666666666

In [19]:
# 10.0 Change decision tree parameters and check
pipe = Pipeline(
                 [
                     ('ct', col_trans),
                     ('dt', dt(min_samples_leaf = 10))
                 ]
               )
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
np.sum(pred == y_test)/len(y_test)    # 67%

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['account_balance',
                                                   'installment_percent',
                                                   'sex_and_marital_status',
                                                   'guarantors',
 

0.6566666666666666

In [20]:
# 10.1 Change decision tree parameters further
pipe = Pipeline(
                 [
                     ('ct', col_trans),
                     ('dt', dt(min_samples_leaf = 10,
                                max_depth = 4
                               )
                     )
                 ]
               )
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
np.sum(pred == y_test)/len(y_test)    # 67%

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['account_balance',
                                                   'installment_percent',
                                                   'sex_and_marital_status',
                                                   'guarantors',
 

0.6733333333333333

In [None]:
############## Done now ####################