# 6. Decision Trees and Ensemble Learning

This week, we'll talk about decision trees and tree-based ensemble algorithms.

## 6.1 Credit risk scoring project

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

The data is taken from [GitHub](https://github.com/gastonstat/CreditScoring)

## 6.2 Data cleaning and preparation
* Downloading the dataset
* Re-encoding the categorical variables
* Doing the train/validation/test split


In [2]:
# Location of the raw credit-scoring CSV (fetched in the next cell).
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/'
    'master/chapter-06-trees/CreditScoring.csv'
)

In [3]:
# Download the CSV next to the notebook. IPython expands `$data` from the Python
# variable defined above; -q silences wget and -O sets the output file name.
!wget -q $data -O CreditScoring.csv

In [4]:
# Show the first lines of the raw file to check the header and the numeric coding.
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


In [5]:
# Load the downloaded file into a DataFrame.
df = pd.read_csv("CreditScoring.csv")

In [6]:
# Normalise the header: the raw file uses capitalised names ("Status", "Seniority", ...).
df.columns = [name.lower() for name in df.columns]

In [7]:
# Preview the frame with the lower-cased column names.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [8]:
# Decode the numeric `status` codes into readable labels.
# Note: Series.map yields NaN for any value that is not a key of the dict, so
# re-running this cell on the already-decoded column would blank it out.
status_values = {0: 'unk', 1: 'ok', 2: 'default'}

df['status'] = df['status'].map(status_values)

In [9]:
# Check that `status` now holds the decoded labels.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [10]:
# Labels listed in code order: position i is the label for code i (0 -> 'unk').
home_values = dict(enumerate([
    'unk',
    'rent',
    'owner',
    'private',
    'ignore',
    'parents',
    'other',
]))

# Values without a matching key become NaN, so run this cell once per load.
df['home'] = df['home'].map(home_values)

In [11]:
# Labels listed in code order: position i is the label for code i (0 -> 'unk').
marital_values = dict(enumerate([
    'unk',
    'single',
    'married',
    'widow',
    'separated',
    'divorced',
]))

# Values without a matching key become NaN, so run this cell once per load.
df['marital'] = df['marital'].map(marital_values)

In [12]:
# Decode the `records` codes: 1 -> 'no', 2 -> 'yes', 0 -> 'unk'.
records_values = {0: 'unk', 1: 'no', 2: 'yes'}

# Values without a matching key become NaN, so run this cell once per load.
df['records'] = df['records'].map(records_values)

In [13]:

# Labels listed in code order: position i is the label for code i (0 -> 'unk').
job_values = dict(enumerate([
    'unk',
    'fixed',
    'partime',
    'freelance',
    'others',
]))

# Values without a matching key become NaN, so run this cell once per load.
df['job'] = df['job'].map(job_values)

In [14]:

# Preview the frame after decoding status, home, marital, records and job.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [18]:
# Summary statistics of the numeric columns. Watch the `max` row: income, assets
# and debt top out at 99999999, which inflates their mean and std.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [20]:
# 99999999 is used as the "missing" placeholder in these three columns;
# turn it into NaN so it stops distorting the statistics.
missing_code = 99999999
money_cols = ['income', 'assets', 'debt']
df[money_cols] = df[money_cols].replace(missing_code, np.nan)

In [22]:
# Re-check the statistics: counts for income/assets/debt drop (NaNs are excluded)
# and their maxima are no longer the 99999999 placeholder.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [26]:
# Keep only rows whose status is known. drop=True discards the old row labels
# instead of storing them in an `index` column, which would otherwise end up as
# a model feature.
# NOTE(review): the saved outputs further down still show an `index` column and
# an `index` feature, so they predate drop=True -- restart and re-run the notebook.
known_status = df.status != 'unk'
df = df[known_status].reset_index(drop=True)

In [27]:
# Display the cleaned frame (Jupyter truncates it to the first and last rows).
df

Unnamed: 0,index,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,0,ok,9,rent,60,30,married,no,freelance,73,129.0,0.0,0.0,800,846
1,1,ok,17,rent,60,58,widow,no,fixed,48,131.0,0.0,0.0,1000,1658
2,2,default,10,owner,36,46,married,yes,freelance,90,200.0,3000.0,0.0,2000,2985
3,3,ok,0,rent,60,24,single,no,fixed,63,182.0,2500.0,0.0,900,1325
4,4,ok,0,rent,36,26,single,no,fixed,46,107.0,0.0,0.0,310,910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4449,4450,default,1,rent,60,39,married,no,fixed,69,92.0,0.0,0.0,900,1020
4450,4451,ok,22,owner,60,46,married,no,fixed,60,75.0,3000.0,600.0,950,1263
4451,4452,default,0,owner,24,37,married,no,partime,60,90.0,3500.0,0.0,500,963
4452,4453,ok,0,rent,48,23,single,no,freelance,49,140.0,0.0,0.0,550,550


In [37]:
from sklearn.model_selection import train_test_split

# Two-stage split with one fixed seed: hold out 20% as the test set, then take
# 20% of the remainder as validation (roughly 64% train / 16% val / 20% test).
# NOTE(review): use test_size=0.25 in the second call if 60/20/20 is intended.
split_seed = 11
df_full_train, df_test = train_test_split(
    df, test_size=0.2, random_state=split_seed
)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.2, random_state=split_seed
)


In [38]:
# Give every split a fresh 0..n-1 index; the shuffled original labels are dropped.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [39]:
# Binary target per split: 1 where status is 'default', 0 otherwise.
y_train, y_val, y_test = (
    (part.status == 'default').astype('int').values
    for part in (df_train, df_val, df_test)
)

In [40]:
# Remove the target from the feature frames so it cannot be used as an input.
for part in (df_train, df_val, df_test):
    del part['status']

In [41]:
# Inspect the training features (no `status` column any more).
df_train

Unnamed: 0,index,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,3541,15,owner,60,34,married,no,fixed,45,82.0,3500.0,0.0,750,1624
1,2616,7,parents,60,30,single,no,fixed,35,95.0,0.0,0.0,900,1158
2,454,10,owner,36,47,married,no,fixed,60,133.0,3000.0,0.0,360,360
3,2070,5,owner,48,39,married,yes,freelance,45,100.0,30000.0,0.0,1550,2294
4,4182,14,owner,36,40,married,no,fixed,45,80.0,3000.0,0.0,900,1263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2845,2382,18,private,36,45,married,no,fixed,45,220.0,20000.0,0.0,800,1600
2846,1784,7,private,60,29,married,no,fixed,60,51.0,3500.0,500.0,1000,1290
2847,808,1,parents,24,19,single,no,fixed,35,28.0,0.0,0.0,400,600
2848,1857,15,owner,48,43,married,no,freelance,60,100.0,18000.0,0.0,2500,2976


## 6.3 Decision trees
* What a decision tree looks like
* Training a decision tree
* Overfitting
* Controlling the size of a tree

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [43]:
# One dict per row, the input format DictVectorizer expects.
# NaNs in income/assets/debt are carried through unchanged.
# NOTE(review): DecisionTreeClassifier accepts NaN inputs only in recent
# scikit-learn releases (1.3+, confirm); on older versions apply .fillna(0)
# here and to the validation frame as well.
train_dict = df_train.to_dict(orient='records')

In [46]:
# Learn the feature vocabulary from the training rows, then encode them as a
# dense matrix (string values are one-hot encoded, numbers pass through).
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

In [47]:
# Column names of X_train, in column order.
dv.get_feature_names_out()

array(['age', 'amount', 'assets', 'debt', 'expenses', 'home=ignore',
       'home=other', 'home=owner', 'home=parents', 'home=private',
       'home=rent', 'home=unk', 'income', 'index', 'job=fixed',
       'job=freelance', 'job=others', 'job=partime', 'job=unk',
       'marital=divorced', 'marital=married', 'marital=separated',
       'marital=single', 'marital=unk', 'marital=widow', 'price',
       'records=no', 'records=yes', 'seniority', 'time'], dtype=object)

In [48]:
# Baseline: a tree with no depth limit.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so tied splits could otherwise be resolved differently from run
# to run; 11 matches the seed used for the train/val/test split.
dt = DecisionTreeClassifier(random_state=11)
dt.fit(X_train, y_train)

In [50]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, so the columns line up with X_train).
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [54]:
# Column 1 of predict_proba is the probability of class 1, i.e. 'default'.
y_pred = dt.predict_proba(X_val)[:,1]

In [55]:
# Validation AUC of the unrestricted tree.
roc_auc_score(y_val, y_pred)

np.float64(0.625699530516432)

In [58]:
# Training AUC of the same tree. Note that y_pred is overwritten here and no
# longer holds the validation predictions. An almost perfect score on train
# next to a much lower one on validation is the signature of overfitting.
y_pred=dt.predict_proba(X_train)[:,1]
roc_auc_score(y_train, y_pred)

np.float64(0.9997582205029014)

#### Overfitting: the model memorizes the training data but fails to generalize

_Solution_: Need to restrict the __depth__ of the tree

In [69]:
# Limit the tree to three levels of splits to curb overfitting.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so tied splits could otherwise be resolved differently from run
# to run; 11 matches the seed used for the train/val/test split.
dt = DecisionTreeClassifier(max_depth=3, random_state=11)
dt.fit(X_train, y_train)

In [70]:
# Training AUC of the depth-limited tree.
y_pred=dt.predict_proba(X_train)[:,1]
roc_auc_score(y_train, y_pred)

np.float64(0.7700033886231307)

In [71]:
# Validation AUC of the depth-limited tree; compare it with the training AUC
# above to judge how well the model generalizes.
y_pred = dt.predict_proba(X_val)[:,1]
roc_auc_score(y_val, y_pred)

np.float64(0.7477511737089203)

In [72]:
# A tree with a single condition (one split) is called a 'decision stump'.

In [73]:
from sklearn.tree import export_text

In [75]:
# Render the fitted tree as indented if/else rules, labelled with the
# vectorizer's feature names instead of column numbers.
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- records=yes <= 0.50
|   |--- job=partime <= 0.50
|   |   |--- income <= 74.50
|   |   |   |--- class: 0
|   |   |--- income >  74.50
|   |   |   |--- class: 0
|   |--- job=partime >  0.50
|   |   |--- assets <= 8750.00
|   |   |   |--- class: 1
|   |   |--- assets >  8750.00
|   |   |   |--- class: 0
|--- records=yes >  0.50
|   |--- seniority <= 6.50
|   |   |--- seniority <= 1.50
|   |   |   |--- class: 1
|   |   |--- seniority >  1.50
|   |   |   |--- class: 1
|   |--- seniority >  6.50
|   |   |--- income <= 103.50
|   |   |   |--- class: 1
|   |   |--- income >  103.50
|   |   |   |--- class: 0

