# Machine Learning - Arvore de Decisão

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree

#### Carregando a base de dados bank-numeric

In [2]:
bank = pd.read_csv('datasets/bank-numeric.csv')
bank.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,recent_pdays,deposit_cat,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,0.0001,1,...,0,1,0,0,0,1,0,0,0,1


In [4]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  11162 non-null  int64  
 1   balance              11162 non-null  int64  
 2   duration             11162 non-null  int64  
 3   campaign             11162 non-null  int64  
 4   previous             11162 non-null  int64  
 5   default_cat          11162 non-null  int64  
 6   housing_cat          11162 non-null  int64  
 7   loan_cat             11162 non-null  int64  
 8   recent_pdays         11162 non-null  float64
 9   deposit_cat          11162 non-null  int64  
 10  job_blue-collar      11162 non-null  int64  
 11  job_entrepreneur     11162 non-null  int64  
 12  job_other            11162 non-null  int64  
 13  job_pink-collar      11162 non-null  int64  
 14  job_self-employed    11162 non-null  int64  
 15  job_technician       11162 non-null 

#### Separando as features das classes

In [3]:
bank_data = bank.drop('deposit_cat', 1)
bank_target = bank.deposit_cat

#### Dividindo os dados em treino e teste

In [4]:
X_train, X_test, y_train, y_test = train_test_split(bank_data,bank_target,test_size=0.3)

#### Função para treinar o modelo de arvore de decisão com o parametro max_depth

O parametro controla a profundidade da arvore, portanto o seu tamanho. Uma arvore grande é mais complexa do que as menores.


In [5]:
def compara_modelos(maxdepth):
    if maxdepth == 0:
        dt = tree.DecisionTreeClassifier(random_state=1)
    else:   
        dt = tree.DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    dt.fit(X_train, y_train)
    train_score = dt.score(X_train, y_train)
    test_score = dt.score(X_test, y_test)
    return train_score,test_score

compara_modelos(5)#profundidade maxima igual a 2.

(0.7939331882759504, 0.787996416840848)

In [6]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos(2))))
print('{:1}         {} '.format(3,str(compara_modelos(3))))
print('{:1}         {} '.format(4,str(compara_modelos(4))))
print('{:1}         {} '.format(10,str(compara_modelos(10))))
print('{:1}         {} '.format(15,str(compara_modelos(15))))
print('{:1}         {} '.format('Full',str(compara_modelos(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.7496480225265583, 0.7438041206330248) 
3         (0.7638551132727506, 0.7587339504329651) 
4         (0.7853577371048253, 0.7748581666169005) 
10         (0.8629207730705235, 0.7811286951328755) 
15         (0.9393318827595034, 0.751269035532995) 
Full         (1.0, 0.722006569125112) 


#### Verificando as features mais importantes para o modelo de arvore de decisão treinado

##### Treinando o modelo utilizando o valor de max_depth igual a 4

In [7]:
dt = tree.DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

##### Listando as features e sua importância para o modelo

In [11]:
bank.columns

Index(['age', 'balance', 'duration', 'campaign', 'previous', 'default_cat',
       'housing_cat', 'loan_cat', 'recent_pdays', 'deposit_cat',
       'job_blue-collar', 'job_entrepreneur', 'job_other', 'job_pink-collar',
       'job_self-employed', 'job_technician', 'job_white-collar',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'poutcome_failure', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [28]:
fi = dt.feature_importances_

l = len(bank.columns)
for i in range(0,len(bank.columns)-1):
    print('{:.<20} {:3}'.format(features[i],fi[i]))

0.0................. 0.0
0.0010325438679464653 0.0010325438679464653
0.645041896223182... 0.645041896223182
0.0................. 0.0
0.0020509159323967335 0.0020509159323967335
0.0................. 0.0
0.09507704486476527. 0.09507704486476527
0.0................. 0.0
0.08338760819193822. 0.08338760819193822
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.0................. 0.0
0.17340999091977133. 0.17340999091977133
0.0................. 0.0
