# Base Crédito - Algoritmo Árvores de Decisão

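O objetivo deste estudo é prever se o cliente pagará ou não o empréstimo (coluna `default`), usando como previsores a renda (`income`), a idade (`age`) e o valor do empréstimo (`loan`).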

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the credit dataset. The default is the original author's local
# path; set the CREDIT_DATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
CREDIT_DATA_CSV = os.environ.get(
    "CREDIT_DATA_CSV",
    r"C:\Users\herna\OneDrive\Machine Learning - A a Z\Secao 3 - Pre-processamento com Pandas e scikit-learm\credit_data.csv",
)
base = pd.read_csv(CREDIT_DATA_CSV)

In [3]:
# List the column labels of the loaded frame.
base.columns

Index(['clientid', 'income', 'age', 'loan', 'default'], dtype='object')

In [4]:
# Preview the first rows of the dataset.
base.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
# Summary statistics per column. In the saved output, `age` has a count of
# 1997 (the other columns have 2000) and a negative minimum, i.e. the column
# contains both missing and invalid values.
base.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [6]:
# Data type of each column.
base.dtypes

clientid      int64
income      float64
age         float64
loan        float64
default       int64
dtype: object

## Valores inconsistentes

In [7]:
# Rows whose age is negative (invalid values).
base[base['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [8]:
# Ways to deal with the negative ages:
#
#   1. Drop the whole column:
#        base.drop('age', 1, inplace=True)
#   2. Drop only the affected rows:
#        base.drop(base[base.age < 0].index, inplace=True)
#   3. Fill them with the mean of the remaining ages.
#
# Mean age computed over the rows whose age is positive.
base.loc[base['age'] > 0, 'age'].mean()

40.92770044906149

In [9]:
# Replace the negative (invalid) ages with the mean of the positive ages.
# The mean is computed from the data rather than typed in as a truncated
# constant (40.92), so it stays exact and follows the dataset if it changes.
base.loc[base['age'] < 0, 'age'] = base.loc[base['age'] > 0, 'age'].mean()

In [10]:
# Sanity check: list any rows that still have a negative age.
base[base['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


## Valores faltantes

In [11]:
# Boolean mask: True where `age` is missing.
base['age'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
1995    False
1996    False
1997    False
1998    False
1999    False
Name: age, Length: 2000, dtype: bool

In [12]:
# Rows whose `age` is missing.
base[base['age'].isna()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [13]:
# Feature matrix: columns at positions 1-3 (income, age, loan according to
# the `base.columns` listing), as a NumPy array.
previsores = base.iloc[:, 1:4].to_numpy()

In [14]:
# Target vector: column at position 4 (`default` according to the
# `base.columns` listing), as a NumPy array.
classe = base.iloc[:, 4].to_numpy()

In [15]:
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries with the mean of their column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)


In [16]:
# Learn the column means and fill the missing values in a single step,
# writing the result back into the feature matrix in place.
# NOTE(review): the imputer is fitted on every row, before the train/test
# split, so test rows contribute to the imputed mean.
previsores[:, 0:3] = imputer.fit_transform(previsores[:, 0:3])

## Escalonamento de atributos

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Scaler used to standardize the feature columns.
scaler = StandardScaler()

In [19]:
# Fit the scaler on the feature matrix and replace it with its scaled version.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split, so test rows influence the scaling statistics.
previsores = scaler.fit(previsores).transform(previsores)

## Divisão base de treinamento e teste

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
(
    previsores_treinamento,
    previsores_teste,
    classe_treinamento,
    classe_teste,
) = train_test_split(
    previsores,
    classe,
    test_size=0.25,
    random_state=0,
)

## Algoritmo Árvores de Decisão

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. random_state is fixed (same seed as the
# train/test split) because scikit-learn randomly permutes the features at
# each split, so an unseeded tree can differ from one run to the next.
classificador = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Train the tree on the training partition.
classificador.fit(previsores_treinamento, classe_treinamento)

DecisionTreeClassifier(criterion='entropy')

In [23]:
# Predict the class of every row in the held-out test partition.
previsoes = classificador.predict(previsores_teste)

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Fraction of test rows predicted correctly. Despite the variable name
# (`precisao`), this is the accuracy score, not the precision metric.
precisao = accuracy_score(y_true=classe_teste, y_pred=previsoes)

In [25]:
# Display the accuracy obtained on the test partition.
precisao

0.982

In [26]:
# Confusion matrix of the true test labels against the predicted labels.
matriz = confusion_matrix(y_true=classe_teste, y_pred=previsoes)

In [27]:
# Display the confusion matrix.
# NOTE(review): per scikit-learn's documented convention, rows should be the
# true classes and columns the predicted classes - confirm against the docs.
matriz

array([[430,   6],
       [  3,  61]], dtype=int64)