In [3]:
import os
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Carregando a base de dados

In [4]:
data = pd.read_csv('loan.csv')

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
data.Loan_Status.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [5]:
data2 = data[data.Loan_Status=='Y'].sample(200)


In [6]:
data = data2.append(data[data.Loan_Status=='N'].sample(192))

In [7]:
data.Loan_Status.value_counts()

Y    200
N    192
Name: Loan_Status, dtype: int64

## Checando Missing Values

In [11]:
data2.isnull().sum()

Loan_ID               0
Gender                3
Married               3
Dependents            6
Education             0
Self_Employed        10
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       18
Property_Area         0
Loan_Status           0
dtype: int64

Preechendo Missing Values:

- `Dependents`: Assumindo o valor majoritário da coluna.
- `Self_Employed`: Assumindo o valor majoritário da coluna.
- `Loan_Amount_Term`: Preenchendo com o valor médio da coluna.
- `Credit_History`: Assumindo o valor majoritário da coluna.
- `Married`: Assumindo o valor majoritário da coluna.
- `Gender`: Assumindo o valor majoritário da coluna.

In [14]:
data.Gender.value_counts()

Male      320
Female     72
Name: Gender, dtype: int64

In [8]:
data['Gender'] = data['Gender'].fillna('Male')

In [9]:
data['Married'] = data['Married'].fillna('No')

In [10]:
data['Dependents'] = data['Dependents'].fillna('0')

In [11]:
data['Self_Employed'] = data['Self_Employed'].fillna('No')

In [12]:
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())

In [13]:
data['Credit_History'] = data['Credit_History'].fillna(1.0)

In [14]:
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean())

In [15]:
data.Credit_History.value_counts()

1.0    307
0.0     85
Name: Credit_History, dtype: int64

### Checando novamente Missing Values

In [16]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

**Transformando dados categóricos**

Várias colunas do dataframe são categóricas, precisamos transforma-las, são elas: `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns.

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
loan_values = {'Y':1,'N':0}
data.replace({'Gender': gender_values,
                 'Married': married_values, 
                 'Education': education_values,
                 'Self_Employed': employed_values, 
                 'Dependents': dependent_values,
                 'Loan_Status': loan_values
                }, inplace=True)

In [19]:
data.drop(['Loan_ID','CoapplicantIncome','Loan_Amount_Term','Credit_History','Property_Area'],axis=1,inplace=True)

In [20]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Status
390,1,0,3,0,0,9167,185.0,1
301,1,1,0,1,0,2875,105.0,1
223,1,1,0,0,0,7578,175.0,1
374,0,0,0,0,0,2764,110.0,1
347,1,1,2,1,0,3083,126.0,1


Selecionando o melhor classificador através de Pipeline e GridSearchCV

In [57]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump, load

In [22]:
pipe_random_forest = Pipeline([
        ('scl', StandardScaler()),
        
        ('clf', RandomForestClassifier())
        ]
)

In [23]:
pipe_svm = Pipeline([
        ('scl', StandardScaler()),
        ('clf', svm.SVC())
        ]
)

In [24]:
pipe_knn = Pipeline([
        ('scl', StandardScaler()),
        ('clf', KNeighborsClassifier())
        ]
)

#### Valores para Grid

In [25]:
valores = [1, 3, 5, 7, 9, 10]

In [26]:
grid_params_rf = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': valores,
    'clf__max_depth': valores,
    'clf__min_samples_split': valores[1:]
}]

In [27]:
grid_params_svm = [{
    'clf__kernel': ['linear', 'rbf'], 
    'clf__C': valores
}]

In [28]:
grid_params_knn = [{
    'clf__n_neighbors': valores,
}]

#### Construindo GridSearch

In [29]:
gs_rf = GridSearchCV(
    estimator=pipe_random_forest,
    param_grid=grid_params_rf,
    scoring='accuracy',
    cv=10 
)

In [30]:
gs_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid=grid_params_svm,
    scoring='accuracy',
    cv=10,
)

In [31]:
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=grid_params_knn,
    scoring='accuracy',
    cv=10,
)

In [32]:
X_train = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

#### Computando o GridSearch para Random Forest

In [33]:
gs_rf.fit(X_train,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                

#### Melhores parametros e scoring

In [34]:
print('Melhores parâmetros: %s' % gs_rf.best_params_)
print('Melhores Acurácia: %.3f' % gs_rf.best_score_)

Melhores parâmetros: {'clf__criterion': 'entropy', 'clf__max_depth': 7, 'clf__min_samples_leaf': 7, 'clf__min_samples_split': 5}
Melhores Acurácia: 0.536


#### Computando o GridSearch para SVM

In [35]:
gs_svm.fit(X_train,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, 

#### Melhores parametros e scoring

In [36]:
print('Melhores parâmetros: %s' % gs_svm.best_params_)
print('Melhores Acurácia: %.3f' % gs_svm.best_score_)

Melhores parâmetros: {'clf__C': 9, 'clf__kernel': 'rbf'}
Melhores Acurácia: 0.503


#### Computando o GridSearch para KNN

In [37]:
gs_knn.fit(X_train,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             

#### Melhores parametros e scoring

In [38]:
print('Melhores parâmetros: %s' % gs_knn.best_params_)
print('Melhores Acurácia: %.3f' % gs_knn.best_score_)

Melhores parâmetros: {'clf__n_neighbors': 9}
Melhores Acurácia: 0.510


### Métricas de Validação ###

In [39]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [40]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X_train, y)

In [41]:
print (pd.crosstab(y_teste, gs_rf.predict(X_teste), rownames=['Real'], colnames=['Predito'], margins=True), '')

Predito   0   1  All
Real                
0        32  18   50
1        10  38   48
All      42  56   98 


In [42]:
print (metrics.classification_report(y_teste,gs_rf.predict(X_teste)))

              precision    recall  f1-score   support

           0       0.76      0.64      0.70        50
           1       0.68      0.79      0.73        48

    accuracy                           0.71        98
   macro avg       0.72      0.72      0.71        98
weighted avg       0.72      0.71      0.71        98



### Persistindo o modelo de Machine Learning para o disco. ###

In [43]:
from sklearn.externals import joblib

#### Persistindo o melhor modelo em disco.

In [61]:
#dump(gs_rf, 'model.pkl')
joblib.dump(gs_rf, 'model.pkl')

['model.pkl']

#### Listando os arquivos em disco.

In [62]:
!dir

 O volume na unidade E ‚ LEOCADIO
 O N£mero de S‚rie do Volume ‚ 42B9-CE65

 Pasta de E:\Datazero\MachineLearning\Em_producao\notebook-dataset

17/04/2020  15:55    <DIR>          .
17/04/2020  15:55    <DIR>          ..
02/05/2019  14:35    <DIR>          .ipynb_checkpoints
28/02/2019  13:38            37.397 loan.csv
22/04/2020  18:31             9.782 persistencia-objetos-disco.ipynb
23/04/2020  18:38            47.261 persistindo-modelo-machine-learning-disco.ipynb
22/04/2020  18:16                19 lista.pkl
22/04/2020  18:22               168 array.npy
22/04/2020  18:25               144 lista2.npy
22/04/2020  18:27               233 lista3.joblib
22/04/2020  18:28               210 lista3.gz
23/04/2020  19:05           349.788 model.pkl
               9 arquivo(s)        445.002 bytes
               3 pasta(s)    8.197.292.032 bytes dispon¡veis


#### Carregando o modelo a partir do disco para a memória.

In [59]:
model = load('model.pkl')
#model = joblib.load('model.pkl')

In [60]:
model.estimator

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [50]:
print("Atributos do Modelo:\n\nClasses:{}\nEstimator:{}".format(model.classes_,model.estimator))

Atributos do Modelo:

Classes:[0 1]
Estimator:Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
  

**Verificando o Dataset final gerado.**

In [51]:
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount
390,1,0,3,0,0,9167,185.0
301,1,1,0,1,0,2875,105.0
223,1,1,0,0,0,7578,175.0
374,0,0,0,0,0,2764,110.0
347,1,1,2,1,0,3083,126.0


**Teste de Classificação.**

In [53]:
teste = np.array([[1,1,3,0,0,9504,275.0],[0,0,6,1,1,13000,30.0]])
teste

array([[1.000e+00, 1.000e+00, 3.000e+00, 0.000e+00, 0.000e+00, 9.504e+03,
        2.750e+02],
       [0.000e+00, 0.000e+00, 6.000e+00, 1.000e+00, 1.000e+00, 1.300e+04,
        3.000e+01]])

In [54]:
model.predict(teste)

array([1, 1], dtype=int64)

**Probabilidades de Classes.**

In [55]:
model.predict_proba(teste)

array([[0.48114523, 0.51885477],
       [0.45036428, 0.54963572]])