## 1. Dividindo os dados entre treino e teste

In [1]:
import pandas as pd

In [2]:
# Load the already preprocessed train/test splits straight from the course
# repository. The URLs are split over two literals only to keep lines short.
treino = pd.read_csv(
    'https://raw.githubusercontent.com/cassiasamp/calculadora-de-imoveis-jun-20/'
    'master/analise_e_preprocessamento/treino_preprocessado.csv'
)
teste = pd.read_csv(
    'https://raw.githubusercontent.com/cassiasamp/calculadora-de-imoveis-jun-20/'
    'master/analise_e_preprocessamento/teste_preprocessado.csv'
)

In [3]:
# Peek at the first five rows of the training frame.
treino.head()

Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area,preco
0,0,1,0,0,0.693147,3.713572,6.621406
1,1,0,0,0,1.098612,5.198497,7.601402
2,0,0,0,1,1.386294,6.216606,8.699681
3,0,1,0,0,1.098612,4.26268,7.09091
4,1,0,0,0,1.098612,4.189655,7.313887


In [None]:
# Peek at the first five rows of the test frame.
teste.head()

Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area,preco
0,0,1,0,0,1.098612,6.111467,9.903538
1,1,0,0,0,1.098612,5.525453,8.853808
2,0,0,1,0,1.609438,5.802118,9.852247
3,0,0,0,1,1.386294,5.888878,9.350189
4,0,1,0,0,1.386294,4.394449,7.003974


In [14]:
# Feature matrix: every training column except the target 'preco'.
X_treino = treino.drop(columns='preco')

In [None]:
# Confirm that 'preco' is no longer among the feature columns.
X_treino.head()

Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area
0,0,1,0,0,0.693147,3.713572
1,1,0,0,0,1.098612,5.198497
2,0,0,0,1,1.386294,6.216606
3,0,1,0,0,1.098612,4.26268
4,1,0,0,0,1.098612,4.189655


In [4]:
# Target vector: the 'preco' column of the training frame.
y_treino = treino['preco']

In [None]:
# Quick look at the first target values.
y_treino.head()

0    6.621406
1    7.601402
2    8.699681
3    7.090910
4    7.313887
Name: preco, dtype: float64

In [5]:
# Split the test frame the same way as the training frame: target vs. features.
y_teste = teste['preco']
X_teste = teste.drop(columns='preco')

## 2. Criando um modelo de regressão linear

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Linear regression with scikit-learn's default settings.
reg = LinearRegression()

In [15]:
# Fit the linear model on the training features and target.
reg.fit(X_treino, y_treino)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
import numpy as np

# Example listing to price: 3 bedrooms and an area of 100.
n_quartos = 3
area = 100

# Single-row input in the training column order. The numeric features go
# through log1p, matching the scale of the preprocessed training data.
log1_pred = reg.predict([
    [
        0,                    # zona_leste
        1,                    # zona_norte
        0,                    # zona_oeste
        0,                    # zona_sul
        np.log1p(n_quartos),  # quartos
        np.log1p(area),       # area
    ]
])

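Estamos lidando com um modelo log-log: as variáveis numéricas de entrada (quartos e área) e o preço foram transformados com log(1 + x) (`np.log1p`). Por isso a predição sai na escala logarítmica e precisa ser revertida com exp(x) − 1 (`np.expm1`) para voltar à escala original do preço.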

In [17]:
# Undo the log1p transform so the prediction is back on the original price scale.
np.expm1(log1_pred)

array([2016.01706802])

In [18]:
# Predict the whole test set; the outputs are on the same log1p scale as the training target.
log1_predicoes = reg.predict(X_teste)

In [19]:
# Convert the predictions back from the log1p scale with expm1.
predicoes = np.expm1(log1_predicoes)

In [None]:
# Features of the first test rows, to compare against the predictions.
X_teste.head()

Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area
0,0,1,0,0,1.098612,6.111467
1,1,0,0,0,1.098612,5.525453
2,0,0,1,0,1.609438,5.802118
3,0,0,0,1,1.386294,5.888878
4,0,1,0,0,1.386294,4.394449


In [12]:
# True targets of the first test rows.
y_teste.head()

0    9.903538
1    8.853808
2    9.852247
3    9.350189
4    7.003974
Name: preco, dtype: float64

In [20]:
# Re-apply log1p to the first prediction so it is comparable to y_teste above.
np.log1p(predicoes[0])

8.59375

In [None]:
# Rough gap, on the log scale, for the first test row: the true value (~9.9,
# see y_teste.head()) minus the prediction (8.59375 above, typed here as 8.5).
9.9 - 8.5

1.4000000000000004

In [21]:
reg.score(X_teste, y_teste) # R² (coefficient of determination) of the linear regression

0.7460404920512879

## 3. Criando um modelo de base (baseline, benchmark)

In [22]:
# baseline model that always predicts the mean
from sklearn.dummy import DummyRegressor

In [23]:
# "Dumb" baseline regressor configured with the 'mean' strategy.
regressor_boboca = DummyRegressor(strategy='mean')

In [24]:
# Fit the baseline on the same training data as the real models.
regressor_boboca.fit(X_treino, y_treino)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [25]:
regressor_boboca.score(X_teste, y_teste) # scoring takes both the test X and the test y

-0.007175323673028533

In [26]:
pred_boboca = regressor_boboca.predict(X_teste) # predicting only takes the test X

Quando a gente usa o R² de dentro do pacote de métricas do sklearn (ou qualquer outra métrica do pacote metrics) passamos o y_true (y_teste) e os valores preditos (y_pred).

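O método `.score` de um regressor calcula esse mesmo R² do pacote de métricas; por isso o valor abaixo é igual ao obtido com `regressor_boboca.score`.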

In [27]:
from sklearn.metrics import r2_score

# R² computed from the true targets and the baseline predictions; it matches
# the value returned by regressor_boboca.score above.
r2_score(y_true=y_teste, y_pred=pred_boboca)

-0.007175323673028533

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed. fit() returns
# the estimator itself, so construction and training are chained.
rfn = RandomForestRegressor(random_state=100).fit(X_treino, y_treino)

# R² on the held-out test set (shown as the cell output).
rfn.score(X_teste, y_teste)

0.7435630181216769

In [29]:
# Display the estimator to inspect the hyperparameters it is using.
rfn

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=100, verbose=0, warm_start=False)

In [None]:
# 42, 1, 100, 0 ... (covered in the next lesson)
from sklearn.model_selection import GridSearchCV

# Search space: every combination of the values below is evaluated
# (4 x 5 x 4 = 80 candidates).
parametros = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5, 10, 15, 30],
    'random_state': [20, 50, 42, 100],
}

# Instantiate the grid search around rfn and fit it on the training data.
grid = GridSearchCV(estimator=rfn, param_grid=parametros, verbose=2).fit(X_treino, y_treino)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.5min finished


In [None]:
# Estimator carrying the best parameter combination found by the search.
grid.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=50, verbose=0, warm_start=False)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'max_depth': 5, 'n_estimators': 150, 'random_state': 50}

In [None]:
# Mean cross-validated score of the best combination (from the training folds, not the test set).
grid.best_score_

0.7308914131005916

In [None]:
# Per-candidate cross-validation results as a DataFrame, for easier inspection.
resultados_cv = pd.DataFrame(grid.cv_results_)

In [None]:
# First candidates, with timing, per-split scores, mean/std and rank.
resultados_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067104,0.003118,0.004874,8.5e-05,3,50,20,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.795232,0.784045,0.566785,0.670506,0.735295,0.710373,0.08421,32
1,0.067978,0.004434,0.005036,0.00032,3,50,50,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.792607,0.778388,0.577368,0.674995,0.735924,0.711856,0.078705,30
2,0.070554,0.0057,0.004989,0.000102,3,50,42,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.798134,0.785751,0.569082,0.68158,0.730672,0.713044,0.083129,26
3,0.067914,0.002493,0.005062,0.000213,3,50,100,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.799395,0.784328,0.561778,0.673394,0.738331,0.711445,0.086739,31
4,0.139043,0.008863,0.00845,0.000655,3,100,20,"{'max_depth': 3, 'n_estimators': 100, 'random_...",0.796708,0.787608,0.57038,0.674642,0.737045,0.713277,0.083622,23


## Versão alternativa do grid search

In [62]:
# Unfitted forest with default settings; it is the base estimator handed to the grid search below.
rf = RandomForestRegressor()

In [63]:
# 42, 1, 100, 0 ... (covered in the next lesson)
from sklearn.model_selection import GridSearchCV

# Smaller search space than the first attempt: 4 x 3 x 5 = 60 candidates.
parametros = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [5, 10, 20],
    'random_state': [5, 10, 42, 100, 200],
}

# Only build the search object here; fitting happens in the next cell.
rfr_tunado = GridSearchCV(estimator=rf, param_grid=parametros, verbose=1)

In [65]:
# Run the search on the training data (60 candidates x 5 folds = 300 fits).
rfr_tunado.fit(X_treino, y_treino)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   23.8s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [66]:
# Parameter combination that achieved the best cross-validated score.
rfr_tunado.best_params_

{'max_depth': 5, 'n_estimators': 100, 'random_state': 200}

In [67]:
# Mean cross-validated score of the best combination (from the training folds, not the test set).
rfr_tunado.best_score_

0.7314710175481862

## 4. Salvando o modelo após a busca de hiperparâmetros (exportando o modelo)

In [54]:
import pickle

In [55]:
# Persist the fitted model selected by the grid search. When the notebook runs
# top to bottom, `rf` is only the unfitted template handed to GridSearchCV
# (which fits clones of it), so pickling `rf` would store a model that cannot
# predict. The refitted winner lives in rfr_tunado.best_estimator_.
# The context manager guarantees the file handle is closed after writing.
with open('modelo.pkl', 'wb') as arquivo_modelo:  # 'wb' = write binary
    pickle.dump(rfr_tunado.best_estimator_, arquivo_modelo)

In [56]:
# Reload the exported model from disk. The context manager closes the file
# handle, which the bare open(...) call left dangling.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('modelo.pkl', 'rb') as arquivo_modelo:  # 'rb' = read binary
    rf = pickle.load(arquivo_modelo)

In [57]:
# First test row as a plain NumPy array of feature values.
primeiro_el_teste = X_teste.values[0]

In [58]:
# The single row is wrapped in a list so predict receives a 2-D input.
rf.predict([primeiro_el_teste])

array([9.00642947])

In [59]:
# True target for that same first test row.
el_teste_verdadeiro = y_teste.values[0]

In [60]:
# Show the true value to compare with the prediction above.
el_teste_verdadeiro

9.90353755128617

In [61]:
from sklearn.ensemble import RandomForestRegressor

# Rebuild the forest with the values reported by rfr_tunado.best_params_ and
# train it on the full training set. fit() returns the estimator itself.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=200,
).fit(X_treino, y_treino)

# R² on the held-out test set (shown as the cell output).
rf.score(X_teste, y_teste)

0.7693024131906113

In [None]:
# restart the runtime and check whether the scores stay low or improve
# try other hyperparameters
# look at the tree plots to understand what may have happened with the decision