# Códigos para realizar regressão
Com diferentes opções de validação (Kfold, leave-one-out e holdout)
* Linear Regression
* Lasso
* Ridge
* Elastic Net
* Árvore de Decisão
* Random Forest
* MLP Regressor

In [1]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn import datasets



# Gerando base de dados (Boston housing prices)

In [2]:
boston = datasets.load_boston()

dados = pd.DataFrame(boston.data)
dados.columns = boston.feature_names
dados['PRICE'] = boston.target

X = dados.drop('PRICE',axis=1)
Y = dados['PRICE']

X.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


# Linear Regression

In [3]:
LM = linear_model.LinearRegression(fit_intercept=True, 
                                   normalize=False, 
                                   copy_X=True, 
                                   n_jobs=1)

LM.fit(X,Y)
y_pred = LM.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",LM.coef_)
print("Intercepto: {:.2f}.".format(LM.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [ -1.07170557e-01   4.63952195e-02   2.08602395e-02   2.68856140e+00
  -1.77957587e+01   3.80475246e+00   7.51061703e-04  -1.47575880e+00
   3.05655038e-01  -1.23293463e-02  -9.53463555e-01   9.39251272e-03
  -5.25466633e-01]
Intercepto: 36.49.
RMSE: 21.90.
MAE: 3.27.
R2: 0.74.


In [4]:
# K-folds / Leave one out

LM = linear_model.LinearRegression(fit_intercept=True, 
                                   normalize=False, 
                                   copy_X=True, 
                                   n_jobs=1)


# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = LM.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,3.42,2.21,23.37,9.3,0.74,0.73
1,3.39,2.9,22.88,14.17,0.75,0.47
2,3.43,2.79,23.22,14.11,0.74,-1.01
3,3.11,4.6,20.78,35.19,0.72,0.64
4,3.27,4.11,21.34,31.92,0.74,0.55
5,3.28,3.57,22.36,19.86,0.7,0.74
6,3.41,2.67,23.33,9.96,0.75,0.38
7,2.61,9.66,11.96,168.53,0.84,-0.13
8,3.19,5.06,21.59,33.62,0.74,-0.78
9,3.39,2.54,23.19,10.97,0.74,0.42


In [5]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 21.65
RMSE médio treino: 21.40
RMSE agregado teste: 57.35
RMSE médio teste: 34.76


In [6]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

LM.fit(X_train, y_train)
pred_train = LM.predict(X_train)
pred_test = LM.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 9.99
RMSE teste: 302.64
MAE treino: 2.44
MAE treino: 9.86
R2 treino: 0.86
R2 treino: -2.24


# Lasso

In [7]:
LASSO = linear_model.Lasso(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        precompute=False, 
                        copy_X=True, 
                        max_iter=1000, 
                        tol=0.0001, 
                        warm_start=False, 
                        positive=False, 
                        random_state=42, 
                        selection='cyclic')
LASSO.fit(X,Y)
y_pred= LASSO.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",LASSO.coef_)
print("Intercepto: {:.2f}.".format(LASSO.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-0.05579539  0.04896746 -0.          0.         -0.          0.32599066
  0.02788314 -0.5641117   0.25844651 -0.01505597 -0.70870374  0.00779028
 -0.80310256]
Intercepto: 44.50.
RMSE: 28.35.
MAE: 3.72.
R2: 0.66.


In [8]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = LASSO.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,3.83,2.87,29.75,12.81,0.67,0.63
1,3.8,3.45,29.61,19.83,0.67,0.26
2,3.87,2.33,29.73,11.24,0.67,-0.6
3,3.48,6.01,26.19,62.86,0.64,0.36
4,3.64,4.64,27.4,47.55,0.67,0.33
5,3.68,5.01,28.96,45.37,0.62,0.4
6,3.85,4.0,30.01,19.99,0.67,-0.25
7,3.11,6.74,18.53,95.28,0.75,0.36
8,3.71,3.46,27.85,22.1,0.66,-0.17
9,3.8,2.7,29.21,13.5,0.68,0.28


In [9]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 27.92
RMSE médio treino: 27.72
RMSE agregado teste: 43.71
RMSE médio teste: 35.06


In [10]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

LASSO.fit(X_train, y_train)
pred_train = LASSO.predict(X_train)
pred_test = LASSO.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 17.11
RMSE teste: 43.49
MAE treino: 2.98
MAE treino: 4.48
R2 treino: 0.75
R2 treino: 0.53


# Ridge

In [11]:
RIDGE = linear_model.Ridge(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        copy_X=True,
                        max_iter=None, 
                        tol=0.001, 
                        solver='auto', 
                        random_state=42)
RIDGE.fit(X,Y)
y_pred= RIDGE.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",RIDGE.coef_)
print("Intercepto: {:.2f}.".format(RIDGE.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [ -1.03236552e-01   4.75402667e-02  -1.18326117e-02   2.53281439e+00
  -1.00058825e+01   3.85182718e+00  -6.03083188e-03  -1.36097976e+00
   2.87962047e-01  -1.29773256e-02  -8.68178397e-01   9.79343988e-03
  -5.35230990e-01]
Intercepto: 31.09.
RMSE: 22.08.
MAE: 3.27.
R2: 0.74.


In [12]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = RIDGE.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,3.42,2.13,23.58,9.35,0.74,0.73
1,3.37,2.67,23.17,12.48,0.74,0.54
2,3.43,2.33,23.55,10.63,0.74,-0.52
3,3.11,4.65,20.94,36.78,0.71,0.63
4,3.25,3.9,21.62,29.44,0.74,0.58
5,3.26,3.46,22.63,18.54,0.7,0.75
6,3.39,2.71,23.57,9.46,0.74,0.41
7,2.6,9.62,12.06,168.56,0.84,-0.13
8,3.18,5.1,21.72,34.14,0.73,-0.81
9,3.37,2.61,23.4,11.53,0.74,0.39


In [13]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 21.88
RMSE médio treino: 21.62
RMSE agregado teste: 57.20
RMSE médio teste: 34.09


In [14]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

RIDGE.fit(X_train, y_train)
pred_train = RIDGE.predict(X_train)
pred_test = RIDGE.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 10.07
RMSE teste: 179.58
MAE treino: 2.42
MAE treino: 7.95
R2 treino: 0.85
R2 treino: -0.93


# Elastic Net

In [15]:
EN = ElasticNet(alpha=1.0, 
                l1_ratio=0.5, 
                fit_intercept=True, 
                normalize=False, 
                precompute=False, 
                max_iter=1000, 
                copy_X=True, 
                tol=0.0001, 
                warm_start=False, 
                positive=False, 
                random_state=42, 
                selection='cyclic')

EN.fit(X,Y)
y_pred= EN.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",EN.coef_)
print("Intercepto: {:.2f}.".format(EN.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-0.08019191  0.05322902 -0.01252934  0.         -0.          0.93268608
  0.02058656 -0.7620405   0.30141294 -0.01643638 -0.74821876  0.00839777
 -0.758702  ]
Intercepto: 42.22.
RMSE: 26.50.
MAE: 3.59.
R2: 0.69.


In [16]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = EN.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,3.72,2.65,28.04,11.68,0.69,0.67
1,3.67,3.23,27.61,17.64,0.69,0.34
2,3.75,2.28,28.04,10.73,0.69,-0.53
3,3.36,5.67,24.43,56.38,0.67,0.43
4,3.52,4.37,25.69,41.98,0.69,0.4
5,3.54,4.47,26.82,35.33,0.65,0.53
6,3.72,3.53,27.97,16.45,0.7,-0.03
7,3.2,6.4,19.94,84.34,0.73,0.43
8,3.61,3.62,26.5,23.18,0.68,-0.23
9,3.7,2.79,27.71,13.92,0.69,0.26


In [17]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 26.38
RMSE médio treino: 26.27
RMSE agregado teste: 38.56
RMSE médio teste: 31.16


In [18]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

EN.fit(X_train, y_train)
pred_train = EN.predict(X_train)
pred_test = EN.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 20.38
RMSE teste: 38.56
MAE treino: 3.25
MAE treino: 4.33
R2 treino: 0.71
R2 treino: 0.59


# Árvore de Decisão

In [19]:
DT = DecisionTreeRegressor(criterion='mse', 
                           splitter='best', 
                           max_depth=10, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features=None, 
                           random_state=42, 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           presort=False)
DT.fit(X,Y)

y_pred= DT.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 0.87.
MAE: 0.55.
R2: 0.99.


In [20]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = DT.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,0.53,3.24,0.8,15.78,0.99,0.55
1,0.55,2.33,0.85,9.98,0.99,0.63
2,0.47,3.23,0.72,17.8,0.99,-1.54
3,0.48,4.95,0.69,45.19,0.99,0.54
4,0.52,3.23,0.96,17.0,0.99,0.76
5,0.66,4.28,1.27,34.29,0.98,0.54
6,0.56,3.2,0.94,18.3,0.99,-0.14
7,0.39,6.36,0.49,93.61,0.99,0.37
8,0.5,4.52,0.81,61.99,0.99,-2.29
9,0.45,2.91,0.65,16.12,0.99,0.15


In [21]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 0.84
RMSE médio treino: 0.82
RMSE agregado teste: 41.68
RMSE médio teste: 33.01


In [22]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

DT.fit(X_train, y_train)
pred_train = DT.predict(X_train)
pred_test = DT.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 0.14
RMSE teste: 38.92
MAE treino: 0.16
MAE treino: 4.24
R2 treino: 1.0
R2 treino: 0.58


# Random Forest

In [23]:
RF = RandomForestRegressor(n_estimators=10, 
                           criterion='mse', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features='auto', 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           bootstrap=True, 
                           oob_score=False, 
                           n_jobs=1, 
                           random_state=42, 
                           verbose=0, 
                           warm_start=False)
RF.fit(X,Y)

y_pred= RF.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 1.97.
MAE: 0.93.
R2: 0.98.


In [24]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = RF.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,0.9,2.68,1.64,10.26,0.98,0.71
1,0.93,1.82,1.72,6.17,0.98,0.77
2,0.9,1.72,1.77,5.36,0.98,0.24
3,0.88,4.41,1.64,33.16,0.98,0.66
4,0.85,2.66,1.53,12.52,0.98,0.82
5,0.91,4.13,1.94,38.13,0.97,0.49
6,0.98,2.21,2.3,8.24,0.98,0.48
7,0.93,5.76,1.91,84.4,0.97,0.43
8,0.94,3.64,2.13,27.78,0.97,-0.47
9,0.9,2.92,2.02,16.27,0.98,0.14


In [25]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 1.87
RMSE médio treino: 1.86
RMSE agregado teste: 33.33
RMSE médio teste: 24.23


In [26]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

RF.fit(X_train, y_train)
pred_train = RF.predict(X_train)
pred_test = RF.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 1.66
RMSE teste: 35.78
MAE treino: 0.84
MAE treino: 3.95
R2 treino: 0.98
R2 treino: 0.62


# MLP Regressor

In [27]:
MLP = MLPRegressor(hidden_layer_sizes=(100, ),
                   activation='relu', 
                   solver='adam', 
                   alpha=0.01, #default = 0.0001
                   batch_size='auto', 
                   learning_rate='constant', 
                   learning_rate_init=0.1, #default = 0.0001
                   power_t=0.5, 
                   max_iter=400, #default = 100
                   shuffle=True, 
                   random_state=42, 
                   tol=0.0001, 
                   verbose=False, 
                   warm_start=False, 
                   momentum=0.9, 
                   nesterovs_momentum=True, 
                   early_stopping=False, 
                   validation_fraction=0.1, 
                   beta_1=0.9, 
                   beta_2=0.999, 
                   epsilon=1e-08)

MLP.fit(X,Y)

y_pred= MLP.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 163.18.
MAE: 9.78.
R2: -0.93.


In [28]:
# K-folds / Leave one out
# definir k para kfolds
# se k = n, leave one out

k = 10
#k = len(dados)
cv = KFold(k)


metrics = [mean_squared_error, r2_score, mean_absolute_error]

first = True

for train_index, test_index in cv.split(X):
    model = MLP.fit(X.iloc[train_index],Y[train_index])
       
    pred_train = model.predict(X.iloc[train_index])
    pred_test = model.predict(X.iloc[test_index])

    y_train = Y[train_index]
    y_test = Y[test_index]
        
    mse_treino = mean_squared_error(y_train, pred_train)
    mse_teste = mean_squared_error(y_test, pred_test)
        
    mae_treino = mean_absolute_error(y_train, pred_train)
    mae_teste = mean_absolute_error(y_test, pred_test)
        
    r2_treino = r2_score (y_train, pred_train)
    r2_teste = r2_score (y_test, pred_test)
    
    resultados_aux = {
    "MSE: 1. Treino" : [mse_treino] ,
    "MSE: 2. Teste" : [mse_teste],
    "MAE: 1. Treino" : [mae_treino],
    "MAE: 2. Teste" : [mae_teste],
    "R2: 1. Treino" : [r2_treino],
    "R2: 2. Teste" : [r2_teste]
    }

    resultados_aux = pd.DataFrame(resultados_aux)

    if first == True:
        results_folds = resultados_aux
        first = False
    else:
        results_folds = pd.concat([results_folds, resultados_aux], axis = 0)

results_folds.index = range(0,k)
results_mean = np.transpose(pd.DataFrame(results_folds.mean(), columns=['Média']))
    
results_folds = pd.concat([results_folds, results_mean], axis = 0)

results_folds.round(2)

Unnamed: 0,MAE: 1. Treino,MAE: 2. Teste,MSE: 1. Treino,MSE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,11.55,9.29,193.38,106.04,-1.16,-2.04
1,11.68,6.86,194.9,61.85,-1.15,-1.3
2,10.16,4.34,172.86,24.7,-0.91,-2.52
3,10.41,19.97,174.02,489.83,-1.37,-3.99
4,11.78,6.9,198.76,76.93,-1.41,-0.09
5,10.02,8.69,157.22,132.21,-1.08,-0.76
6,10.61,9.27,190.56,102.07,-1.07,-5.38
7,9.78,8.83,162.04,155.14,-1.18,-0.04
8,23.76,14.39,653.28,223.13,-6.97,-10.84
9,10.95,5.36,194.67,38.2,-1.16,-1.03


In [29]:
# Calculando RMSE agregado para k-fold

import math

R1 = [0 for x in range(k)]
R1_2 = [0 for x in range(k)]
R2 = [0 for x in range(k)]
R2_2 = [0 for x in range(k)]

for i in range(0,k):
    R1[i] = results_folds.iloc[i,2]**2
    R1_2[i] = results_folds.iloc[i,2]
    R2[i] = results_folds.iloc[i,3]**2
    R2_2[i] = results_folds.iloc[i,3]
        
a = math.sqrt(sum(R1)/len(R1))
a2 = sum(R1_2)/len(R1_2)
b = math.sqrt(sum(R2)/len(R2))
b2 = sum(R2_2)/len(R2_2) 

print("RMSE agregado treino:","%.2f" % a)
print("RMSE médio treino:","%.2f" % a2)
print("RMSE agregado teste:","%.2f" % b)
print("RMSE médio teste:","%.2f" % b2)

RMSE agregado treino: 269.63
RMSE médio treino: 229.17
RMSE agregado teste: 190.98
RMSE médio teste: 141.01


In [30]:
# Holdout
# selecionar ponto de corte na base

corte = 253
c = corte-1
n = len(X)

# criando treino e teste 

X_train = X.iloc[0:corte,:]
y_train = Y.loc[0:c]
X_test = X.loc[corte:n,:]
y_test = Y.iloc[corte:n]

MLP.fit(X_train, y_train)
pred_train = MLP.predict(X_train)
pred_test = MLP.predict(X_test)    

mse_treino = mean_squared_error(y_train, pred_train)
mse_teste = mean_squared_error(y_test, pred_test)
        
mae_treino = mean_absolute_error(y_train, pred_train)
mae_teste = mean_absolute_error(y_test, pred_test)
        
R2_treino = r2_score (y_train, pred_train)
R2_teste = r2_score (y_test, pred_test)

print("RMSE treino:",mse_treino.round(2))
print("RMSE teste:",mse_teste.round(2))
print("MAE treino:",mae_treino.round(2))
print("MAE treino:",mae_teste.round(2))
print("R2 treino:",R2_treino.round(2))
print("R2 treino:",R2_teste.round(2))

RMSE treino: 222.05
RMSE teste: 567.61
MAE treino: 13.21
MAE treino: 20.66
R2 treino: -2.21
R2 treino: -5.08
