## Modelo preditivo para o problema House Prices
### Letícia Saraiva Chaves, 402120

### Import

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics

### House Prices dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                 'python-machine-learning-book-2nd-edition'
                 '/master/code/ch10/housing.data.txt',
                 header=None,
                 sep='\s+')

df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 
              'NOX', 'RM', 'AGE', 'DIS', 'RAD', 
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
X = df.iloc[:,:-1].values
y = df['MEDV'].values

y = y.reshape(y.shape[0], 1)

print(X.shape)
print(y.shape)


(506, 13)
(506, 1)


In [4]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

### Métrica MAE - Mean Absolute Error

In [5]:
def mae(y_test, y_pred):
    soma_y = 0
    for i, j in enumerate(y_test):
        soma_y += abs(j - y_pred[i])

    mae = soma_y / (i+1)
    
    return mae

### Métrica MSE - Mean Squared Error

In [6]:
def mse(y_test, y_pred):
    soma_y = 0
    for i, j in enumerate(y_test):
        soma_y += (j - y_pred[i])**2

    mse = soma_y / (i+1)
    
    return mse

### Métrica RMSE - Root Mean Squared Error

In [7]:
def rmse(y_test, y_pred):
    var_mse = mse(y_test, y_pred)
    
    rmse = np.sqrt(var_mse)
    return rmse

### Métrica MSLE - Mean squared logarithmic error

In [8]:
def msle(y_test, y_pred):
    soma_y = 0
    for i, j in enumerate(y_test):
        if y_pred[i] > 0:
            soma_y += (np.log(j+1) - np.log(y_pred[i] + 1))**2

    msle_ = soma_y / (i+1)
    
    return msle_

### Métrica RMSLE - Root Mean squared logarithmic error

In [9]:
def rmsle(y_test, y_pred):
    var_msle = msle(y_test, y_pred)
    
    rmsle_ = np.sqrt(var_msle)
    return rmsle_

### Métrica  $R^2$  - Coeficiente de Determinação

In [10]:
def ss_res(y_test, y_pred):
    soma = 0
    for i, j in enumerate(y_test):
        soma += (j - y_pred[i])**2
    
    return soma

In [11]:
def ss_tot(y_test, y_mean):
    soma = 0
    for i in y_test:
        soma += (i - y_mean)**2
    
    return soma

In [12]:
def r2(y_test, y_pred):
    y_mean = np.mean(y_test)
    
    var_ss_res = ss_res(y_test, y_pred)
    var_ss_tot = ss_tot(y_test, y_mean)
    
    r2_ = 1 - (var_ss_res/var_ss_tot)
    return r2_

### Usando o K-fold. Treinando o modelo usando o conjunto de dados de treino e o algoritmo de regressão linear e Calculando as métricas

In [13]:
kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=42)
mae_ = 0
mse_ = 0
rmse_ = 0
msle_ = 0
rmsle_ = 0
r2_ = 0
i = 0

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Treinando o modelo com todos os grupos
    model_lr = linear_model.LinearRegression()
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    
    i += 1
    mae_ += mae(y_test, y_pred)
    mse_ += mse(y_test, y_pred)
    rmse_ += rmse(y_test, y_pred)
    msle_ += msle(y_test, y_pred)
    rmsle_ += rmsle(y_test, y_pred)
    r2_ +=  r2(y_test, y_pred)
    
    
    print("Fold = ", i , "Mean Absolute Error: ", mae(y_test, y_pred))
    print("Fold = ", i , "Mean Squared Error: ", mse(y_test, y_pred))
    print("Fold = ", i , "Root Mean Squared Error: ", rmse(y_test, y_pred))
    print("Fold = ", i , "Mean squared logarithmic error: ", msle(y_test, y_pred))
    print("Fold = ", i , "Root Mean squared logarithmic error: ", rmsle(y_test, y_pred))
    print("Fold = ", i , "𝑅² - Coeficiente de Determinação: ", r2(y_test, y_pred))

Fold =  1 Mean Absolute Error:  [3.14927615]
Fold =  1 Mean Squared Error:  [20.59500679]
Fold =  1 Root Mean Squared Error:  [4.53817219]
Fold =  1 Mean squared logarithmic error:  [0.03619943]
Fold =  1 Root Mean squared logarithmic error:  [0.19026149]
Fold =  1 𝑅² - Coeficiente de Determinação:  [0.72620992]
Fold =  2 Mean Absolute Error:  [3.62320579]
Fold =  2 Mean Squared Error:  [27.31734136]
Fold =  2 Root Mean Squared Error:  [5.22659941]
Fold =  2 Mean squared logarithmic error:  [0.09372715]
Fold =  2 Root Mean squared logarithmic error:  [0.3061489]
Fold =  2 𝑅² - Coeficiente de Determinação:  [0.71324978]
Fold =  3 Mean Absolute Error:  [3.37028054]
Fold =  3 Mean Squared Error:  [21.66891755]
Fold =  3 Root Mean Squared Error:  [4.65498846]
Fold =  3 Mean squared logarithmic error:  [0.04283337]
Fold =  3 Root Mean squared logarithmic error:  [0.20696224]
Fold =  3 𝑅² - Coeficiente de Determinação:  [0.73342693]


### Calculando as médias das métricas

In [14]:
print("Média MAE: ", mae_/3)
print("Média MSE: ", mse_/3)
print("Média RMSE: ", rmse_/3)
print("Média MSLE: ", msle_/3)
print("Média RMSLE: ", rmsle_/3)
print("Média R²: ", r2_/3)


Média MAE:  [3.38092083]
Média MSE:  [23.19375523]
Média RMSE:  [4.80658668]
Média MSLE:  [0.05758665]
Média RMSLE:  [0.23445754]
Média R²:  [0.72429554]
