In [2]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error

df = pd.read_csv('data/kc_house_data.csv')
df = df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront']]

x = df.drop('price', axis=1)
y = df['price']

# normalização dos dados
min_max_scaler = StandardScaler()
x = min_max_scaler.fit_transform(x)

In [6]:
# Criando o modelo de regressão linear
linear_regressor = LinearRegression()

# Definindo o número de folds
k = 5

# Criando o objeto KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Armazenará os scores de cada fold
mape_scores = []

# Realizando o K-Fold Cross-Validation
for train_index, val_index in kf.split(x):
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Treinando o modelo no conjunto de treino
    linear_regressor.fit(x_train, y_train)

    # Fazendo previsões no conjunto de validação
    y_pred = linear_regressor.predict(x_val)

    # Calculando o erro percentual absoluto médio (MAPE)
    mape = mean_absolute_percentage_error(y_val, y_pred)

    # Armazenando o MAPE para cada fold
    mape_scores.append(mape)

# Calculando o MAPE médio
mape_mean = np.mean(mape_scores)

print(f"MAPE médio: {mape_mean}")

MAPE médio: 0.34622668025790687


## Separando 10% de dados para teste final

In [7]:
# Separando os dados de treino (para a validação cruzada) e de teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# Criando o modelo de regressão linear
linear_regressor = LinearRegression()

# Definindo o número de folds
k = 5

# Criando o objeto KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Armazenará os scores de cada fold
mape_scores = []

# Realizando o K-Fold Cross-Validation
for train_index, val_index in kf.split(x_train, y_train):
    x_train_cv, x_val = x[train_index], x[val_index]
    y_train_cv, y_val = y[train_index], y[val_index]

    # Treinando o modelo no conjunto de treino
    linear_regressor.fit(x_train_cv, y_train_cv)

    # Fazendo previsões no conjunto de validação
    y_pred = linear_regressor.predict(x_val)

    # Calculando o erro percentual absoluto médio (MAPE)
    mape = mean_absolute_percentage_error(y_val, y_pred)

    # Armazenando o MAPE para cada fold
    mape_scores.append(mape)

# Calculando o MAPE médio
mape_mean = np.mean(mape_scores)

print(f"MAPE médio: {mape_mean}")

MAPE médio: 0.3511047791823444


In [8]:
# Vendo a performance agora do modelo para o conjunto de teste

# Fazendo previsões no conjunto de teste
y_pred_2 = linear_regressor.predict(x_test)

# Calculando o erro percentual absoluto médio (MAPE)
mape_test = mean_absolute_percentage_error(y_test, y_pred_2)

print(f"MAPE: {mape_test}")
MAPE: 0.3513107105323694

MAPE: 0.35131071053236945


## Tuning de Hiperparâmetros

In [9]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# Definindo os parâmetros a serem ajustados
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

# Criando o modelo
svr = SVR()

# Ajuste fino
clf = GridSearchCV(svr, parameters)

# Treinando o modelo com otimização
clf.fit(x_train, y_train)

In [10]:
df_results = pd.DataFrame(clf.cv_results_)
df_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.919422,0.0271,0.981137,0.007024,1,linear,"{'C': 1, 'kernel': 'linear'}",-0.001193,0.002392,0.00169,-0.00049,0.0091,0.0023,0.00365,2
1,8.505153,0.079098,3.817577,0.084502,1,rbf,"{'C': 1, 'kernel': 'rbf'}",-0.058082,-0.053739,-0.059771,-0.057534,-0.052944,-0.056414,0.002627,4
2,6.505857,0.024156,0.962438,0.002693,10,linear,"{'C': 10, 'kernel': 'linear'}",0.229087,0.235911,0.242881,0.234096,0.252912,0.238978,0.008252,1
3,8.348203,0.048859,3.848216,0.01665,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-0.031867,-0.030399,-0.031064,-0.033309,-0.024302,-0.030188,0.003099,3


In [11]:
df_results = pd.DataFrame(clf.cv_results_)
df_results.query("rank_test_score == 1")['params']

{'C': 10, 'kernel': 'linear'}

{'C': 10, 'kernel': 'linear'}

In [12]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# Definindo os parâmetros a serem ajustados
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

# Criando o modelo
svr = SVR()

# Ajuste fino com validação cruzada
clf = GridSearchCV(svr, parameters, cv=10)

# Treinando o modelo com otimização
clf.fit(x_train, y_train)