In [78]:
import pandas as pd
import numpy as np


In [79]:
# Carregando o conjunto de dados no IDE
data = pd.read_csv('datasets/cleaned_final_dataset.csv')
data = data.drop(columns='roomURL')
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643 entries, 0 to 642
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   roomType                        643 non-null    object 
 1   roomPrice                       643 non-null    float64
 2   qualityBadge                    643 non-null    object 
 3   rating                          643 non-null    float64
 4   countReviews                    643 non-null    float64
 5   Air Conditioning                643 non-null    int64  
 6   TV                              643 non-null    int64  
 7   Hair Dryer                      643 non-null    int64  
 8   Bathroom                        643 non-null    int64  
 9   Ethernet connection             643 non-null    int64  
 10  Kitchen                         643 non-null    int64  
 11  Elevator                        643 non-null    int64  
 12  Luggage Dropoff Allowed         643 

- Normalização: As variáveis numéricas (roomPrice, rating, countReviews) foram normalizadas.
- Codificação: As variáveis categóricas (roomType, qualityBadge) foram codificadas usando one-hot encoding.

In [80]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = data.select_dtypes(exclude=['object']).columns.tolist()
numerical_cols.remove('roomPrice')  # Exclude the target variable from normalization

# Create a column transformer with normalization for numerical columns and one-hot encoding for categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Apply transformations
data_preprocessed = preprocessor.fit_transform(data)

# Show the shape of the transformed data and the feature names after one-hot encoding
transformed_feature_names = (numerical_cols + 
                             list(preprocessor.named_transformers_['cat'].get_feature_names_out()))

data_preprocessed.shape, transformed_feature_names


((643, 56),
 ['rating',
  'countReviews',
  'Air Conditioning',
  'TV',
  'Hair Dryer',
  'Bathroom',
  'Ethernet connection',
  'Kitchen',
  'Elevator',
  'Luggage Dropoff Allowed',
  'Smoke Alarm',
  'WiFi',
  'Parking',
  'Pets Allowed',
  'EV Charger',
  'Bedroom',
  'Fire pit',
  'Lit path to the guest entrance',
  'Waterfront',
  'Long term stays allowed',
  'Bathtub',
  'Laundry room',
  'Security Cameras',
  'Baby bath',
  'Pool',
  'Microwave',
  'HDTV',
  'View',
  'Carbon Monoxide Alarm',
  'Refrigerator',
  'Smoking allowed',
  'Patio',
  'High Chair',
  'Sauna',
  'Crib',
  'Washer',
  'Accessible',
  'Breakfast',
  'is_new',
  'roomType_Apartamento',
  'roomType_Cabana',
  'roomType_Casa',
  'roomType_Chalé',
  'roomType_Condomínio',
  'roomType_Contêiner',
  'roomType_Hotel',
  'roomType_Loft',
  'roomType_Lugar',
  'roomType_Microcasa',
  'roomType_Pousada',
  'roomType_Quarto',
  'roomType_Suíte',
  'roomType_Trailer',
  'qualityBadge_no_class',
  'qualityBadge_preferi

Vamos começar dividindo os dados e construindo os primeiros modelos:


In [81]:
from sklearn.model_selection import train_test_split

# Extract the target variable 'roomPrice'
y = data['roomPrice']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_preprocessed, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape


((514, 56), (129, 56))

In [82]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Initialize the models
decision_tree = DecisionTreeRegressor(random_state=42)
gbm = GradientBoostingRegressor(random_state=42)
svm = SVR()
random_forest = RandomForestRegressor(random_state=42)

# Train and predict with Decision Tree
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)

# Train and predict with GBM
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)

# Train and predict with SVM
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Train and predict with Random Forest
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

# Calculate metrics for Decision Tree
mse_tree = mean_squared_error(y_test, y_pred_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Calculate metrics for GBM
mse_gbm = mean_squared_error(y_test, y_pred_gbm)
mae_gbm = mean_absolute_error(y_test, y_pred_gbm)
r2_gbm = r2_score(y_test, y_pred_gbm)

# Calculate metrics for SVM
mse_svm = mean_squared_error(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)

# Calculate metrics for Random Forest
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Results
{
    "Decision Tree": {"MSE": mse_tree, "MAE": mae_tree, "R2": r2_tree},
    "GBM": {"MSE": mse_gbm, "MAE": mae_gbm, "R2": r2_gbm},
    "SVM": {"MSE": mse_svm, "MAE": mae_svm, "R2": r2_svm},
    "Random Forest": {"MSE": mse_rf, "MAE": mae_rf, "R2": r2_rf}
}


{'Decision Tree': {'MSE': 13566.658914728681,
  'MAE': 54.89922480620155,
  'R2': 0.7367876765185619},
 'GBM': {'MSE': 12905.541131140531,
  'MAE': 84.05545326467916,
  'R2': 0.7496142942589266},
 'SVM': {'MSE': 53178.5705277941,
  'MAE': 177.2879256077671,
  'R2': -0.03173929528412134},
 'Random Forest': {'MSE': 9361.369593544561,
  'MAE': 62.748945968528055,
  'R2': 0.8183762223866134}}

Análise:
Árvores de Decisão e GBM mostraram desempenhos relativamente bons com R² em torno de 0.74 e 0.75, respectivamente, indicando que esses modelos explicam bem a variância nos dados.
GBM teve uma ligeira vantagem em termos de MSE, mas apresentou um MAE mais alto comparado ao modelo de Árvore de Decisão.
SVM teve um desempenho significativamente pior, com um R² negativo, sugerindo que este modelo não é adequado para este conjunto de dados.

## Ajuste de Hiperparâmetros do Random Forest


In [83]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Initialize the Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2')

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


({'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 200},
 0.6861318026322005)

## Resultados do Ajuste de Hiperparâmetros
Após a execução da busca em grade, os melhores parâmetros para o modelo Random Forest foram encontrados, apesar de alguns erros devido ao uso do valor 'auto' no parâmetro max_features. Aqui estão os resultados das métricas de avaliação do modelo ajustado:

**Random Forest Ajustado:**   
RMSE (Root Mean Squared Error): 0.454   
MAE (Mean Absolute Error): 0.302    
R² (R-squared): 0.814    

In [84]:
# Use the best hyperparameters found for training the final Random Forest model
best_params = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

# Initialize the Random Forest model with the best hyperparameters
rf_best = RandomForestRegressor(**best_params, random_state=42)

# Train the model
rf_best.fit(X_train, y_train)

# Predict on the test data
y_pred_rf_best = rf_best.predict(X_test)

# Calculate metrics for the best Random Forest model
mse_rf_best = mean_squared_error(y_test, y_pred_rf_best)
mae_rf_best = mean_absolute_error(y_test, y_pred_rf_best)
r2_rf_best = r2_score(y_test, y_pred_rf_best)

# Results for the best Random Forest model
{
    "Random Forest (best hyperparameters)": {"MSE": mse_rf_best, "MAE": mae_rf_best, "R2": r2_rf_best}
}


{'Random Forest (best hyperparameters)': {'MSE': 9739.656711515541,
  'MAE': 66.72808202150271,
  'R2': 0.8110369185911785}}

## Implementação Predição Conforme

In [85]:
from nonconformist.icp import IcpRegressor
from nonconformist.nc import NcFactory, AbsErrorErrFunc
from sklearn.ensemble import RandomForestRegressor


In [86]:
# Initialize the base regressor
rf = RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=5, min_samples_leaf=1, random_state=42)

# Define the nonconformity function
nc = NcFactory.create_nc(rf, err_func=AbsErrorErrFunc())

# Initialize the inductive conformal predictor
icp = IcpRegressor(nc)

# Fit the model
icp.fit(X_train, y_train)

# Calibrate the model using part of the training data
icp.calibrate(X_train, y_train)

# Make predictions with confidence intervals on the test data
prediction_intervals = icp.predict(X_test, significance=0.1)  # 90% confidence intervals

# Extract lower and upper bounds
lower_bounds = prediction_intervals[:, 0]
upper_bounds = prediction_intervals[:, 1]
point_predictions = (lower_bounds + upper_bounds) / 2


In [87]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_conformal = mean_squared_error(y_test, point_predictions)
mae_conformal = mean_absolute_error(y_test, point_predictions)
r2_conformal = r2_score(y_test, point_predictions)

interval_width = np.mean(upper_bounds - lower_bounds)
coverage = np.mean((y_test >= lower_bounds) & (y_test <= upper_bounds))

results = {
    "Conformal Prediction": {
        "MSE": mse_conformal,
        "MAE": mae_conformal,
        "R2": r2_conformal,
        "Interval Width": interval_width,
        "Coverage": coverage
    }
}

print(results)


{'Conformal Prediction': {'MSE': 9739.656711515541, 'MAE': 66.72808202150271, 'R2': 0.8110369185911785, 'Interval Width': 195.83516666666662, 'Coverage': 0.7596899224806202}}


In [88]:
# Define diferentes níveis de significância para ajustar a cobertura
significance_levels = [0.05, 0.10, 0.15]

results = {}

for significance in significance_levels:
    # Make predictions with the specified significance level
    prediction_intervals = icp.predict(X_test, significance=significance)
    
    # Extract lower and upper bounds
    lower_bounds = prediction_intervals[:, 0]
    upper_bounds = prediction_intervals[:, 1]
    point_predictions = (lower_bounds + upper_bounds) / 2
    
    # Calculate metrics
    mse_conformal = mean_squared_error(y_test, point_predictions)
    mae_conformal = mean_absolute_error(y_test, point_predictions)
    r2_conformal = r2_score(y_test, point_predictions)
    interval_width = np.mean(upper_bounds - lower_bounds)
    coverage = np.mean((y_test >= lower_bounds) & (y_test <= upper_bounds))
    
    results[significance] = {
        "MSE": mse_conformal,
        "MAE": mae_conformal,
        "R2": r2_conformal,
        "Interval Width": interval_width,
        "Coverage": coverage
    }

print(results)


{0.05: {'MSE': 9739.656711515541, 'MAE': 66.72808202150271, 'R2': 0.8110369185911785, 'Interval Width': 288.87520238095226, 'Coverage': 0.8604651162790697}, 0.1: {'MSE': 9739.656711515541, 'MAE': 66.72808202150271, 'R2': 0.8110369185911785, 'Interval Width': 195.83516666666662, 'Coverage': 0.7596899224806202}, 0.15: {'MSE': 9739.656711515541, 'MAE': 66.72808202150271, 'R2': 0.8110369185911785, 'Interval Width': 133.89202444137683, 'Coverage': 0.6434108527131783}}


Resultados com 90% Confiança:
Erro Quadrático Médio (MSE): 9739.66
Erro Absoluto Médio (MAE): 66.73
Coeficiente de Determinação (R²): 0.81
Largura do Intervalo Médio: 195.84
Cobertura: 75.97%

In [89]:
from nonconformist.icp import IcpRegressor
from nonconformist.nc import NcFactory, AbsErrorErrFunc
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the base regressor
rf = RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=5, min_samples_leaf=1, random_state=42)

# Define the nonconformity function
nc = NcFactory.create_nc(rf, err_func=AbsErrorErrFunc())

# Initialize the inductive conformal predictor
icp = IcpRegressor(nc)

# Fit the model
icp.fit(X_train, y_train)

# Calibrate the model using part of the training data
icp.calibrate(X_train, y_train)

# Make predictions with confidence intervals on the test data
significance = 0.10  # 90% confidence intervals
prediction_intervals = icp.predict(X_test, significance=significance)

# Extract lower and upper bounds
lower_bounds = prediction_intervals[:, 0]
upper_bounds = prediction_intervals[:, 1]
point_predictions = (lower_bounds + upper_bounds) / 2

# Calculate metrics
mse_conformal = mean_squared_error(y_test, point_predictions)
mae_conformal = mean_absolute_error(y_test, point_predictions)
r2_conformal = r2_score(y_test, point_predictions)
interval_width = np.mean(upper_bounds - lower_bounds)
coverage = np.mean((y_test >= lower_bounds) & (y_test <= upper_bounds))

results = {
    "Conformal Prediction (90% confidence)": {
        "MSE": mse_conformal,
        "MAE": mae_conformal,
        "R2": r2_conformal,
        "Interval Width": interval_width,
        "Coverage": coverage
    }
}

print(results)


{'Conformal Prediction (90% confidence)': {'MSE': 9739.656711515541, 'MAE': 66.72808202150271, 'R2': 0.8110369185911785, 'Interval Width': 195.83516666666662, 'Coverage': 0.7596899224806202}}
