# Análise Comparativa

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

In [2]:
# Locations of the raw dataset and of the data dictionary that describes its columns
raw_data_path = '../data/raw/data.csv'
dictionary_path = '../data/external/dictionary.csv'

# Read both CSV files into DataFrames
df = pd.read_csv(raw_data_path)
dictionary_df = pd.read_csv(dictionary_path)

In [3]:

# Group the variable names listed in the data dictionary by their declared type.

# Name of the column the models are trained to predict
target_variable = 'price'


def _variables_where(column, value, exclude=None):
    """Return the `variavel` entries of `dictionary_df` whose `column` equals `value`.

    Parameters
    ----------
    column : str
        Column of the data dictionary to filter on (`tipo` or `subtipo`).
    value : str
        Label a row must carry in `column` to be selected.
    exclude : str, optional
        Variable name to leave out of the result; nothing is left out when None.

    Returns
    -------
    list
        Matching variable names, in data-dictionary order.
    """
    selected = dictionary_df.loc[dictionary_df[column] == value, 'variavel']
    if exclude is not None:
        selected = selected[selected != exclude]
    return selected.to_list()


# Variables flagged as 'inútil' in the dictionary; they are removed from the features
useless_variables = _variables_where('tipo', 'inútil')

# One list per subtype; the target is never allowed into a feature list
nominal_variables = _variables_where('subtipo', 'nominal', exclude=target_variable)
ordinal_variables = _variables_where('subtipo', 'ordinal', exclude=target_variable)
continuous_variables = _variables_where('subtipo', 'contínua', exclude=target_variable)
discrete_variables = _variables_where('subtipo', 'discreta', exclude=target_variable)

# Feature matrix: every column except the target and the variables flagged as useless
columns_to_drop = [target_variable, *useless_variables]
X = df.drop(columns=columns_to_drop)

# Target vector
y = df[target_variable]

# Quick overview of the shapes and of the variable groups, shown as the cell output
overview = {
    "X_shape": X.shape,
    "y_shape": y.shape,
    "Nominal Variables": nominal_variables,
    "Ordinal Variables": ordinal_variables,
    "Continuous Variables": continuous_variables,
    "Discrete Variables": discrete_variables,
}
overview

{'X_shape': (53940, 10),
 'y_shape': (53940,),
 'Nominal Variables': [],
 'Ordinal Variables': ['cut', 'color', 'clarity'],
 'Continuous Variables': ['carat', 'depth', 'table', 'x', 'y', 'z'],
 'Discrete Variables': []}

In [4]:

# One preprocessing pipeline per variable type. Each pipeline first fills in
# missing values and then encodes or scales the columns it receives.

# Nominal: impute with the most frequent value, then one-hot encode to a dense array
nominal_steps = [
    ("missing", SimpleImputer(strategy='most_frequent')),
    ("encoding", OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)),
]
nominal_preprocessor = Pipeline(nominal_steps)

# Ordinal: impute with the most frequent value, then map categories to integer codes.
# NOTE(review): no `categories` argument is given, so the integer codes follow the
# order the encoder determines on its own rather than an explicitly declared
# ranking — confirm this matches the intended order of the ordinal variables.
ordinal_steps = [
    ("missing", SimpleImputer(strategy='most_frequent')),
    ("encoding", OrdinalEncoder()),
]
ordinal_preprocessor = Pipeline(ordinal_steps)

# Continuous: impute with the column mean, then standardize
continuous_steps = [
    ("missing", SimpleImputer(strategy='mean')),
    ("scaling", StandardScaler()),
]
continuous_preprocessor = Pipeline(continuous_steps)

# Discrete: impute from nearest neighbours, then standardize
discrete_steps = [
    ("missing", KNNImputer()),
    ("scaling", StandardScaler()),
]
discrete_preprocessor = Pipeline(discrete_steps)



In [5]:
# Combine preprocessors in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_preprocessor, nominal_variables),
        ("ordinal", ordinal_preprocessor, ordinal_variables),
        ("continuous", continuous_preprocessor, continuous_variables),
        ("discrete", discrete_preprocessor, discrete_variables),
    ]
)

In [6]:
# Regressors to compare; DummyRegressor always predicts the training mean and
# presumably serves as the baseline.
models = [
    DummyRegressor(strategy='mean'),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=5),
]

# Scorer names handed to cross_validate. The `neg_*` scorers report errors as
# negative numbers, as the recorded results show.
metrics = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_mean_absolute_percentage_error',
    'r2',
]

# Validation scheme: 10 random splits, each holding out 20% of the rows, seeded
# so the splits are the same on every run.
monte_carlo = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Alternative splitters, kept for reference:
# hold_out = ShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
# Cross-validate every candidate model behind the same preprocessing step and
# gather the per-split scores into one long table (`results_total`).
results_per_model = []
for model in models:
    model_name = type(model).__name__
    print(f"rodando para o modelo: {model_name}")

    # Preprocessing is part of the estimator handed to cross_validate, so it is
    # fitted together with the model rather than once on the full dataset.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    scores = cross_validate(
        approach, X, y,
        scoring=metrics,
        cv=monte_carlo,
    )

    # One row per split; tag the rows with the model that produced them
    results_per_model.append(pd.DataFrame(scores).assign(model=model_name))

# A single concat after the loop avoids re-copying the accumulated frame on
# every iteration. Each model's rows keep their own 0-based split index, so the
# index repeats across models. Raises ValueError if `models` is empty.
results_total = pd.concat(results_per_model)

rodando para o modelo: DummyRegressor
rodando para o modelo: LinearRegression
rodando para o modelo: KNeighborsRegressor


In [8]:
# Mean and standard deviation of every timing/score column for each model,
# transposed so that the models become the columns of the displayed table.
summary_by_model = results_total.groupby('model').agg(['mean', 'std'])
summary_by_model.T

Unnamed: 0,model,DummyRegressor,KNeighborsRegressor,LinearRegression
fit_time,mean,0.04266479,0.087072,0.07966759
fit_time,std,0.01094923,0.013518,0.01710426
score_time,mean,0.009534454,0.275957,0.0151984
score_time,std,0.001441761,0.01787,0.003669444
test_neg_mean_absolute_error,mean,-3022.423,-389.101696,-860.9442
test_neg_mean_absolute_error,std,17.02945,4.203031,7.631357
test_neg_mean_squared_error,mean,-15827500.0,-557133.098198,-1814252.0
test_neg_mean_squared_error,std,263195.8,24072.413778,59112.09
test_neg_mean_absolute_percentage_error,mean,-1.877466,-0.109717,-0.3839988
test_neg_mean_absolute_percentage_error,std,0.01333744,0.001017,0.003842612
