## Carregar Dados

In [None]:
import os
from IPython.display import clear_output

# Move up one level when the kernel was started inside a "notebooks" folder.
# Only the last path component is compared: a substring test on the whole path
# would also fire when some parent directory merely contains "notebooks".
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
    print(f"Changed working directory to {os.getcwd()}")

In [None]:
import re
import pandas as pd

from beer_color_prediction import config


def slugify(s):
    """Turn a label into a lowercase, underscore-separated identifier.

    Every run of non-word characters becomes a single underscore, the result
    is lowercased, and leading/trailing underscores are stripped.
    """
    pieces = re.split(r"\W+", s)
    return "_".join(pieces).lower().strip("_")


# Read the raw CSV, drop the "Unnamed: 0" and "Date/Time" columns, index the
# rows by "Job ID" and normalise every column label with slugify.
df = (
    pd.read_csv(config.RAW_DATA_DIR / "dataset.csv")
    .drop(columns=["Unnamed: 0", "Date/Time"])
    .set_index("Job ID")
    .rename(columns=slugify)
)

O objetivo do problema é entender quais variáveis influenciam na cor da cerveja Amstel, para isso, separamos de antemão o subconjunto em dados da Heineken e da Amstel. Como a coloração não pode ser negativa e há presença de valores faltantes, removemos essas amostras.

In [None]:
# Keep only rows with a known, non-negative color, then split by product code.
df = df.dropna(subset=["color"]).query("color >= 0")
df_amstel = df[df["product"] == "AMST"]
df_heineken = df[df["product"] == "HNK"]

In [None]:
# Display the rows whose product is 'AMST'.
df_amstel

In [None]:
# Display the rows whose product is 'HNK'.
df_heineken

Separamos de antemão um teste (gold) contendo apenas amostras da Amstel e um treino (train) contendo amostras de ambas marcas. Esse subconjunto será utilizado nas análises da predição do modelo.

In [None]:
# Hold out 20% of the Amstel rows (fixed seed) as the final test set, then
# remove those rows from both the Amstel subset and the full frame.
# NOTE: df_amstel and df are reassigned from themselves, so re-running this
# cell samples again from the already-reduced Amstel subset.
df_test = df_amstel.sample(frac=0.2, random_state=42)
df_amstel = df_amstel.drop(df_test.index)
df = df.drop(df_test.index)
df_test.shape, df_amstel.shape

## Treinamento de Modelos

### Funções

In [None]:
def preprocess_data(
    data: pd.DataFrame,
    handle_negative_values: str = "keep",
    handle_outliers: str = "keep",
    outlier_threshold: float = 1.5,
    lower_percentile: float = 0.05,
    upper_percentile: float = 0.95,
) -> pd.DataFrame:
    """Drop non-feature columns and optionally treat negatives and outliers.

    The "product" and "roast_color" columns are removed when present. Every
    remaining column is then treated, so a target column left in ``data`` is
    processed exactly like a feature. Outlier bounds are computed from
    ``data`` itself.

    Args:
        data (pd.DataFrame): Input DataFrame.
        handle_negative_values (str, optional): How to handle negative values.
            Defaults to "keep".
            Options: "keep", "replace_with_zero", "replace_with_nan", "drop".
        handle_outliers (str, optional): How to handle outliers.
            Defaults to "keep".
            Options: "keep", "clip", "replace_with_nan", "drop".
        outlier_threshold (float, optional): Multiplier applied to the
            inter-percentile range when building the bounds. Defaults to 1.5.
        lower_percentile (float, optional): Lower quantile (0-1) used for
            outlier detection. Defaults to 0.05.
        upper_percentile (float, optional): Upper quantile (0-1) used for
            outlier detection. Defaults to 0.95.

    Returns:
        pd.DataFrame: A new, preprocessed DataFrame; ``data`` is not modified.

    Warns:
        UserWarning: If ``handle_negative_values`` or ``handle_outliers`` is
            not one of the documented options. The corresponding step is then
            skipped, exactly as for "keep".
    """
    import warnings

    negative_options = ("keep", "replace_with_zero", "replace_with_nan", "drop")
    outlier_options = ("keep", "clip", "replace_with_nan", "drop")

    # An unrecognised option used to be ignored silently, which hides typos
    # in callers. Warn (rather than raise) so existing calls keep running.
    if handle_negative_values not in negative_options:
        warnings.warn(
            f"Unknown handle_negative_values={handle_negative_values!r}; "
            f"expected one of {negative_options}. Negative values are kept.",
            stacklevel=2,
        )
    if handle_outliers not in outlier_options:
        warnings.warn(
            f"Unknown handle_outliers={handle_outliers!r}; "
            f"expected one of {outlier_options}. Outliers are kept.",
            stacklevel=2,
        )

    data = data.drop(columns=["product", "roast_color"], errors="ignore")

    if handle_negative_values == "replace_with_zero":
        data = data.clip(lower=0)
    elif handle_negative_values == "replace_with_nan":
        data = data.where(data >= 0)
    elif handle_negative_values == "drop":
        # NaN >= 0 is False, so rows containing NaN are dropped as well.
        data = data[(data >= 0).all(axis=1)]

    if handle_outliers in ("clip", "replace_with_nan", "drop"):
        # Per-column bounds: the chosen quantiles widened by
        # outlier_threshold times the distance between them.
        lower_quantile = data.quantile(lower_percentile)
        upper_quantile = data.quantile(upper_percentile)
        spread = upper_quantile - lower_quantile
        lower_bound = lower_quantile - outlier_threshold * spread
        upper_bound = upper_quantile + outlier_threshold * spread

        if handle_outliers == "clip":
            data = data.clip(lower=lower_bound, upper=upper_bound, axis=1)
        elif handle_outliers == "replace_with_nan":
            data = data.where((data >= lower_bound) & (data <= upper_bound))
        else:  # "drop"
            # Comparisons with NaN are False, so rows with NaN are dropped too.
            data = data[((data >= lower_bound) & (data <= upper_bound)).all(axis=1)]

    return data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor

from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
    root_mean_squared_error,
)

import numpy as np


def evaluate(y_true, y_pred):
    """Compute the regression metrics for a set of predictions.

    Returns:
        A tuple ``(mse, rmse, r2, mae, mape)``.
    """
    metric_functions = (
        mean_squared_error,
        root_mean_squared_error,
        r2_score,
        mean_absolute_error,
        mean_absolute_percentage_error,
    )
    return tuple(metric(y_true, y_pred) for metric in metric_functions)


def print_metrics(y_true, y_pred):
    """Print MSE, RMSE, R^2, MAE and MAPE, one per line, with two decimals."""
    labels = ("MSE", "RMSE", "R^2", "MAE", "MAPE")
    for label, value in zip(labels, evaluate(y_true, y_pred)):
        print(f"{label}: {value:.2f}")


def filter_df_by_product(df_in, product_name, products=None):
    """Return the rows of ``df_in`` whose product label equals ``product_name``.

    The labels are joined onto ``df_in`` by index, so ``df_in`` does not need
    to carry a "product" column itself (and must not already have one).

    Args:
      df_in: The DataFrame to filter.
      product_name: The product label to keep. It is compared by equality, so
        it may contain quotes or other special characters.
      products: Optional Series of product labels sharing ``df_in``'s index.
        Defaults to the "product" column of the notebook-level ``df``.

    Returns:
      A new DataFrame with the columns of ``df_in`` and only the rows whose
      label matches. Rows without a label are excluded.
    """
    if products is None:
        # Backward-compatible default: look the labels up in the global frame.
        products = df["product"]

    labelled = df_in.join(products.rename("product"))
    matches = labelled["product"] == product_name
    return labelled.loc[matches].drop(columns=["product"])

### Experimentos



In [None]:
df1 = preprocess_data(
    df, handle_negative_values="keep", handle_outliers="keep"
)  # Values are left untouched; only "product"/"roast_color" are dropped

In [None]:
# 80% of the rows go to training; the remaining 20% is split again below.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns=["color"]), df1["color"], test_size=0.2, random_state=42
)

# Split that 20% evenly into a test half and a validation half.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

print(
    X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape, y_val.shape
)

In [None]:
# Display the training features.
X_train

#### Experimento 1: Sem tratamento de valores extremos e negativos
Para um primeiro teste, não removemos os valores extremos ou negativos e usamos uma estratégia simples de preenchimento de valores faltantes. Para isso, usamos a média dos valores da coluna.

##### Dummy Regressor (Baseline)

In [None]:
# Baseline: a mean-strategy DummyRegressor fitted on log(color); predictions
# are mapped back with exp.
# NOTE(review): np.log gives -inf for color == 0, and the earlier filter only
# removes color < 0 — confirm the data has no zero colors.
regressor = TransformedTargetRegressor(
    regressor=DummyRegressor(strategy="mean"),
    func=np.log,
    inverse_func=np.exp,
)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)


print("Predições:")
print_metrics(y_test, y_pred)


# Same metrics restricted to the test rows of each product.
print("\nAMSTEL")
test_amstel = filter_df_by_product(X_test, "AMST")
y_pred = regressor.predict(test_amstel)
print_metrics(y_test.loc[test_amstel.index], y_pred)

print("\nHeineken")
test_heineken = filter_df_by_product(X_test, "HNK")
y_pred = regressor.predict(test_heineken)
print_metrics(y_test.loc[test_heineken.index], y_pred)

##### Testar múltiplos modelos

Usamos o LazyPredict para treinar diversos modelos de regressão e escolher o melhor para a tarefa. 

In [None]:
from lazypredict.Supervised import LazyRegressor

# LazyRegressor with MAE passed as the additional custom metric.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=mean_absolute_error)

In [None]:
# Fit on the training split and score on the validation split.
models, predictions = reg.fit(X_train, X_val, y_train, y_val)

# Discard the output printed during fitting before showing the results table.
clear_output()
models

O melhor modelo segundo o LazyPredict foi ExtraTreesRegressor, no entanto, ao utilizar o modelo para fazer previsões, o resultado r2 se mostrou muito baixo.

In [None]:
# Evaluate the three first-ranked models on the test split, overall and on
# the Amstel rows only.
top_models = models.iloc[:3]
test_amstel = filter_df_by_product(X_test, "AMST")  # same subset for every model

print("Predições:\n\n")
for model_name in top_models.index:
    fitted_model = reg.models[model_name]

    y_pred = fitted_model.predict(X_test)
    print("-" * 5, model_name, "-" * 5)
    print("\nGeral:")
    print_metrics(y_test, y_pred)

    print("\nAMSTEL")
    y_pred = fitted_model.predict(test_amstel)
    print_metrics(y_test.loc[test_amstel.index], y_pred)

#### Experimento 2: Com tratamento de valores extremos e negativos

Na exploração de dados, identificamos que existem valores extremos e negativos que podem estar prejudicando o treinamento dos modelos. Por isso, vamos tratar esses valores e treinar os modelos novamente. Os valores faltantes são preenchidos com a média dos valores da coluna pelo LazyPredict.


In [None]:
# In this strategy, rows containing negative values or outliers are removed.
_X_train, _X_test, _X_val = (
    preprocess_data(split, handle_negative_values="drop", handle_outliers="drop")
    for split in (X_train, X_test, X_val)
)

# Keep only the targets of the rows that survived the filtering.
_y_train, _y_test, _y_val = (
    target.loc[features.index]
    for target, features in (
        (y_train, _X_train),
        (y_test, _X_test),
        (y_val, _X_val),
    )
)

##### Dummy Regressor (Baseline)

In [None]:
# Same log-target mean baseline as before, now fitted and scored on the
# filtered splits.
regressor = TransformedTargetRegressor(
    regressor=DummyRegressor(strategy="mean"),
    func=np.log,
    inverse_func=np.exp,
)

regressor.fit(_X_train, _y_train)

_y_pred = regressor.predict(_X_test)

print_metrics(_y_test, _y_pred)

##### Testar múltiplos modelos

In [None]:
# Fresh LazyRegressor fitted on the filtered training split and scored on the
# filtered validation split.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=mean_absolute_error)

models, predictions = reg.fit(_X_train, _X_val, _y_train, _y_val)

# Discard the output printed during fitting before showing the results table.
clear_output()
models

In [None]:
# Evaluate the three first-ranked models on the filtered test split, overall
# and on the Amstel rows only.
top_models = models.iloc[:3]
test_amstel = filter_df_by_product(_X_test, "AMST")  # same subset for every model

print("Predições:\n\n")
for model_name in top_models.index:
    fitted_model = reg.models[model_name]

    y_pred = fitted_model.predict(_X_test)
    print("-" * 5, model_name, "-" * 5)
    print("\nGeral:")
    print_metrics(_y_test, y_pred)

    print("\nAMSTEL")
    y_pred = fitted_model.predict(test_amstel)
    print_metrics(_y_test.loc[test_amstel.index], y_pred)


A remoção dos valores extremos ajudou no r2 e no rmse do modelo ExtraTreesRegressor, mas, ao remover esses valores, nosso conjunto de dados foi reduzido significativamente. A seguir, não vamos remover os valores extremos, mas substituí-los pela média dos valores da coluna.

In [None]:
# Negative values and outliers become NaN here, so the downstream imputer
# fills them in instead of the rows being lost.
strategy = "replace_with_nan"

_X_train, _X_test, _X_val = (
    preprocess_data(split, handle_negative_values=strategy, handle_outliers=strategy)
    for split in (X_train, X_test, X_val)
)

# Align the targets with the rows of each preprocessed split.
_y_train, _y_test, _y_val = (
    target.loc[features.index]
    for target, features in (
        (y_train, _X_train),
        (y_test, _X_test),
        (y_val, _X_val),
    )
)

In [None]:
# Fresh LazyRegressor fitted on the NaN-replaced training split and scored on
# the NaN-replaced validation split.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=mean_absolute_error)

models, predictions = reg.fit(_X_train, _X_val, _y_train, _y_val)

# Discard the output printed during fitting before showing the results table.
clear_output()

models

In [None]:
# Evaluate the three first-ranked models on the preprocessed test split,
# overall and on the Amstel rows only.
top_models = models.iloc[:3]
test_amstel = filter_df_by_product(_X_test, "AMST")  # same subset for every model

for model_name in top_models.index:
    fitted_model = reg.models[model_name]

    y_pred = fitted_model.predict(_X_test)
    print("-" * 5, model_name, "-" * 5)
    print("\nGeral:")
    print_metrics(_y_test, y_pred)

    print("\nAMSTEL")
    y_pred = fitted_model.predict(test_amstel)
    print_metrics(_y_test.loc[test_amstel.index], y_pred)

No geral, as métricas foram piores quando comparadas às da remoção. Substituir os outliers usando a estratégia de clipping não melhorou o modelo em comparação com a remoção dos outliers.

In [None]:
strategy = "clip"

# "clip" is only an option of handle_outliers. Passing it as
# handle_negative_values matched no branch of preprocess_data, so negative
# values were silently kept. "replace_with_zero" is the clipping counterpart
# for negatives (it applies data.clip(lower=0)).
_X_train, _X_test, _X_val = (
    preprocess_data(
        split,
        handle_negative_values="replace_with_zero",
        handle_outliers=strategy,
    )
    for split in (X_train, X_test, X_val)
)

# Align the targets with the rows of each preprocessed split.
_y_train = y_train.loc[_X_train.index]
_y_test = y_test.loc[_X_test.index]
_y_val = y_val.loc[_X_val.index]

# Fresh LazyRegressor fitted on the clipped training split and scored on the
# clipped validation split.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=mean_absolute_error)

models, predictions = reg.fit(_X_train, _X_val, _y_train, _y_val)

# Discard the output printed during fitting before showing the results table.
clear_output()

models

In [None]:
# Evaluate the three first-ranked models on the preprocessed test split,
# overall and on the Amstel rows only.
top_models = models.iloc[:3]
test_amstel = filter_df_by_product(_X_test, "AMST")  # same subset for every model

for model_name in top_models.index:
    fitted_model = reg.models[model_name]

    y_pred = fitted_model.predict(_X_test)
    print("-" * 5, model_name, "-" * 5)
    print("\nGeral:")
    print_metrics(_y_test, y_pred)

    print("\nAMSTEL")
    y_pred = fitted_model.predict(test_amstel)
    print_metrics(_y_test.loc[test_amstel.index], y_pred)

#### Experimento 3: Busca de normalizador e estratégias de preenchimento dos dados

O modelo final escolhido foi o ExtraTreesRegressor, pois foi o que apresentou "melhores" métricas. A seguir, vamos testar diferentes estratégias de tratamento de valores faltantes para tentar melhorar o modelo. Primeiro substituímos os valores extremos/negativos por nan com o propósito de testar diferentes abordagens de preenchimento desses valores.

In [None]:
# Turn negative values and outliers into NaN in every split so that the
# imputers compared below have something to fill in.
_X_train, _X_test, _X_val = (
    preprocess_data(
        split,
        handle_negative_values="replace_with_nan",
        handle_outliers="replace_with_nan",
    )
    for split in (X_train, X_test, X_val)
)

# Align the targets with the rows of each preprocessed split.
_y_train, _y_test, _y_val = (
    target.loc[features.index]
    for target, features in (
        (y_train, _X_train),
        (y_test, _X_test),
        (y_val, _X_val),
    )
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
import optuna


def objective(trial):
    """Optuna objective: validation RMSE of one imputer/scaler/target setup.

    The trial picks an imputer (with its own hyper-parameters), a feature
    scaler, a target transformation and the number of trees of an
    ExtraTreesRegressor. The model is fitted on the notebook-level
    ``_X_train``/``_y_train`` and scored on ``_X_val``/``_y_val``.

    Returns:
        The root mean squared error on the validation split (lower is better).
    """
    imputer_kind = trial.suggest_categorical("imputer", ["simple", "knn", "iterative"])
    scaler_kind = trial.suggest_categorical("scaler", ["standard", "minmax", "robust"])
    target_kind = trial.suggest_categorical(
        "target_transformer", ["log", "minmax", "robust", "standard", "none"]
    )

    # Hyper-parameters that only exist for the selected imputer.
    if imputer_kind == "knn":
        imputation_step = KNNImputer(
            n_neighbors=trial.suggest_int("n_neighbors", 3, 10),
            weights=trial.suggest_categorical("weight", ["uniform", "distance"]),
        )
    elif imputer_kind == "iterative":
        imputation_step = IterativeImputer(
            max_iter=trial.suggest_int("max_iter", 5, 20),
            random_state=42,
        )
    else:  # "simple"
        imputation_step = SimpleImputer(
            strategy=trial.suggest_categorical(
                "strategy", ["mean", "median", "most_frequent"]
            )
        )

    # The same three scaler classes serve for features and for the target.
    scaler_classes = {
        "standard": preprocessing.StandardScaler,
        "minmax": preprocessing.MinMaxScaler,
        "robust": preprocessing.RobustScaler,
    }
    scaling_step = scaler_classes[scaler_kind]()

    forest = ExtraTreesRegressor(
        n_estimators=trial.suggest_int("n_estimators", 25, 100),
        random_state=42,
    )

    model = Pipeline(
        [
            ("imputer", imputation_step),
            ("scaler", scaling_step),
            ("regressor", forest),
        ]
    )

    # Optionally wrap the pipeline so that it is trained on a transformed
    # target; "none" leaves the pipeline as it is.
    if target_kind == "log":
        model = TransformedTargetRegressor(
            regressor=model,
            func=np.log,
            inverse_func=np.exp,
        )
    elif target_kind in scaler_classes:
        model = TransformedTargetRegressor(
            regressor=model,
            transformer=scaler_classes[target_kind](),
        )

    model.fit(_X_train, _y_train)
    val_predictions = model.predict(_X_val)
    return root_mean_squared_error(_y_val, val_predictions)


# Seed the sampler so that the search, and therefore the reported best trial,
# is reproducible across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

Alterar o tipo de preenchimento dos valores faltantes e o normalizador não influenciou significativamente as métricas do modelo, de acordo com os logs do Optuna.

In [None]:
# Show the parameters of the best trial of the study.
trial = study.best_trial
trial.params

In [None]:
# Hard-coded configuration: median imputation, min-max feature scaling, a
# robust-scaled target and ExtraTreesRegressor with its default tree count.
# NOTE(review): presumably mirrors the best Optuna trial — confirm against
# trial.params, since nothing is read from the study here.
reg = ExtraTreesRegressor(random_state=42)
imputer = SimpleImputer(strategy="median")
scaler = preprocessing.MinMaxScaler()

pipe = Pipeline(
    [
        ("imputer", imputer),
        ("scaler", scaler),
        ("regressor", reg),
    ]
)

pipe = TransformedTargetRegressor(
    regressor=pipe,
    transformer=preprocessing.RobustScaler(),
)

pipe.fit(_X_train, _y_train)

# Score on the test split, which the Optuna objective never used.
_y_pred = pipe.predict(_X_test)

print_metrics(_y_test, _y_pred)

#### Outros Experimentos

Experimentos extras em que exploramos aumentar os dados artificialmente e discretizar as variáveis de entrada.

##### Construção e seleção de features

Experimentamos a construção de features a partir das existentes, utilizando operadores básicos (soma, subtração, multiplicação e divisão) sobre pares de features. Por causa da grande quantidade de features geradas, é necessária uma etapa de seleção.

In [None]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor


# Reference model for this experiment: default SimpleImputer, standard
# scaling and ExtraTreesRegressor, trained on a robust-scaled target.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", preprocessing.StandardScaler()),
        ("regressor", ExtraTreesRegressor(random_state=42)),
    ]
)

pipe = TransformedTargetRegressor(
    regressor=pipe,
    transformer=preprocessing.RobustScaler(),
)

In [None]:
# Turn negative values and outliers into NaN in every split.
_X_train, _X_test, _X_val = (
    preprocess_data(
        split,
        handle_negative_values="replace_with_nan",
        handle_outliers="replace_with_nan",
    )
    for split in (X_train, X_test, X_val)
)

# Align the targets with the rows of each preprocessed split.
_y_train, _y_test, _y_val = (
    target.loc[features.index]
    for target, features in (
        (y_train, _X_train),
        (y_test, _X_test),
        (y_val, _X_val),
    )
)

In [None]:
# Feature construction
# Golden Features
# A partir de cada par de features originais, cria uma nova feature usando operadores matemáticos: +, -, /, * e avalia seu poder preditivo.
def get_golden_features(data):
    """Build pairwise arithmetic features from the columns of ``data``.

    For every unordered pair of columns ``(col1, col2)``, taken in column
    order, four features are created: ``col1_plus_col2``, ``col1_minus_col2``,
    ``col1_times_col2`` and ``col1_div_col2``.

    Args:
        data: DataFrame holding the original columns.

    Returns:
        A DataFrame with the same index as ``data`` and only the generated
        columns (no columns when ``data`` has fewer than two). Division by
        zero is not handled here, so the ratio features may contain inf/NaN.
    """
    from itertools import combinations

    # Collect all columns first and build the frame once. Inserting hundreds
    # of columns one at a time fragments the DataFrame and triggers pandas'
    # PerformanceWarning.
    generated = {}
    for col1, col2 in combinations(data.columns, 2):
        left, right = data[col1], data[col2]
        generated[f"{col1}_plus_{col2}"] = left + right
        generated[f"{col1}_minus_{col2}"] = left - right
        generated[f"{col1}_times_{col2}"] = left * right
        generated[f"{col1}_div_{col2}"] = left / right
    return pd.DataFrame(generated, index=data.index)

# Replace each split by its generated pairwise features (the original columns
# are not carried over by get_golden_features).
_X_train = get_golden_features(_X_train)
_X_val = get_golden_features(_X_val)
_X_test = get_golden_features(_X_test)

_X_train.shape, _X_val.shape, _X_test.shape

In [None]:
import numpy as np
# The ratio features can contain +/-inf (division by zero), which makes
# scikit-learn raise "ValueError: Input X contains infinity or a value too
# large for dtype('float64')". Treat those entries as missing.
_X_train = _X_train.replace([np.inf, -np.inf], np.nan)
_X_val = _X_val.replace([np.inf, -np.inf], np.nan)
_X_test = _X_test.replace([np.inf, -np.inf], np.nan)

# Impute every split with the TRAINING means: validation/test statistics must
# not leak into the features, and all splits must share the same mapping.
train_means = _X_train.mean()
_X_train = _X_train.fillna(train_means)
_X_val = _X_val.fillna(train_means)
_X_test = _X_test.fillna(train_means)

In [None]:
# Feature selection: keep the 100 best-scoring features.
from sklearn.feature_selection import SelectKBest, f_regression

# Scores are computed on the training split only; the fitted selector is then
# applied to all three splits.
selector = SelectKBest(score_func=f_regression, k=100)
selector.fit(_X_train,_y_train)
_X_train = selector.transform(_X_train)
_X_val = selector.transform(_X_val)
_X_test = selector.transform(_X_test)

A estratégia não obteve ganhos significativos.

In [None]:
# Fit the reference pipeline on the selected features and score it on the
# test split.
pipe.fit(_X_train, _y_train)
y_pred = pipe.predict(_X_test)
print_metrics(_y_test, y_pred)

##### Discretização das variáveis de entrada

Avaliamos o desempenho do modelo ao discretizar as variáveis de entrada.

In [None]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor


# Reference model for this experiment: default SimpleImputer, standard
# scaling and ExtraTreesRegressor, trained on a robust-scaled target.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", preprocessing.StandardScaler()),
        ("regressor", ExtraTreesRegressor(random_state=42)),
    ]
)

pipe = TransformedTargetRegressor(
    regressor=pipe,
    transformer=preprocessing.RobustScaler(),
)

In [None]:
# Remove rows with negative values or outliers from every split. The test
# split previously passed handle_outliers="replace_dropwith_nan", a typo that
# matched no option and left its outliers untreated.
_X_train, _X_test, _X_val = (
    preprocess_data(split, handle_negative_values="drop", handle_outliers="drop")
    for split in (X_train, X_test, X_val)
)

# Keep only the targets of the rows that survived the filtering.
_y_train = y_train.loc[_X_train.index]
_y_test = y_test.loc[_X_test.index]
_y_val = y_val.loc[_X_val.index]

In [None]:
# First three rows before discretization.
_X_train.iloc[:3]

In [None]:
# Discretize the data by replacing each value with the bin it falls into,
# using scikit-learn's KBinsDiscretizer.
from sklearn.preprocessing import KBinsDiscretizer

# 30 equal-width bins per feature, encoded as ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=30, encode="ordinal", strategy="uniform")

# Bin edges are learned on the training split and reused for the others.
_X_train = discretizer.fit_transform(_X_train)
_X_val = discretizer.transform(_X_val)
_X_test = discretizer.transform(_X_test)


In [None]:
# First three rows after discretization.
_X_train[:3]

Sem ganhos significativos

In [None]:
# Fit the reference pipeline on the discretized features and score it on the
# test split.
pipe.fit(_X_train, _y_train)
y_pred = pipe.predict(_X_test)
print_metrics(_y_test, y_pred)

## Análise de Resultados

Como a escolha de estratégia de preenchimento de valores faltantes, normalização e tratamento de valores extremos e negativos, entre outros experimentos, não influenciaram significativamente nas métricas do modelo, uma possível melhoria seria a coleta de mais dados para treinamento do modelo. Usamos a configuração padrão do modelo ExtraTreesRegressor, preenchimento de valores faltantes com a média dos valores da coluna e o normalizador StandardScaler.

In [None]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor


# Final model: default SimpleImputer, standard scaling and a default
# ExtraTreesRegressor, trained on a robust-scaled target.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", preprocessing.StandardScaler()),
        ("regressor", ExtraTreesRegressor(random_state=42)),
    ]
)

pipe = TransformedTargetRegressor(
    regressor=pipe,
    transformer=preprocessing.RobustScaler(),
)

### Treinando usando amstel e heineken

Recapitulando: o conjunto de teste (gold) separado no início do notebook contém apenas dados da Amstel.

Desejamos saber o desempenho do modelo para estimar a cor da marca, para isso, treinamos o modelo usando os dados de ambas as marcas, vemos que as métricas foram similares as obtidas nas análises anteriores.

In [None]:
strategy = "replace_with_nan"

# Separate the target BEFORE preprocessing: preprocess_data treats every
# column it receives, so leaving "color" in could turn a label into NaN.
X_train = preprocess_data(
    df.drop(columns=["color"]),
    handle_negative_values=strategy,
    handle_outliers=strategy,
)
y_train = df["color"]

pipe.fit(X_train, y_train)

# df_test is not reassigned, so the raw hold-out set stays available and
# re-running this cell does not preprocess already-preprocessed data.
X_test = preprocess_data(
    df_test.drop(columns=["color"]),
    handle_negative_values=strategy,
    handle_outliers=strategy,
)
y_test = df_test["color"]

y_pred = pipe.predict(X_test)

print_metrics(y_test, y_pred)

In [None]:
# Baseline for comparison: mean-strategy DummyRegressor on log(color), with
# predictions mapped back through exp, scored on the held-out test set.
reg = DummyRegressor(strategy="mean")

regressor = TransformedTargetRegressor(
    regressor=reg,
    func=np.log,
    inverse_func=np.exp,
)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

print_metrics(y_test, y_pred)

In [None]:
# pipe is a TransformedTargetRegressor: regressor_ is its fitted inner
# Pipeline, and "regressor" is the ExtraTrees step inside it.
extra_tree_model = pipe.regressor_["regressor"]
feature_importance = extra_tree_model.feature_importances_

# Feature positions ordered from most to least important.
sorted_idx = np.argsort(feature_importance)[::-1]

plt.figure(figsize=(10, 6))
plt.barh(np.array(X_train.columns)[sorted_idx], feature_importance[sorted_idx], color="#00561F")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance")
plt.show()

Usar parte das features não mudou significativamente as métricas do modelo.

In [None]:
# Keep the ten features ranked highest by the importances computed above.
most_important_features = X_train.columns[sorted_idx][:10]

# Fresh, unfitted copy of the final pipeline.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", preprocessing.StandardScaler()),
        ("regressor", ExtraTreesRegressor(random_state=42)),
    ]
)

pipe = TransformedTargetRegressor(
    regressor=pipe,
    transformer=preprocessing.RobustScaler(),
)

# Train and evaluate using only the selected columns.
pipe.fit(X_train[most_important_features], y_train)

y_pred = pipe.predict(X_test[most_important_features])

print_metrics(y_test, y_pred)

### Treinando usando apenas Amstel

As métricas apresentaram pouca variação em relação ao treinamento com os dados de ambas as marcas. Então, treinar o modelo com uma ou com as duas marcas não influenciou significativamente nas métricas.

In [None]:
strategy = "replace_with_nan"

# Separate the target BEFORE preprocessing: preprocess_data treats every
# column it receives, so leaving "color" in could turn a label into NaN.
X_train = preprocess_data(
    df_amstel.drop(columns=["color"]),
    handle_negative_values=strategy,
    handle_outliers=strategy,
)
y_train = df_amstel["color"]

pipe.fit(X_train, y_train)

# df_test is not reassigned, so re-running this cell does not preprocess the
# hold-out set one more time.
X_test = preprocess_data(
    df_test.drop(columns=["color"]),
    handle_negative_values=strategy,
    handle_outliers=strategy,
)
y_test = df_test["color"]

y_pred = pipe.predict(X_test)

print_metrics(y_test, y_pred)