# Extraindo os dados

In [1]:
import pandas as pd

# Load the car-sales simulation CSV straight from the gist and discard the
# "Unnamed: 0" column that comes with the file. `columns=` already names the
# axis, so no separate `axis` argument is needed.
uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"
dados = pd.read_csv(uri).drop(columns=["Unnamed: 0"])
dados.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129


In [2]:
# Generate random "car model" labels so that grouping can be simulated when
# evaluating our estimator: each row gets its model age plus a random integer
# offset in [-2, 2].
import numpy as np

SEED = 158020
np.random.seed(SEED)

# Draw exactly one offset per row rather than assuming a fixed row count.
dados["modelo"] = dados.idade_do_modelo + np.random.randint(-2, 3, size=len(dados))
# Shift by |min| + 1 so every label ends up strictly positive.
dados["modelo"] = dados.modelo + abs(dados.modelo.min()) + 1
dados.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,modelo
0,30941.02,1,18,35085.22134,19
1,40557.96,1,20,12622.05362,22
2,89627.5,0,12,11440.79806,14
3,95276.14,0,3,43167.32682,4
4,117384.68,1,4,12770.1129,4


# Analisando hiperparâmetros

In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate


def decision_tree_tests_pipeline(
    max_depth, x, y, groups, n_splits=10, SEED=None
):
    """Cross-validate a depth-limited decision tree using grouped folds.

    Parameters
    ----------
    max_depth : int
        Maximum depth passed to ``DecisionTreeClassifier``.
    x, y
        Features and target forwarded to ``cross_validate``.
    groups
        Per-sample group labels forwarded to ``cross_validate`` for the
        ``GroupKFold`` splitter.
    n_splits : int, default 10
        Number of folds used by ``GroupKFold``.
    SEED : int or None, default None
        Value handed to ``np.random.seed`` before the evaluation. Pass an
        integer for a reproducible run; ``None`` reseeds NumPy's global
        generator unpredictably.

    Returns
    -------
    list
        ``[max_depth, train_score, test_score]`` where both scores are the
        mean over the folds multiplied by 100.
    """
    # The default must be a valid seed: a callable here (such as
    # ``np.random.seed`` itself) makes every call without SEED raise.
    np.random.seed(SEED)

    cv = GroupKFold(n_splits=n_splits)
    modelo = DecisionTreeClassifier(max_depth=max_depth)
    results = cross_validate(
        modelo, x, y, cv=cv, groups=groups, return_train_score=True
    )
    train_score = results["train_score"].mean() * 100
    test_score = results["test_score"].mean() * 100
    return [max_depth, train_score, test_score]

In [4]:
# Feature matrix, target and the group label consumed by the grouped splitter.
x = dados.loc[:, ["preco", "idade_do_modelo", "km_por_ano"]]
y = dados.vendido
groups = dados.modelo

# Evaluate one tree per depth from 1 through 32; each call yields one
# [max_depth, train, test] row.
results = [
    decision_tree_tests_pipeline(
        max_depth=depth, x=x, y=y, groups=groups, n_splits=10, SEED=SEED
    )
    for depth in range(1, 33)
]
df_results = pd.DataFrame(data=results, columns=["max_depth", "train", "test"])
df_results.head()

Unnamed: 0,max_depth,train,test
0,1,75.791256,75.790527
1,2,75.791256,75.790527
2,3,78.755626,78.686402
3,4,78.791207,78.666231
4,5,78.953333,78.667173


In [5]:
import plotly.express as px

# Plot the train and test score columns against tree depth. The title and
# labels replace plotly's default wide-form names ("value" / "variable") so
# the figure can be read on its own.
fig = px.line(
    df_results,
    x="max_depth",
    y=["train", "test"],
    title="Decision tree score by max_depth (train vs. test)",
    labels={"value": "mean score (%)", "variable": "split"},
)
fig.show()