# Tratando dataframe

In [1]:
import pandas as pd

uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"

dados = pd.read_csv(uri).drop(columns=["Unnamed: 0"], axis=1)

dados.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129


In [2]:
x = dados.drop(['vendido'], axis = 1)
y = dados['vendido']

# Separando dados de teste e treino

In [3]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=5)

# Modelo Dummy

In [4]:
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier()
dummy_classifier.fit(xtrain, ytrain)
dummy_accuracy = dummy_classifier.score(xtest, ytest)

print(f'A acurácia do dummy classifier foi de {(dummy_accuracy * 100):.2f}%')

A acurácia do dummy classifier foi de 59.16%


# Modelo DecisionTreeClassifier

In [5]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_classifier = DecisionTreeClassifier(max_depth=2)
decision_tree_classifier.fit(xtrain, ytrain)
tree_accuracy = decision_tree_classifier.score(xtest, ytest)

print(f'A acurácia do decision tree classifier foi de {(tree_accuracy * 100):.2f}%')

A acurácia do decision tree classifier foi de 75.20%


# Modelo KneighborsClassifier

In [6]:
from sklearn.neighbors import KNeighborsClassifier

kneighbors_classifier = KNeighborsClassifier(n_neighbors=3)
kneighbors_classifier.fit(xtrain, ytrain)
kneighbors_accuracy = kneighbors_classifier.score(xtest, ytest)

print(f'A acurácia do kneighboors classifier foi de {(kneighbors_accuracy * 100):.2f}%')

A acurácia do kneighboors classifier foi de 74.96%


In [7]:
from sklearn.model_selection import cross_validate

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x, y, cv = 5, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [75.21% , 76.35%]


# Trabalhando com aleatoriedade no cross validate

In [8]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x, y, cv = cv, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [74.37% , 77.19%]


In [9]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, shuffle=True)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x, y, cv = cv, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [73.64% , 77.92%]


In [10]:
dados_azar = dados.sort_values('vendido', ascending=True)
x_azar = dados_azar.drop(['vendido'], axis = 1)
y_azar = dados_azar['vendido']

In [11]:
cv = KFold(n_splits=10)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x_azar, y_azar, cv = cv, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [34.29% , 81.39%]


In [12]:
cv = KFold(n_splits=10, shuffle=True)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x_azar, y_azar, cv = cv, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [74.56% , 77.00%]


In [13]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x_azar, y_azar, cv = cv, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [74.19% , 77.39%]


In [14]:
import numpy as np

In [15]:
SEED = 200
np.random.seed(SEED)

dados['modelo'] = dados.idade_do_modelo + np.random.randint(-2, 3, len(dados))
dados.modelo = dados.modelo + abs(dados.modelo.min()) + 1
dados.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,modelo
0,30941.02,1,18,35085.22134,20
1,40557.96,1,20,12622.05362,21
2,89627.5,0,12,11440.79806,12
3,95276.14,0,3,43167.32682,7
4,117384.68,1,4,12770.1129,6


In [25]:
from sklearn.model_selection import GroupKFold

cv = GroupKFold(n_splits=10)

dtc = DecisionTreeClassifier(max_depth=2)

results = cross_validate(dtc, x_azar, y_azar, cv = cv, groups=dados.modelo, return_train_score = False)

media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()

intervalo = media - 2 * desvio_padrao, media + 2 * desvio_padrao

print(f'Intervalo : [{(intervalo[0] * 100):.2f}% , {(intervalo[1] * 100):.2f}%]')


Intervalo : [74.33% , 77.24%]
