In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
uri = 'https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv'
dados = pd.read_csv(uri)

In [3]:
dados.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [4]:
a_renomear = {
    'mileage_per_year': 'milhas_por_ano',
    'model_year': 'ano_do_modelo',
    'price':'preco',
    'sold': 'vendido'
}

In [5]:
dados = dados.rename(columns= a_renomear)
dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [6]:
a_trocar = {
    'yes':1,
    'no':0
}

dados['vendido'] = dados['vendido'].map(a_trocar)
dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [7]:
# Criando uma coluna para armazenar os anos de vida do carro
from datetime import datetime
ano_atual = datetime.today().year

dados['idade_modelo'] = ano_atual - dados['ano_do_modelo']
dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido,idade_modelo
0,0,21801,2000,30941.02,1,22
1,1,7843,1998,40557.96,1,24
2,2,7109,2006,89627.5,0,16
3,3,26823,2015,95276.14,0,7
4,4,7935,2014,117384.68,1,8


In [8]:
dados['km_por_ano'] = dados['milhas_por_ano'] * 1.60934
dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido,idade_modelo,km_por_ano
0,0,21801,2000,30941.02,1,22,35085.22134
1,1,7843,1998,40557.96,1,24,12622.05362
2,2,7109,2006,89627.5,0,16,11440.79806
3,3,26823,2015,95276.14,0,7,43167.32682
4,4,7935,2014,117384.68,1,8,12770.1129


In [9]:
dados.drop(['Unnamed: 0', 'milhas_por_ano', 'ano_do_modelo'], axis= 1, inplace= True)

In [10]:
dados.head()

Unnamed: 0,preco,vendido,idade_modelo,km_por_ano
0,30941.02,1,22,35085.22134
1,40557.96,1,24,12622.05362
2,89627.5,0,16,11440.79806
3,95276.14,0,7,43167.32682
4,117384.68,1,8,12770.1129


In [11]:
# Performance do Modelo com dados originais
x = dados.drop(['vendido'], axis =1)
y= dados['vendido']


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


# Quando utilizo este Seed o SKlearning assume este valor como seed para todos os métodos
SEED = 5
np.random.seed(SEED)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25, random_state= 20, stratify= y)

modelo = SVC(gamma= 'auto')

modelo.fit(x_train, y_train)

previsoes = modelo.predict(x_test)

accuracy_score(y_test, previsoes) *100

57.99999999999999

In [12]:
# Performance do modelo com dados Escalonados
SEED = 5
np.random.seed(SEED)


raw_x_train, raw_x_test, y_train, y_test = train_test_split(x, y, test_size= .25, stratify= y)

# 
scaler = StandardScaler()
x_train = scaler.fit_transform(raw_x_train)
x_test = scaler.fit_transform(raw_x_test)


modelo = SVC(gamma= 'auto')

modelo.fit(x_train, y_train)

previsoes = modelo.predict(x_test)

accuracy_score(y_test, previsoes) *100

77.4

### Gerando Dados aleatórios para ter uma BaseLine

In [13]:
from sklearn.dummy import DummyClassifier

dummy_stratified = DummyClassifier()
dummy_stratified.fit(x_train, y_train)

previsoes = dummy_stratified.predict(x_test)

accuracy_score(y_test, previsoes) * 100

57.99999999999999

In [14]:
from sklearn.dummy import DummyClassifier

dummy_mostfrequent = DummyClassifier()
dummy_mostfrequent.fit(x_train, y_train)

previsoes = dummy_mostfrequent.predict(x_test)

accuracy_score(y_test, previsoes) * 100

57.99999999999999

### Clasfficador de Árvore

In [18]:
# Performance do Modelo com dados originais
x = dados.drop(['vendido'], axis =1)
y = dados['vendido']


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


# Quando utilizo este Seed o SKlearning assume este valor como seed para todos os métodos
SEED = 5
np.random.seed(SEED)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25, random_state= 20, stratify= y)

modelo = DecisionTreeClassifier(max_depth= 5)

modelo.fit(x_train, y_train)

previsoes = modelo.predict(x_test)

accuracy_score(y_test, previsoes) *100

77.75999999999999