### Problema de negócio
Construir uma máquina preditiva para prever o preço de venda dos carros da empresa.

Esse processo de previsão vai automatizar e otimizar a definição dos preços dos carros que serão vendidos pelo app.



In [454]:
# biblioteca
import pandas as pd
import seaborn as sns
import time
import warnings
from ydata_profiling.profile_report import ProfileReport
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVR
import joblib
warnings.filterwarnings("ignore")

In [340]:
# Load the raw car-price dataset from the project's data folder and preview
# its first rows. Every later cell works on this `dados` frame.
dados = pd.read_csv("dados/CarPrice.csv")

dados.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [341]:
#relatório geral
#ProfileReport(dados).to_file("relatorio.html")

In [342]:
# Rename the columns kept for modelling to their Portuguese names.
# The frame is rebound rather than mutated with inplace=True, so the cell
# reads as a plain expression; keys missing from the frame are ignored by
# rename, which keeps the cell safe to re-run.
dados = dados.rename(
    columns={
        'symboling': 'risco',
        'fueltype': 'combustivel',
        'aspiration': 'turbo',
        'doornumber': 'portas_4mais',
        'carbody': 'carroceria',
        'drivewheel': 'tracao',
        'horsepower': 'potencia',
        'price': 'preco',
    }
)


In [343]:
# Discard the identifier/name columns and the technical attributes that are
# not used as model features. The result is rebound to `dados` instead of
# dropping in place.
dados = dados.drop(
    columns=[
        'car_ID', 'CarName', 'enginelocation', 'wheelbase', 'carlength',
        'carwidth', 'carheight', 'curbweight', 'enginetype', 'cylindernumber',
        'enginesize', 'fuelsystem', 'boreratio', 'stroke', 'compressionratio',
        'peakrpm', 'citympg', 'highwaympg',
    ]
)

In [344]:
# Data-quality checks, shown in order:
#   1. number of missing values per column
#   2. number of fully duplicated rows
# NOTE(review): the duplicate count is taken after car_ID and CarName were
# dropped, so "duplicates" may be different cars that share the remaining
# attributes. The notebook keeps them; confirm that this is intended.
display(
    dados.isna().sum(),
    dados.duplicated().sum(),
)

risco           0
combustivel     0
turbo           0
portas_4mais    0
carroceria      0
tracao          0
potencia        0
preco           0
dtype: int64

7

In [345]:
# Recode the drivetrain column (tracao) to Portuguese labels.
# `replace` is used instead of `map` so the cell is idempotent: with `map`,
# re-running the cell on already-translated values turned the whole column
# into NaN, because none of the new labels are keys of the dict. With
# `replace`, values that are not keys are left untouched.
dados["tracao"] = dados["tracao"].replace(
    {'fwd': 'dianteira', 'rwd': 'traseira', '4wd': '4x4'}
)
# Frequency of each drivetrain label after recoding.
dados["tracao"].value_counts()

dianteira    120
traseira      76
4x4            9
Name: tracao, dtype: int64

In [346]:
# Recode the body-style column (carroceria) to Portuguese labels.
# 'hardtop' is folded into the same 'conversível' category as 'convertible'.
# `replace` is used instead of `map` so the cell is idempotent: with `map`,
# re-running the cell turned already-translated labels that are not dict
# keys ('hatch', 'perua', 'conversível') into NaN. `replace` leaves values
# that are not keys untouched.
dados["carroceria"] = dados["carroceria"].replace(
    {
        'convertible': 'conversível',
        'hatchback': 'hatch',
        'sedan': 'sedan',
        'wagon': 'perua',
        'hardtop': 'conversível',
    }
)
# Frequency of each body-style label after recoding.
dados["carroceria"].value_counts()

sedan          96
hatch          70
perua          25
conversível    14
Name: carroceria, dtype: int64

In [347]:
# Encode the turbo column as a 0/1 flag: 1 when the original value is
# 'turbo', 0 for anything else.
# NOTE: run once per data load. On an already-encoded column nothing equals
# 'turbo', so a second run would set every row to 0.
dados["turbo"] = (dados["turbo"] == 'turbo').astype("int64")
dados["turbo"].value_counts()

0    168
1     37
Name: turbo, dtype: int64

In [348]:
# Encode the door-count column as a 0/1 flag: 1 when the original value is
# 'four', 0 for anything else.
# NOTE: run once per data load. On an already-encoded column nothing equals
# 'four', so a second run would set every row to 0.
dados["portas_4mais"] = (dados["portas_4mais"] == 'four').astype("int64")
dados["portas_4mais"].value_counts()

1    115
0     90
Name: portas_4mais, dtype: int64

In [349]:
# Inspect the frame after the column recoding steps above.
dados

Unnamed: 0,risco,combustivel,turbo,portas_4mais,carroceria,tracao,potencia,preco
0,3,gas,0,0,conversível,traseira,111,13495.0
1,3,gas,0,0,conversível,traseira,111,16500.0
2,1,gas,0,0,hatch,traseira,154,16500.0
3,2,gas,0,1,sedan,dianteira,102,13950.0
4,2,gas,0,1,sedan,4x4,115,17450.0
...,...,...,...,...,...,...,...,...
200,-1,gas,0,1,sedan,traseira,114,16845.0
201,-1,gas,1,1,sedan,traseira,160,19045.0
202,-1,gas,0,1,sedan,traseira,134,21485.0
203,-1,diesel,1,1,sedan,traseira,106,22470.0


In [350]:
# One-hot encode the remaining text columns. Every level is kept here
# (drop_first=False); the reference level of each variable is removed
# explicitly in a later cell.
dados = pd.get_dummies(dados, drop_first=False)

dados

Unnamed: 0,risco,turbo,portas_4mais,potencia,preco,combustivel_diesel,combustivel_gas,carroceria_conversível,carroceria_hatch,carroceria_perua,carroceria_sedan,tracao_4x4,tracao_dianteira,tracao_traseira
0,3,0,0,111,13495.0,0,1,1,0,0,0,0,0,1
1,3,0,0,111,16500.0,0,1,1,0,0,0,0,0,1
2,1,0,0,154,16500.0,0,1,0,1,0,0,0,0,1
3,2,0,1,102,13950.0,0,1,0,0,0,1,0,1,0
4,2,0,1,115,17450.0,0,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,0,1,114,16845.0,0,1,0,0,0,1,0,0,1
201,-1,1,1,160,19045.0,0,1,0,0,0,1,0,0,1
202,-1,0,1,134,21485.0,0,1,0,0,0,1,0,0,1
203,-1,1,1,106,22470.0,1,0,0,0,0,1,0,0,1


In [351]:
# Remove one dummy column per encoded variable (gas, perua, dianteira), so
# these levels become the implicit reference categories. The reference
# levels are chosen by hand rather than through drop_first=True.
dados = dados.drop(
    columns=['combustivel_gas', 'carroceria_perua', 'tracao_dianteira']
)

dados

Unnamed: 0,risco,turbo,portas_4mais,potencia,preco,combustivel_diesel,carroceria_conversível,carroceria_hatch,carroceria_sedan,tracao_4x4,tracao_traseira
0,3,0,0,111,13495.0,0,1,0,0,0,1
1,3,0,0,111,16500.0,0,1,0,0,0,1
2,1,0,0,154,16500.0,0,0,1,0,0,1
3,2,0,1,102,13950.0,0,0,0,1,0,0
4,2,0,1,115,17450.0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
200,-1,0,1,114,16845.0,0,0,0,1,0,1
201,-1,1,1,160,19045.0,0,0,0,1,0,1
202,-1,0,1,134,21485.0,0,0,0,1,0,1
203,-1,1,1,106,22470.0,1,0,0,1,0,1


In [357]:
# Pairwise correlation between every column of the encoded frame, including
# the target column preco.
dados.corr()

Unnamed: 0,risco,turbo,portas_4mais,potencia,preco,combustivel_diesel,carroceria_conversível,carroceria_hatch,carroceria_sedan,tracao_4x4,tracao_traseira
risco,1.0,-0.059866,-0.664073,0.070873,-0.079978,-0.194311,0.316352,0.435648,-0.378341,-0.067222,-0.076381
turbo,-0.059866,1.0,0.031792,0.241685,0.177926,0.401397,-0.076773,0.009786,0.017111,0.085162,0.11247
portas_4mais,-0.664073,0.031792,1.0,-0.126947,0.031835,0.191491,-0.306038,-0.606688,0.515046,0.093614,-0.073958
potencia,0.070873,0.241685,-0.126947,1.0,0.808139,-0.163926,0.230584,-0.05377,-0.024097,-0.047715,0.575564
preco,-0.079978,0.177926,0.031835,0.808139,1.0,0.105679,0.298803,-0.262039,0.125716,-0.058866,0.638957
combustivel_diesel,-0.194311,0.401397,0.191491,-0.163926,0.105679,1.0,-0.023844,-0.202093,0.185623,-0.070457,0.122035
carroceria_conversível,0.316352,-0.076773,-0.306038,0.230584,0.298803,-0.023844,1.0,-0.194953,-0.25408,-0.058015,0.272648
carroceria_hatch,0.435648,0.009786,-0.606688,-0.05377,-0.262039,-0.202093,-0.194953,1.0,-0.675779,-0.053884,-0.148046
carroceria_sedan,-0.378341,0.017111,0.515046,-0.024097,0.125716,0.185623,-0.25408,-0.675779,1.0,-0.057956,0.008293
tracao_4x4,-0.067222,0.085162,0.093614,-0.047715,-0.058866,-0.070457,-0.058015,-0.053884,-0.057956,1.0,-0.164477


In [523]:
# Split the frame into the target (preco) and the feature matrix (every
# other column).
y = dados['preco']
x = dados.drop(columns=['preco'])
# Single-feature alternative kept from the original notebook:
#   x = dados[['potencia']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)



In [524]:
# Inspect the feature matrix (all columns of dados except preco).
x

Unnamed: 0,risco,turbo,portas_4mais,potencia,combustivel_diesel,carroceria_conversível,carroceria_hatch,carroceria_sedan,tracao_4x4,tracao_traseira
0,3,0,0,111,0,1,0,0,0,1
1,3,0,0,111,0,1,0,0,0,1
2,1,0,0,154,0,0,1,0,0,1
3,2,0,1,102,0,0,0,1,0,0
4,2,0,1,115,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
200,-1,0,1,114,0,0,0,1,0,1
201,-1,1,1,160,0,0,0,1,0,1
202,-1,0,1,134,0,0,0,1,0,1
203,-1,1,1,106,1,0,0,1,0,1


In [449]:
# Model selection: fit each candidate regressor inside a scaler + model
# pipeline on the training split and collect its metrics on the test split.
#
# Changes from the original cell:
#   * the matplotlib import used to sit inside the loop body and was re-run
#     for every candidate; it is now done once at the top of the cell
#   * the `if(nome == "ExtraTreesRegressor"): 1` branch was a no-op and has
#     been removed
#
# NOTE(review): candidates are compared on the test split itself, so the
# reported scores are optimistic as a selection criterion. Consider
# cross-validation on the training split instead.
import matplotlib.pyplot as plt  # only needed for the optional diagnostic plots below

rand_state = 30

# Candidate estimators. Uncomment an entry to include it in the comparison.
lista_modelos = {
    #"ExtraTreesRegressor" : ExtraTreesRegressor(random_state=rand_state, n_estimators = 100),# criterion = 'absolute_error'),#, max_depth = 20),
    "RandomForestRegressor" : RandomForestRegressor(random_state=rand_state),
    #"DecisionTreeRegressor" : DecisionTreeRegressor(),
    #"GradientBoostingRegressor" : GradientBoostingRegressor(random_state=rand_state),
    #"XGBRegressor" : XGBRegressor(),
    #"LGBMRegressor" : LGBMRegressor(verbose=0),
    #"LinearRegression" : LinearRegression(),
    #"KNeighborsRegressor" : KNeighborsRegressor(),
    #"SVR" : SVR()
}

# Maps model name -> dict of formatted metrics and the estimator parameters.
avaliacao = {}

for nome, modelo in lista_modelos.items():
    # Build the predictive pipeline: standardise the features, then fit the model.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("modelo", modelo)
    ])

    pipeline.fit(x_train, y_train)

    y_predict = pipeline.predict(x_test)

    avaliacao[nome] = {
        'r2_score' : f"{r2_score(y_test, y_predict)*100:.2f}%",
        'mean_squared_error' : f"{mean_squared_error(y_test, y_predict):.2f}",
        'mean_absolute_error' : f"{mean_absolute_error(y_test, y_predict):.2f}",
        'parameters' : modelo.get_params()
    }

    # Optional diagnostics, disabled in the original notebook:
    #   plt.hist(y_test - y_predict)      # residual distribution
    #   plt.scatter(y_test, y_predict)    # predicted vs. actual
    #   display(pd.DataFrame(data=[modelo.feature_importances_], columns=x.columns).T.sort_values(by=0, ascending=False))

# One row per evaluated model.
pd.DataFrame(avaliacao).T


Unnamed: 0,mean_absolute_error,mean_squared_error,parameters,r2_score
RandomForestRegressor,2300.87,11658817.21,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",81.23%


In [515]:
# Hyper-parameter search for the random forest using 5-fold cross-validation
# on the training split, scored by negative mean squared error.
#
# The original cell also built a `pipeline` that was never passed to
# GridSearchCV. It had no effect on the search and only overwrote the fitted
# pipeline from the previous cell, so it has been removed.
#
# NOTE(review): the only parameter searched is `random_state`, i.e. the seed.
# Picking the seed with the best CV score tunes noise rather than the model.
# The real hyper-parameters explored earlier are listed below for reference.
hyperparam_randfor = {
    'random_state' : list(range(3, 20)),
    # Other grids tried in the original notebook (disabled):
    #   'n_estimators' : [120]
    #   'criterion' : ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
    #   'max_depth' : [150, 40, 50, 80, 100]
}

grid = GridSearchCV(
    RandomForestRegressor(),
    param_grid=hyperparam_randfor,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid.fit(x_train, y_train)

print(grid.best_params_)

{'random_state': 9}


In [520]:
# Final predictive model: fit the scaler + random forest pipeline on the
# training split, report its metrics on the test split, and persist it.
#
# Fixes from the original cell:
#   * the result key used `nome`, a loop variable leaked from the model
#     selection cell; it is now derived from the fitted estimator itself
#   * 'parameters' came from `modelo`, the estimator of the selection cell
#     (random_state=30), not from the model fitted here; it is now read from
#     the pipeline's own "modelo" step
#   * the metrics table was not the last expression of the cell and was
#     never shown; it is now displayed explicitly
#
# NOTE(review): the grid-search cell reported random_state=9 as best, while
# 12 is used here. The original value is kept; confirm which one is intended.
hyperparam_randfor = {
    #'n_estimators' : 120,
    'random_state' : 12,
    #'criterion' : 'friedman_mse',
}
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("modelo", RandomForestRegressor(**hyperparam_randfor))
])

pipeline.fit(x_train, y_train)

y_predict = pipeline.predict(x_test)

# The estimator that was actually fitted inside the pipeline.
modelo_final = pipeline.named_steps["modelo"]

# Start from a fresh dict so that only the final model is reported.
avaliacao = {
    type(modelo_final).__name__ : {
        'r2_score' : f"{r2_score(y_test, y_predict)*100:.2f}%",
        'mean_squared_error' : f"{mean_squared_error(y_test, y_predict):.2f}",
        'mean_absolute_error' : f"{mean_absolute_error(y_test, y_predict):.2f}",
        'parameters' : modelo_final.get_params()
    }
}

display(pd.DataFrame(avaliacao).T)

# Persist the fitted pipeline (scaler + model) for later use.
joblib.dump(pipeline, "modelo.joblib")


['modelo.joblib']

29