# Preços de carro no Brasil: modelo de previsão

Treinando modelo em dataset da tabela FIPE

In [1]:
# importando bibliotecas

import numpy as np           
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# bibliotecas para treinar modelo

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.linear_model import LinearRegression, Ridge
from xgboost.sklearn import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [3]:
# carregando o dataset da FIPE
df = pd.read_csv ('data/fipe_cars.csv')
df.head()

Unnamed: 0,year_of_reference,month_of_reference,fipe_code,authentication,brand,model,fuel,gear,engine_size,year_model,avg_price_brl
0,2021,January,038001-6,tlp4qry07m,Acura,NSX 3.0,Gasoline,manual,3.0,1995,40374.0
1,2021,January,038001-6,s1wksdv9by,Acura,NSX 3.0,Gasoline,manual,3.0,1994,38939.0
2,2021,January,038001-6,skrbcfnkch,Acura,NSX 3.0,Gasoline,manual,3.0,1993,37648.0
3,2021,January,038001-6,rxzh76d5db,Acura,NSX 3.0,Gasoline,manual,3.0,1992,35962.0
4,2021,January,038001-6,qrm322tpd8,Acura,NSX 3.0,Gasoline,manual,3.0,1991,32863.0


In [4]:
# checando consistência dos dados

print(f'Linhas: {df.shape[0]}')
print(f'Colunas (features): {df.shape[1]}')
print(f'Linhas duplicadas: {df.duplicated().sum()}')
print(f'Dados faltantes: {sum(df.isna().sum())}')

Linhas: 599007
Colunas (features): 11
Linhas duplicadas: 3
Dados faltantes: 0


In [5]:
# retirando valores duplicados

df.drop_duplicates(keep='first',inplace=True)
print(f'Linhas: {df.shape[0]}')
print(f'Linhas duplicadas: {df.duplicated().sum()}')

Linhas: 599004
Linhas duplicadas: 0


In [6]:
# adicionando coluna reference_date juntando year_of_reference e month_of_reference
df['reference_date'] = pd.to_datetime(df['month_of_reference'] + ' ' + df['year_of_reference'].astype(str), format='%B %Y')

# adicionando coluna age_model
df['age_model'] = df['year_of_reference'] - df['year_model']

# descartando fipe_code, authentication, year_of_reference, month_of_reference
df.drop(['fipe_code', 'authentication', 'year_of_reference', 'month_of_reference'], axis=1, inplace=True)
df.head()

Unnamed: 0,brand,model,fuel,gear,engine_size,year_model,avg_price_brl,reference_date,age_model
0,Acura,NSX 3.0,Gasoline,manual,3.0,1995,40374.0,2021-01-01,26
1,Acura,NSX 3.0,Gasoline,manual,3.0,1994,38939.0,2021-01-01,27
2,Acura,NSX 3.0,Gasoline,manual,3.0,1993,37648.0,2021-01-01,28
3,Acura,NSX 3.0,Gasoline,manual,3.0,1992,35962.0,2021-01-01,29
4,Acura,NSX 3.0,Gasoline,manual,3.0,1991,32863.0,2021-01-01,30


In [7]:
# estatísticas sobre cada feature
df.describe()

Unnamed: 0,engine_size,year_model,avg_price_brl,reference_date,age_model
count,599004.0,599004.0,599004.0,599004,599004.0
mean,2.266742,2008.751761,111580.9,2022-01-03 04:39:40.960394240,12.814644
min,0.7,1985.0,1679.0,2021-01-01 00:00:00,-1.0
25%,1.6,2001.0,20150.0,2021-07-01 00:00:00,5.0
50%,2.0,2010.0,42988.5,2022-01-01 00:00:00,11.0
75%,2.8,2016.0,96921.25,2022-07-01 00:00:00,20.0
max,6.7,2023.0,8600000.0,2023-01-01 00:00:00,38.0
std,1.031977,9.188348,291224.4,,9.19584


### Corrigindo a inflação

Com a ajuda de um índice, as séries podem ser corrigidas para retirar o efeito da inflação. Os dados então são tratados para utilizar como base o valor do real em Janeiro de 2023.

Também será importada aqui a série com as conversões dólar-real, que serão utilizadas futuramente para conversão.

In [8]:
inflacao = pd.read_csv('data/ipca_indice.csv', delimiter=';')
inflacao.head()

Unnamed: 0,Data,IPCA - geral - índice (dez 1993 = 100)
0,2021/01,5574.49
1,2021/02,5622.43
2,2021/03,5674.72
3,2021/04,5692.31
4,2021/05,5739.56


In [9]:
dolar = pd.read_csv('data/preco_dolar.csv', delimiter=';')
dolar.columns = ['reference_date', 'dollar_price']
dolar['reference_date'] = pd.to_datetime(dolar['reference_date'], format='%d.%m.%Y')
dolar.head()

Unnamed: 0,reference_date,dollar_price
0,2023-01-01,5.0731
1,2022-12-01,5.286
2,2022-11-01,5.1851
3,2022-10-01,5.1791
4,2022-09-01,5.4154


In [10]:
# adicionando coluna reference_date
inflacao['reference_date'] = pd.to_datetime(inflacao['Data'], format='%Y/%m')

# ajustando índice
idx_2023_01 = inflacao['IPCA - geral - índice (dez 1993 = 100)'][24]
inflacao['ipca_index'] = inflacao['IPCA - geral - índice (dez 1993 = 100)']/idx_2023_01

inflacao.drop(['Data', 'IPCA - geral - índice (dez 1993 = 100)'], axis=1, inplace=True)
inflacao.head()

Unnamed: 0,reference_date,ipca_index
0,2021-01-01,0.856507
1,2021-02-01,0.863873
2,2021-03-01,0.871907
3,2021-04-01,0.87461
4,2021-05-01,0.88187


In [11]:
# juntando índice da inflação com dataframe original
df = pd.merge(df, inflacao, how='left', on='reference_date')
df.head()

Unnamed: 0,brand,model,fuel,gear,engine_size,year_model,avg_price_brl,reference_date,age_model,ipca_index
0,Acura,NSX 3.0,Gasoline,manual,3.0,1995,40374.0,2021-01-01,26,0.856507
1,Acura,NSX 3.0,Gasoline,manual,3.0,1994,38939.0,2021-01-01,27,0.856507
2,Acura,NSX 3.0,Gasoline,manual,3.0,1993,37648.0,2021-01-01,28,0.856507
3,Acura,NSX 3.0,Gasoline,manual,3.0,1992,35962.0,2021-01-01,29,0.856507
4,Acura,NSX 3.0,Gasoline,manual,3.0,1991,32863.0,2021-01-01,30,0.856507


In [12]:
# retirando outliers de preço
lower_bound = df['avg_price_brl'].quantile(0.10)
upper_bound = df['avg_price_brl'].quantile(0.90)
df = df[(df['avg_price_brl'] >= lower_bound) & (df['avg_price_brl'] <= upper_bound)]

In [13]:
df['avg_price_brl'] = df['avg_price_brl']/df['ipca_index']
# drop irrelevant features for the analysis
# we can't use the model, there are simply too many (extremely specific)
df.drop(['ipca_index', 'model'], axis=1, inplace=True)
df.head()

Unnamed: 0,brand,fuel,gear,engine_size,year_model,avg_price_brl,reference_date,age_model
0,Acura,Gasoline,manual,3.0,1995,47137.969859,2021-01-01,26
1,Acura,Gasoline,manual,3.0,1994,45462.56027,2021-01-01,27
2,Acura,Gasoline,manual,3.0,1993,43955.275406,2021-01-01,28
3,Acura,Gasoline,manual,3.0,1992,41986.815081,2021-01-01,29
4,Acura,Gasoline,manual,3.0,1991,38368.630888,2021-01-01,30


## Encoding dos dados categóricos

Temos que codificar os dados categóricos em números para estimar modelos.

A data de referência é transformada em um rank (da data mais baixa para a mais alta), as outras 'categorias' são transformadas por codificação one-hot.

In [14]:
# encoding dos dados categóricos
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)

cat_df = df.select_dtypes(exclude=["int", "float"])
rankcat_df = cat_df[["reference_date"]]
onehot_df = cat_df.drop('reference_date', axis=1)

rankcat_df = rankcat_df.apply(le.fit_transform)

onehot_transf = ohe.fit_transform(onehot_df)
onehot_df = pd.DataFrame(onehot_transf, columns=ohe.get_feature_names_out(onehot_df.columns), index=onehot_df.index)

# juntando os dados novamente
cat_df = pd.concat([rankcat_df, onehot_df], axis=1)
num_df = df.select_dtypes(include=['int', 'float'])
df = pd.concat([num_df, cat_df], axis=1)
df.head()

Unnamed: 0,engine_size,year_model,avg_price_brl,age_model,reference_date,brand_AM Gen,brand_Acura,brand_Agrale,brand_Alfa Romeo,brand_Asia Motors,...,brand_VW - VolksWagen,brand_Volvo,brand_Wake,brand_Walk,brand_smart,fuel_Alcohol,fuel_Diesel,fuel_Gasoline,gear_automatic,gear_manual
0,3.0,1995,47137.969859,26,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3.0,1994,45462.56027,27,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,3.0,1993,43955.275406,28,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,3.0,1992,41986.815081,29,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,3.0,1991,38368.630888,30,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


## Separando dados para treinamento

Agora, separamos a variável que queremos prever (preço) das variáveis dependentes e separamos os dados em grupo de treino e teste.

In [33]:
X = df.drop(columns=["avg_price_brl"])
y = df["avg_price_brl"]

X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.25, random_state=0)

# Treinando modelos

Testamos os seguintes modelos de regressão:
- Regressão linear
- Árvore de decisão

In [34]:
# descrevendo o dataset
df.describe()

Unnamed: 0,engine_size,year_model,avg_price_brl,age_model,reference_date,brand_AM Gen,brand_Acura,brand_Agrale,brand_Alfa Romeo,brand_Asia Motors,...,brand_VW - VolksWagen,brand_Volvo,brand_Wake,brand_Walk,brand_smart,fuel_Alcohol,fuel_Diesel,fuel_Gasoline,gear_automatic,gear_manual
count,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,...,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0,479203.0
mean,2.191179,2009.304986,64208.889428,12.264184,12.137389,9.2e-05,0.000766,0.003366,0.00192,0.000939,...,0.102812,0.020436,0.000962,0.00082,0.001461,0.008896,0.17922,0.811883,0.284735,0.715265
std,0.948273,8.143566,52605.962354,8.175783,7.193795,0.009582,0.027664,0.05792,0.043774,0.03063,...,0.303714,0.141486,0.031001,0.028626,0.038192,0.093898,0.383537,0.390806,0.451289,0.451289
min,1.0,1985.0,10429.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.6,2003.0,25405.177934,6.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2.0,2010.0,45752.244926,11.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,2.5,2016.0,85915.19023,18.0,18.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,6.7,2023.0,271968.596446,38.0,24.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# função de fit
model_preds = []

def fit_model(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = round(r2_score(y_test, y_pred),4)
    adj_r2 = round(1 - (1-r2)*(len(y)-1)/(len(y)-X.shape[1]-1),4)
    mae = round(mean_absolute_error(y_test, y_pred),4)
    mad = round(median_absolute_error(y_test, y_pred),4)
    rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
    model_preds.append([model_name, r2, adj_r2, rmse, mae, mad])
    print("R2 = ", r2)
    print("R2 ajustado = ", adj_r2)
    print("RMSE = ", rmse)
    print("MAE = ", mae)
    print("MAD = ", mad)

In [36]:
## Regressão Linear
lr_model = LinearRegression()
fit_model(lr_model, "Regressão Linear")

R2 =  0.7303
R2 ajustado =  0.7302
RMSE =  27293.2664
MAE =  20034.5114
MAD =  15385.5


In [42]:
## Ridge
ridge_model = Ridge()
fit_model(ridge_model, "Ridge")

R2 =  0.7303
R2 ajustado =  0.7302
RMSE =  27293.4814
MAE =  20034.4877
MAD =  15386.9243


In [39]:
## XGBoost
xgboost_model = XGBRegressor()
fit_model(xgboost_model, "XGBoost")

R2 =  0.9286
R2 ajustado =  0.9286
RMSE =  14047.3
MAE =  8565.271
MAD =  4790.3688


In [45]:
## KNN
knn_model = KNeighborsRegressor(n_neighbors=6)
fit_model(knn_model, "KNN")

R2 =  0.8886
R2 ajustado =  0.8886
RMSE =  17543.1248
MAE =  10262.3517
MAD =  5108.6298


In [40]:
## Árvore de decisão
dectree_model = DecisionTreeRegressor()
fit_model(dectree_model, "Árvore de decisão")

R2 =  0.9159
R2 ajustado =  0.9159
RMSE =  15243.6947
MAE =  8325.6196
MAD =  3664.7391


In [23]:
## Random Forest
## (esse demora bastante pra rodar)
randfor_model = RandomForestRegressor()
fit_model(randfor_model, "Random Forest")

R2 =  0.922
R2 ajustado =  0.922
RMSE =  14654.1233
MAE =  8030.1738
MAD =  3558.5522
