## 0.0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor



pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
df_raw = pd.read_csv('../data/treino.csv')
df_test = pd.read_csv('../data/teste.csv')

### 1.0. Data Description

df1 = df_test.copy()

# rename columns

df1.columns = ['id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
       'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
       'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
       'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
       'aceita_troca', 'dono_unico',
       'todas_revisoes_concessionaria',
       'ipva_pago', 'licenciado',
       'garantia_de_fabrica',
       'todas_revisoes_agenda',
       'alienado']

#fill na

df1['num_fotos'] = df1['num_fotos'].fillna(0)

df1 = df1.drop('alienado', axis=1)

na_cols = df1.columns[-8:]

for col in na_cols:
    df1[col] = np.where(df1[col].isna(), 0, 1)

df1[df1.T.tail(8).index] = df1[df1.T.tail(8).index].astype('int64')

df1['ano_modelo'] = df1['ano_modelo'].astype('int64')
df1['num_fotos'] = df1['num_fotos'].astype('int64')

## 2.0. Data Filtering

import sweetviz as sv
# my_report = sv.analyze(df1, target_feat='preco')
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

df1 =df1.drop(['elegivel_revisao'], axis=1)

### 3.1. Rescaling

min_max_cols = [
 'ano_de_fabricacao',   
 'ano_modelo',          
 'odometro',            
 'num_portas']


for column in min_max_cols:
    mms = pickle.load( open(f'../parameters/{column}_scaler.pkl', 'rb'))
    df1[column] = mms.fit_transform(df1[[column]].values)
    

# df1['preco'] = np.log1p(df1['preco'])

### 3.2. Encoding

df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

#get uf
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

map_cor = {'Preto':'preto', 'Branco':'branco', 'Prata':'prata', 'Cinza':'cinza', 'Dourado':'outros', 'Vermelho':'outros', 'Azul':'outros',
       'Verde':'outros'}

map_regiao={'SP':'sudeste','RS':'sul','MG':'sudeste','PR':'sul','RJ':'sudeste','MA':'nordeste','SC':'sul','AL':'nordeste','BA':'nordeste','GO':'centro_oeste','RN':'nordeste','PE':'nordeste','MT':'centro_oeste','PA':'norte','CE':'nordeste','AM':'nordeste','ES':'sudeste','RO':'norte','PB':'nordeste','TO':'norte','AC':'norte','SE':'nordeste','MS':'centro_oeste','RR':'norte','PI':'nordeste'}


map_cambio = {'Automática': 2, 'Manual' :0, 'CVT' :2, 'Automatizada': 2, 'Semi-automática': 1,
       'Automatizada DCT' : 2, 'Automática Sequencial' : 2}


# target encoders
for column in ['marca', 'modelo', 'versao', 'tipo']:
    target = pickle.load( open(f'../parameters/{column}_encode.pkl', 'rb'))
    df1[column] = df1[column].map(target)


# binary
df1['blindado'] = np.where(df1['blindado']=='N', 0, 1)

#map
df1['cor'] = df1['cor'].map(map_cor)
df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# one hot encoding
df1 = pd.get_dummies(df1, columns=['cor', 'estado_vendedor',  'tipo_vendedor'])


for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

## 4.0. Feature Selection

df2 = df1.drop('id', axis=1).copy()



cols_selected = ['versao', 'modelo', 'cambio', 'odometro', 'tipo', 'ano_modelo',
       'marca', 'ano_de_fabricacao']

X = df2[cols_selected]
## Model Train


#model definition


model = pd.read_pickle('../parameters/model.pkl')

#model predict
y_hat = model.predict(X.fillna(np.log1p(df_raw['preco'].mean())))

# y_test_ = np.expm1(y_test)
y_hat_ = np.expm1(y_hat)



# y_test_ = np.expm1(y_test)

df_final = pd.DataFrame()
df_final['ID'] = df_test['ID']
df_final['preco'] = y_hat_

df_final.to_csv('../data/sample.csv', index=False)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
df_final

Unnamed: 0,ID,preco
0,24813264385557040124808779273028388499,59435.929688
1,295636316453795508942188530111300065666,115231.046875
2,101258309166227950735244624080888109884,94517.296875
3,28348734455782469411126661985772047409,75206.867188
4,193163160502972147671913739170248305797,111150.187500
...,...,...
39441,238233399351588823822117090805568390727,37019.074219
39442,64621912306231118962468441892654163025,152483.796875
39443,100311033226508317456901122129284293382,143945.406250
39444,217317181330151694133399005110777689124,313467.937500


In [5]:
for num in np.arange(100,200, 20)

    model = RandomForestRegressor(n_estimators=num)
    

Unnamed: 0,ID,preco
0,24813264385557040124808779273028388499,64941.112590
1,295636316453795508942188530111300065666,108953.078355
2,101258309166227950735244624080888109884,93739.157522
3,28348734455782469411126661985772047409,73153.698282
4,193163160502972147671913739170248305797,112610.125482
...,...,...
39441,238233399351588823822117090805568390727,36553.449135
39442,64621912306231118962468441892654163025,151609.121217
39443,100311033226508317456901122129284293382,145802.311748
39444,217317181330151694133399005110777689124,365881.188896


In [76]:
not_in_df = []
for i in ['versao', 'modelo', 'marca']:
    
    for feature in df_test[i].unique():
        if feature not in df_raw[i].unique():
            not_in_df.append(feature)
            
        

In [77]:
not_in_df

['1.8 MPFI GRAPHITE 8V FLEX 4P AUTOMÁTICO',
 '1.4 TFSI FLEX SEDAN PRESTIGE PLUS TECH TIPTRONIC',
 '2.0 16V HÍBRIDO X LINE XDRIVE30E STEPTRONIC',
 '2.5 SPORT 4X2 CE 8V TURBO DIESEL 2P MANUAL',
 '2.0 SE 16V FLEX 4P AUTOMÁTICO',
 '2.0 TFSI AMBIENTE LIMO 180CV GASOLINA 4P MULTITRONIC',
 '2.8 16V TURBO DIESEL LT CD 4X4 AUTOMÁTICO',
 '1.0 MPI COMFORTLINE 12V FLEX 4P MANUAL',
 '1.8 CGI TOURING AVANTGARDE 16V GASOLINA 4P AUTOMÁTICO',
 '1.4 16V TSI TRENDLINE GASOLINA 4P MANUAL',
 '2.0 16V GASOLINA X LINE XDRIVE30I STEPTRONIC',
 '2.5 DLX 4X4 CD 8V TURBO DIESEL 4P MANUAL',
 '3.5 HPE 4X4 V6 24V GASOLINA 4P AUTOMÁTICO',
 '1.0 MPFI LIFE 8V FLEX 2P MANUAL',
 '1.3 S 4X4 16V GASOLINA 2P MANUAL',
 '1.8 CGI SPORT TURBO 16V GASOLINA 2P AUTOMÁTICO',
 '3.0 V6 SUPERCHARGED S AWD 4P AUTOMÁTICO',
 '1.6 OCEAN 16V FLEX 4P MANUAL',
 '2.0 S 4X4 16V GASOLINA 4P AUTOMÁTICO',
 '2.5 TFSI GASOLINA SPORTBACK QUATTRO S-TRONIC',
 '1.6 16V SCE FLEX GT LINE MANUAL',
 '3.0 TFSI GASOLINA PERFORMANCE QUATTRO TIPTRONIC',
 '1.6 