1. Treinar um modelo sem as features de modelo, versão e marca
2. Usar esse modelo treinado para prever o preço no df de teste
3. Fazer novo target encoding para features que não aparecem no treino

4. Treinar modelo com encodings atualizados

## Imports

In [179]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Show up to 100 rows / columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

# Raw inputs: labelled training set, the 'not_in_train' spreadsheet and the
# test set to be priced.
df_raw = pd.read_csv('../data/treino.csv')
not_in = pd.read_excel('../data/not_in_train.xlsx')
df_test = pd.read_csv('../data/teste.csv')


In [180]:
# K-fold cross validation (plain shuffled KFold, not stratified)
def cross_validation(model_name, model, x_train, y_train):
    """Estimate the out-of-fold MAE of ``model`` with 5-fold cross validation.

    The target is expected on the ``log1p`` scale: predictions and held-out
    targets are mapped back with ``np.expm1`` before the error is computed,
    so the returned MAE is expressed in the original target units.

    Parameters
    ----------
    model_name : str
        Label for the model. Kept for backward compatibility; it is not used.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on every fold, so on return it holds the fit of the last fold.
    x_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Target aligned row by row with ``x_train``.

    Returns
    -------
    float
        Mean of the per-fold MAE values (each rounded to 2 decimals),
        itself rounded to 2 decimals.
    """
    mae_list = []

    # Fixed seed: every call splits the rows into the same five folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=5)

    for train_index, test_index in kf.split(x_train, y_train):
        x_train_cv = x_train.iloc[train_index]
        y_train_cv = y_train.iloc[train_index]

        x_test_cv = x_train.iloc[test_index]
        y_test_cv = y_train.iloc[test_index]

        # model training
        model.fit(x_train_cv, y_train_cv)

        # prediction, mapped back from the log1p scale
        y_hat = np.expm1(model.predict(x_test_cv))
        # separate name so the log-scale fold target is not overwritten
        y_true = np.expm1(y_test_cv)

        # Only the MAE is reported by this function, so it is the only
        # metric computed per fold.
        mae_list.append(np.round(mean_absolute_error(y_true, y_hat), 2))

    return np.round(np.mean(mae_list), 2)

## Train dataset without marca, modelo and versao

In [202]:
df1 = df_raw.copy()

# Rename columns. The assignment is positional, so the list must follow the
# column order of the training CSV.
df1.columns = ['id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
       'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
       'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
       'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
       'aceita_troca', 'dono_unico',
       'todas_revisoes_concessionaria',
       'ipva_pago', 'licenciado',
       'garantia_de_fabrica',
       'todas_revisoes_agenda',
       'alienado', 'preco']

# Missing photo counts are treated as zero photos.
df1['num_fotos'] = df1['num_fotos'].fillna(0)

df1 = df1.drop('alienado', axis=1)

# The 7 columns right before 'preco' are turned into presence flags:
# 1 when the cell holds any value, 0 when it is missing.
na_cols = df1.columns[-8:-1]

for col in na_cols:
    df1[col] = np.where(df1[col].isna(), 0, 1)

# Cast only the flag columns. The previous selection, `df1.T.tail(8).index`,
# also contained 'preco' (truncating the price to an integer) and transposed
# the whole frame just to obtain column names.
df1[na_cols] = df1[na_cols].astype('int64')

df1['ano_modelo'] = df1['ano_modelo'].astype('int64')
df1['num_fotos'] = df1['num_fotos'].astype('int64')

df1 = df1.drop(['elegivel_revisao', 'versao'], axis=1)

min_max_cols = ['num_fotos',
 'ano_de_fabricacao',
 'ano_modelo',
 'odometro',
 'num_portas']

# Fit one scaler per numeric column and persist it so the same scaling can
# be re-applied to the test set.
for column in min_max_cols:
    mms = pp.MinMaxScaler()
    df1[column] = mms.fit_transform(df1[[column]].values).ravel()
    # Context manager so the file is flushed and closed right away.
    with open(f'../parameters/{column}_scaler.pkl', 'wb') as scaler_file:
        pickle.dump(mms, scaler_file)

# The price is modelled on the log1p scale (np.expm1 maps it back).
df1['preco'] = np.log1p(df1['preco'])

df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

# Get the UF: the two characters before the last one of the state string.
# NOTE(review): assumes values end like "... (SP)" - confirm against the data.
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

# Colours: the four listed individually keep their own class, the rest
# collapse into 'outros'.
map_cor = {'Preto': 'preto', 'Branco': 'branco', 'Prata': 'prata', 'Cinza': 'cinza',
           'Dourado': 'outros', 'Vermelho': 'outros', 'Azul': 'outros',
           'Verde': 'outros'}

# UF -> region.
# NOTE(review): 'AM' (Amazonas) is mapped to 'nordeste' although it belongs
# to the North region, and 'AP' / 'DF' have no entry (they would map to NaN).
# Left unchanged here because the test-set cell carries its own copy of this
# mapping; both copies must be corrected together.
map_regiao = {'SP': 'sudeste', 'RS': 'sul', 'MG': 'sudeste', 'PR': 'sul', 'RJ': 'sudeste',
              'MA': 'nordeste', 'SC': 'sul', 'AL': 'nordeste', 'BA': 'nordeste',
              'GO': 'centro_oeste', 'RN': 'nordeste', 'PE': 'nordeste',
              'MT': 'centro_oeste', 'PA': 'norte', 'CE': 'nordeste', 'AM': 'nordeste',
              'ES': 'sudeste', 'RO': 'norte', 'PB': 'nordeste', 'TO': 'norte',
              'AC': 'norte', 'SE': 'nordeste', 'MS': 'centro_oeste', 'RR': 'norte',
              'PI': 'nordeste'}

# Gearbox as an ordinal code: 0 manual, 1 semi-automatic, 2 automatic variants.
map_cambio = {'Automática': 2, 'Manual': 0, 'CVT': 2, 'Automatizada': 2, 'Semi-automática': 1,
              'Automatizada DCT': 2, 'Automática Sequencial': 2}

In [203]:
# Target encoders: each category is replaced by the mean log-price of its
# group, and the mapping is persisted so it can be applied to the test set.
# NOTE(review): the means are computed on the full training frame, before the
# train/hold-out split below, so hold-out rows contribute to their own
# encoding (optimistic hold-out metrics).
for column in ['tipo', 'marca', 'modelo']:
    target = df1.groupby(column)['preco'].mean()
    df1[column] = df1[column].map(target)
    # Context manager so the file is flushed and closed right away.
    with open(f'../parameters/{column}_encode.pkl', 'wb') as encode_file:
        pickle.dump(target, encode_file)

# binary: anything other than 'N' counts as armoured
df1['blindado'] = np.where(df1['blindado'] == 'N', 0, 1)

# map
df1['cor'] = df1['cor'].map(map_cor)
df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# One hot encoding. The dtype is forced so the dummies are integers on every
# pandas version (the default is bool on pandas >= 2.0, uint8 before).
df1 = pd.get_dummies(df1, columns=['cor', 'estado_vendedor', 'tipo_vendedor'],
                     dtype='int64')

# Normalise any remaining narrow integer columns to int64.
for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

df2 = df1.drop('id', axis=1).copy()

y = df2['preco']
X = df2.drop('preco', axis=1)

# Fixed seed so the hold-out split, and every metric computed on it, is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=5
)

In [183]:
# # #model definition
# forest = ExtraTreesRegressor(n_estimators=250, random_state=3, n_jobs=-1)


# # #data preparation
# x_train_n = X_train 
# y_train_n = y_train.values
# forest.fit(X_train, y_train_n)



# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# indices = np.argsort(importances)[::-1]

# #print the feature ranking
# print('Feature ranking:')
# df = pd.DataFrame()
# for i, j in zip(x_train_n, forest.feature_importances_):
#     aux = pd.DataFrame({'feature':i, 'importance': j},index=[0])
#     df = pd.concat([df,aux],axis=0)
    
# print(df.sort_values('importance', ascending=False))

# #Plot the impurity-based feature importances of the forest
# plt.figure()
# plt.title('Feature Importances')
# plt.bar(range(x_train_n.shape[1]),importances[indices], color='r', yerr=std[indices], align='center')
# plt.xticks(range(x_train_n.shape[1]),indices)
# plt.xlim([-1, x_train_n.shape[1]])
# plt.show();

In [184]:
# cols_selected = df.sort_values('importance', ascending=False).head(10)['feature'].values


In [205]:
# Feature subset used to cross-validate, fit and apply the final model.
cols_selected = [
    'odometro',
    'tipo',
    'cambio',
    'ano_modelo',
    'ano_de_fabricacao',
    'marca',
    'modelo',
]

In [206]:
# Hold-out target mapped back from the log1p scale; read by simple_model_test.
y_test_ = np.expm1(y_test)


def simple_model_test(model, nome):
    """Fit ``model`` on the training split and print its hold-out errors.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test_``. Predictions are mapped back with ``np.expm1`` before the
    metrics are computed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    nome : str
        Label printed in front of the metrics.

    Returns
    -------
    float
        Hold-out MAE rounded to 2 decimals.
    """
    model.fit(X_train, y_train)
    price_hat = np.expm1(model.predict(X_test))

    raw_errors = [
        mean_absolute_error(y_test_, price_hat),
        mean_absolute_percentage_error(y_test_, price_hat),
        np.sqrt(mean_squared_error(y_test_, price_hat)),
    ]
    mae, mape, rmse = (np.round(err, 2) for err in raw_errors)

    print('{}\n MAE: {}\n MAPE: {}\n RMSE: {}'.format(nome, mae, mape, rmse))
    return mae

In [207]:
# simple_model_test(LinearRegression(), 'Linear Regressor')
# simple_model_test(RandomForestRegressor(), 'RandomForestRegressor')
# simple_model_test(LGBMRegressor(), 'LGBMRegressor')
# simple_model_test(Lasso(), 'Lasso')
# simple_model_test(XGBRegressor(), 'XGBRegressor')

In [208]:
# Running best of the hyper-parameter search: lowest MAE seen so far and the
# parameters that produced it.
min_mae, best_param = 999999, {}

In [209]:
# for i in range(300):
#     learning_rate = random.choice(np.arange(0.065, 0.85, 0.002))
#     num_leaves = random.choice(np.arange(55,80,2))
#     max_depth = random.choice(np.arange(30,70,5))
#     min_child_weight = random.choice(np.arange(0.001,0.003,0.0005))
#     n_estimators = 100
#     boosting_type = 'gbdt'

#     model = LGBMRegressor(learning_rate = learning_rate,
#                           num_leaves = num_leaves,
#                           max_depth =max_depth ,
#                          n_estimators = n_estimators,
#                           min_child_weight=min_child_weight,
#                           boosting_type= boosting_type)
    
#     mae =  cross_validation('lgbm',model, X_train[cols_selected], y_train)
#     if i%10==0:
#         print(i, 100)
#         print()
        
#     if mae<min_mae:
#         min_mae = mae
#         best_param['learning_rate']= learning_rate
#         best_param['num_leaves']= num_leaves
#         best_param['max_depth']= max_depth
#         best_param['n_estimators']= n_estimators
#         best_param['min_child_weight'] = min_child_weight
#         best_param['boosting_type'] = boosting_type
        
#         print(f'get better: {mae}')
# #         print(learning_rate, num_leaves, max_depth, n_estimators)

In [210]:
# cross_validation('F',RandomForestRegressor(n_estimators=1500), X_train[cols_selected], y_train)

In [211]:
# Final LightGBM configuration.
lgbm_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.07100000000000001,
    'num_leaves': 57,
    'max_depth': 45,
    'n_estimators': 100,
    'min_child_weight': 0.002,
}
model = LGBMRegressor(**lgbm_params)

# Cross-validated MAE on the selected features.
X_train_selected = X_train[cols_selected]
print(cross_validation('lgbm', model, X_train_selected, y_train))

# Refit on the whole training split; this is the model used for prediction.
model.fit(X_train_selected, y_train)

#test


28236.23


LGBMRegressor(learning_rate=0.07100000000000001, max_depth=45,
              min_child_weight=0.002, num_leaves=57)

In [212]:
# Start the test-set preprocessing from an untouched copy of the raw test
# frame (the name df1 is reused, replacing the processed training frame).
df1 = df_test.copy()

In [213]:
# Notebook display: (rows, columns) of the raw test frame.
df1.shape

(39446, 28)

In [214]:
# Positional rename of the raw test columns (same names as the training
# frame, minus the target).
test_column_names = [
    'id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
    'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
    'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
    'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
    'aceita_troca', 'dono_unico',
    'todas_revisoes_concessionaria',
    'ipva_pago', 'licenciado',
    'garantia_de_fabrica',
    'todas_revisoes_agenda',
    'alienado',
]
df1.columns = test_column_names

df1 = df1.drop(columns='alienado')

# The last 8 columns become presence flags: 1 when a value is present,
# 0 when it is missing, stored as int64.
na_cols = df1.columns[-8:]
for flag_col in na_cols:
    df1[flag_col] = df1[flag_col].notna().astype('int64')

# Missing photo counts are treated as zero; both columns end up as int64.
df1['num_fotos'] = df1['num_fotos'].fillna(0).astype('int64')
df1['ano_modelo'] = df1['ano_modelo'].astype('int64')

df1 = df1.drop(columns=['elegivel_revisao', 'versao'])

# Numeric columns that go through the persisted min-max scalers.
min_max_cols = [
    'num_fotos',
    'ano_de_fabricacao',
    'ano_modelo',
    'odometro',
    'num_portas',
]

In [215]:
# Scale the numeric columns with the scalers fitted on the training set.
for column in min_max_cols:
    with open(f'../parameters/{column}_scaler.pkl', 'rb') as scaler_file:
        mms = pickle.load(scaler_file)
    # transform, not fit_transform: re-fitting on the test data would discard
    # the training min/max and scale the two sets differently.
    df1[column] = mms.transform(df1[[column]].values).ravel()

df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

# Get the UF: the two characters before the last one of the state string.
# NOTE(review): assumes values end like "... (SP)" - confirm against the data.
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

map_cor = {'Preto': 'preto', 'Branco': 'branco', 'Prata': 'prata', 'Cinza': 'cinza',
           'Dourado': 'outros', 'Vermelho': 'outros', 'Azul': 'outros',
           'Verde': 'outros'}

# UF -> region.
# NOTE(review): 'AM' (Amazonas) is mapped to 'nordeste' although it belongs
# to the North region. Left unchanged here because the training cell carries
# its own copy of this mapping; both copies must be corrected together.
map_regiao = {'SP': 'sudeste', 'RS': 'sul', 'MG': 'sudeste', 'PR': 'sul', 'RJ': 'sudeste',
              'MA': 'nordeste', 'SC': 'sul', 'AL': 'nordeste', 'BA': 'nordeste',
              'GO': 'centro_oeste', 'RN': 'nordeste', 'PE': 'nordeste',
              'MT': 'centro_oeste', 'PA': 'norte', 'CE': 'nordeste', 'AM': 'nordeste',
              'ES': 'sudeste', 'RO': 'norte', 'PB': 'nordeste', 'TO': 'norte',
              'AC': 'norte', 'SE': 'nordeste', 'MS': 'centro_oeste', 'RR': 'norte',
              'PI': 'nordeste'}

map_cambio = {'Automática': 2, 'Manual': 0, 'CVT': 2, 'Automatizada': 2, 'Semi-automática': 1,
              'Automatizada DCT': 2, 'Automática Sequencial': 2}

# Target encoders: apply the mappings persisted from the training set.
# Categories that are absent from a mapping become NaN.
for column in ['tipo', 'marca', 'modelo']:
    with open(f'../parameters/{column}_encode.pkl', 'rb') as encode_file:
        target = pickle.load(encode_file)
    df1[column] = df1[column].map(target)

# binary: anything other than 'N' counts as armoured
df1['blindado'] = np.where(df1['blindado'] == 'N', 0, 1)

# map
df1['cor'] = df1['cor'].map(map_cor)
df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# One hot encoding. The dtype is forced so the dummies are integers on every
# pandas version (the default is bool on pandas >= 2.0, uint8 before).
df1 = pd.get_dummies(df1, columns=['cor', 'estado_vendedor', 'tipo_vendedor'],
                     dtype='int64')

# Normalise any remaining narrow integer columns to int64.
for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

In [217]:
# Build the prediction matrix from the processed *test* frame (df1).
# `df2` still holds the processed training data, target included, so the
# previous `X = df2` made the model score training rows instead of the test
# set. `df2` itself is left untouched.
X = df1.drop('id', axis=1).copy()

In [218]:
# The model predicts on the log1p scale; map straight back with expm1.
y_hat = np.expm1(model.predict(X[cols_selected]))

# Store the predictions on the raw test frame and persist it.
df_test['preco'] = y_hat
df_test.to_pickle('../data/precos_modelo_marca.pkl')

In [147]:
# df_test= df_test.drop([ 'marca_label', 'modelo_label', 'versao_label'], axis=1)

In [219]:
# Stack the raw training frame and the test frame (which by now carries the
# predicted 'preco' column) into a single frame; row labels are kept as-is.
df_total = pd.concat([df_raw, df_test])


In [220]:

# Extend the persisted target encoders with the categories that only occur
# in the test set, using the prices predicted for them.
# NOTE(review): no visible cell writes '../parameters/versao_encode.pkl'
# (only the tipo / marca / modelo encoders are saved) - confirm it is
# produced elsewhere, otherwise the 'versao' iteration fails on read.
for i in ['marca', 'modelo', 'versao']:
    unseen = df_test[~df_test[i].isin(df_raw[i].unique())]

    # Average on the log1p scale, exactly as the training encoders do
    # (mean of log1p(preco)); log1p of the mean price would be
    # systematically higher than values built that way.
    new_value = np.log1p(unseen['preco']).groupby(unseen[i]).mean()

    encode_path = f'../parameters/{i}_encode.pkl'
    actual_encode = pd.read_pickle(encode_path)

    # Drop entries appended by a previous run so that re-executing the cell
    # keeps the index unique (Series.map needs a unique index).
    actual_encode = actual_encode[~actual_encode.index.isin(new_value.index)]
    actual_encode = pd.concat([actual_encode, new_value])

    # Context manager so the file is flushed and closed right away.
    with open(encode_path, 'wb') as encode_file:
        pickle.dump(actual_encode, encode_file)

In [221]:
# Reload the 'marca' encoder from disk and display it to inspect the result
# of the update above.
target = pd.read_pickle('../parameters/marca_encode.pkl')
target

marca
ALFA ROMEO       11.155027
AUDI             11.891094
BMW              12.097786
BRM              10.774007
CHERY            11.655779
CHEVROLET        11.325122
CHRYSLER         11.025705
CITROËN          11.027485
DODGE            11.178814
EFFA             10.378516
FERRARI          11.990643
FIAT             11.386366
FORD             11.535763
HONDA            11.424791
HYUNDAI          11.271073
IVECO            11.926209
JAC              11.502916
JAGUAR           12.237732
JEEP             11.775875
KIA              11.635663
LAMBORGHINI      14.117162
LAND ROVER       12.193824
LEXUS            12.057564
LIFAN            11.246682
MASERATI         11.996006
MERCEDES-BENZ    11.948558
MINI             11.800861
MITSUBISHI       11.704017
NISSAN           11.485058
PEUGEOT          11.658108
PORSCHE          12.551343
RAM              12.646215
RENAULT          11.215537
SMART            11.313195
SSANGYONG        11.226159
SUBARU           11.600308
SUZUKI           11.35