## 0.0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR




pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
df_raw = pd.read_csv('../data/treino.csv')
df_test = pd.read_csv('../data/teste.csv')

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


### 1.0. Data Description

In [31]:
df1 = df_raw.copy()

# rename columns

df1.columns = ['id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
       'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
       'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
       'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
       'aceita_troca', 'dono_unico',
       'todas_revisoes_concessionaria',
       'ipva_pago', 'licenciado',
       'garantia_de_fabrica',
       'todas_revisoes_agenda',
       'alienado', 'preco']

#fill na

df1['num_fotos'] = df1['num_fotos'].fillna(0)

df1 = df1.drop('alienado', axis=1)

na_cols = df1.columns[-8:-1]


for col in na_cols:
    df1[col] = np.where(df1[col].isna(), 0, 1)
    

df1[df1.T.tail(8).index] = df1[df1.T.tail(8).index].astype('int64')

df1['ano_modelo'] = df1['ano_modelo'].astype('int64')
df1['num_fotos'] = df1['num_fotos'].astype('int64')

## 2.0. Data Filtering

In [32]:
import sweetviz as sv
# my_report = sv.analyze(df1, target_feat='preco')
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

df1 =df1.drop(['elegivel_revisao'], axis=1)

In [33]:
# plt.figure(figsize=(15,7))
# sns.barplot(data=df1[df1['cor'].isin(['Branco', 'Preto', 'Cinza', 'Prata'])], y='preco', x='estado_vendedor', hue='cor', estimator=np.median);

In [34]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='num_portas', estimator=np.mean);

In [35]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='ipva_pago', estimator=np.mean);

## 3.0. Data Transformation


In [36]:
num_attributes = df1.select_dtypes(exclude='object')



### 3.1. Rescaling

In [37]:
min_max_cols = ['num_fotos',  
 'ano_de_fabricacao',   
 'ano_modelo',          
 'odometro',            
 'num_portas']

mms = pp.MinMaxScaler()

for column in min_max_cols:

    df1[column] = mms.fit_transform(df1[[column]].values)
    pickle.dump(mms, open(f'../parameters/{column}_scaler.pkl', 'wb'))
    




df1['preco'] = np.log1p(df1['preco'])

### 3.2. Encoding

In [38]:
df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

#get uf
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

map_cor = {'Preto':'preto', 'Branco':'branco', 'Prata':'prata', 'Cinza':'cinza', 'Dourado':'outros', 'Vermelho':'outros', 'Azul':'outros',
       'Verde':'outros'}

map_regiao={'SP':'sudeste','RS':'sul','MG':'sudeste','PR':'sul','RJ':'sudeste','MA':'nordeste','SC':'sul','AL':'nordeste','BA':'nordeste','GO':'centro_oeste','RN':'nordeste','PE':'nordeste','MT':'centro_oeste','PA':'norte','CE':'nordeste','AM':'nordeste','ES':'sudeste','RO':'norte','PB':'nordeste','TO':'norte','AC':'norte','SE':'nordeste','MS':'centro_oeste','RR':'norte','PI':'nordeste'}


map_cambio = {'Automática': 2, 'Manual' :0, 'CVT' :2, 'Automatizada': 2, 'Semi-automática': 1,
       'Automatizada DCT' : 2, 'Automática Sequencial' : 1}

# aux = df1[['estado_vendedor', 'preco']].groupby('estado_vendedor').mean().reset_index().sort_values('preco')
# aux['regiao'] = aux['estado_vendedor'].map(map_regiao)



# plt.figure(figsize=(15,7))
# sns.barplot(data=aux, x='estado_vendedor', y='preco', hue='regiao')



# target encoders
for column in ['marca',  'versao',  'estado_vendedor', 'tipo', 'modelo']:
    target = df1.groupby(column)['preco'].mean()
    df1[column] = df1[column].map(target)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))


# frequency encoders
# for column in [ ]:
#     frequency = df1.groupby(column)['preco'].count() / len(df1)
#     df1[column] = df1[column].map(frequency)
# #     pickle.dump(frequency, open(f'../parameters/{column}_encode.pkl', 'wb'))

In [39]:
# binary
df1['blindado'] = np.where(df1['blindado']=='N', 0, 1)

#map
df1['cor'] = df1['cor'].map(map_cor)
# df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# one hot encoding
df1 = pd.get_dummies(df1, columns=['cor',  'tipo_vendedor'])


for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

## 4.0. Feature Selection

In [40]:
df2 = df1.drop('id', axis=1).copy()

y = df2['preco']
X = df2.drop('preco', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# # #training and test dataset for Boruta
# X_train_n = X_train.values
# y_train_n = y_train.values.ravel()

In [41]:
# #define model
# rf = RandomForestRegressor(n_jobs=-1, n_estimators=1000)

# #define boruta
# boruta= BorutaPy(rf, n_estimators='auto', verbose=2, random_state=41).fit(X_train_n, y_train_n)

### Selected cols from boruta

In [42]:
# cols_selected = boruta.support_.tolist()

#  #best features
# X_train_fs = X_train
# cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()

# #not selected boruta
# cols_not_selected_boruta = list(np.setdiff1d(X_train_fs.columns , cols_selected_boruta))

# cols_selected_boruta

### Extra trees feature importancia

In [43]:
# # #model definition
# forest = ExtraTreesRegressor(n_estimators=250, random_state=3, n_jobs=-1)


# # #data preparation
# x_train_n = X_train 
# y_train_n = y_train.values
# forest.fit(X_train, y_train_n)



# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# indices = np.argsort(importances)[::-1]

# #print the feature ranking
# print('Feature ranking:')
# df = pd.DataFrame()
# for i, j in zip(x_train_n, forest.feature_importances_):
#     aux = pd.DataFrame({'feature':i, 'importance': j},index=[0])
#     df = pd.concat([df,aux],axis=0)
    
# print(df.sort_values('importance', ascending=False))

# #Plot the impurity-based feature importances of the forest
# plt.figure()
# plt.title('Feature Importances')
# plt.bar(range(x_train_n.shape[1]),importances[indices], color='r', yerr=std[indices], align='center')
# plt.xticks(range(x_train_n.shape[1]),indices)
# plt.xlim([-1, x_train_n.shape[1]])
# plt.show();

# cols_selected = df.sort_values('importance', ascending=False).head(8)['feature'].values
# # cols_selected = ['versao', 'modelo', 'cambio', 'odometro', 'tipo', 'ano_modelo',
# #        'marca', 'ano_de_fabricacao']

## Model Train

### Linear Regression

In [44]:
y_test_ = np.expm1(y_test)


def simple_model_test(model, nome):
    #model definition

    # model fit
    model.fit(X_train, y_train)

    #model predict
    y_hat = model.predict(X_test)
    y_hat_ = np.expm1(y_hat)
    
    MAPE = np.round(mean_absolute_percentage_error( y_test_, y_hat_ ), 2)
    MAE = np.round(mean_absolute_error(y_test_, y_hat_), 2)
    RMSE = np.round(np.sqrt(mean_squared_error(y_test_, y_hat_)), 2)
    
    
    print('{}\n MAE: {}\n MAPE: {}\n RMSE: {}'.format(nome, MAE, MAPE, RMSE))
    return MAE

In [16]:
# simple_model_test(LinearRegression(), 'Linear Regressor')
# simple_model_test(RandomForestRegressor(), 'RandomForestRegressor')
# simple_model_test(LGBMRegressor(), 'LGBMRegressor')
# simple_model_test(Lasso(), 'Lasso')
# simple_model_test(XGBRegressor(), 'XGBRegressor')
# simple_model_test(SVR(), 'SVR')

## Cross Validation

In [17]:
# stratified cross validation
def cross_validation(model_name, model, x_train, y_train):

    rmse_list = []
    mae_list = []
    mape_list = []

    skf = KFold(n_splits=5, shuffle=True, random_state=5)

    for train_index, test_index in skf.split(x_train, y_train):

        x_train_cv = x_train.iloc[train_index]
        y_train_cv = y_train.iloc[train_index]

        x_test_cv = x_train.iloc[test_index]
        y_test_cv = y_train.iloc[test_index]


        # model training
        model.fit(x_train_cv, y_train_cv)

        # prediction
        y_hat = model.predict(x_test_cv)

        y_hat_ = np.expm1(y_hat)
        y_test_cv = np.expm1(y_test_cv)

        # metrics
        MAPE = np.round(mean_absolute_percentage_error( y_test_cv, y_hat_ ), 2)
        MAE = np.round(mean_absolute_error(y_test_cv, y_hat_), 2)
        RMSE = np.round(np.sqrt(mean_squared_error(y_test_cv, y_hat_)), 2)
#         print(MAPE)
        # append to list
        rmse_list.append(RMSE)
        mae_list.append(MAE) 
        mape_list.append(MAPE)

    avg_mae, stf_mae = np.round( np.mean( mae_list ), 2 ), np.round( np.std( mae_list ), 2 )
    avg_mape, stf_mape = np.round( np.mean( mape_list ), 2 ), np.round( np.std( mape_list ), 2 )
    avg_rmse, stf_rmse = np.round( np.mean( rmse_list ), 2 ), np.round( np.std( rmse_list ), 2 )

    return avg_mae

In [18]:
def send_model(model):
     # model fit
    model.fit(X_train[cols_selected], y_train)
    pickle.dump(model, open('../parameters/model.pkl', 'wb'))
    print('Model submited')
    return None

## Hyperparameter Tuning

### RF

In [551]:
RandomForestRegressor(criterion='gini')

RandomForestRegressor(criterion='gini')

In [565]:
rf_params = {}





In [573]:
min_rf = 999999

In [26]:
import random
for i in range(300):
#     criterion = random.choice(['gini', 'entropy', 'log_loss'])
    min_samples_split = random.choice([2,3,4])
    max_depth = random.choice(np.arange(60,90,10))
    min_samples_leaf = random.choice(np.arange(2,5,1))
    n_estimators = random.choice(np.arange(500,1000,100))
#     n_estimators = random.choice(np.arange(100,200,10))

#     print(criterion)

    # model
    model_rf = RandomForestRegressor( 
#     criterion =criterion,
    min_samples_split = min_samples_split,
    max_depth = max_depth,
    min_samples_leaf =min_samples_leaf,
    n_estimators = n_estimators)
    
    
    rf_params['min_samples_split'] = min_samples_split
    rf_params['max_depth'] = max_depth
    rf_params['min_samples_leaf'] = min_samples_leaf
    rf_params['n_estimators'] = n_estimators
    #Performance
#     result = cross_validation(x_training, 5, 'XGBoost Regressor', model_xgb, verbose=True)
    print(rf_params)
    MAE = cross_validation('xgboost',model_rf, X_train[cols_selected], y_train)
    print(MAE)
    
    if MAE<min_rf:
        print('got better')
        best_model = model_rf
        min_rf = MAE
        best_param = model_rf.get_params()



NameError: name 'rf_params' is not defined

NameError: name 'best_model' is not defined

In [19]:
#  param = {'n_estimators': [1100, 1000, 1500, 1300, 1200, 1400],
# 'eta': [0.01, 0.03, 0.02],
# 'max_depth': [ 5, 9, 7, 8],
# 'subsample': [0.3, 0.5, 0.7],
# 'colsample_bytree': [0.3, 0.7, 0.9],
# 'min_child_weight': [3, 8, 15]
# }

# MAX_EVAL = 10

# import random

# for i in range( MAX_EVAL ):
#     # choose values for parameters randomly
#     hp = { k: random.sample(  v, 1 )[0] for k, v in param.items() }
#     print( hp )

#     # model
#     model_xgb = XGBRegressor( objective='reg:squarederror',
#     n_estimators=hp['n_estimators'],
#     eta=hp['eta'],
#     max_depth=hp['max_depth'],
#     subsample=hp['subsample'],
#     colsample_bytree=hp['colsample_bytree'],
#     min_child_weight=hp['min_child_weight'] )
    
#     #Performance
# #     result = cross_validation(x_training, 5, 'XGBoost Regressor', model_xgb, verbose=True)
#     MAE = simple_model_test(model_xgb, 'xgboost')
#     print(MAE)
# #     if MAE<min_:
# #         min_ = MAE
# #         best_param = eval(hp)


        




In [267]:
# best_param = param_list[np.argmin(mae_list)]

# model = XGBRegressor(n_estimators=1000,
#  eta=0.03,
#  max_depth=8,
#  subsample=0.7,
#  colsample_bytree=0.9,
#  min_child_weight=8)

# cross_validation('xgb',model, X_train[cols_selected], y_train)

In [20]:
min_mae = 999999
best_param = {}

#     learning_rate = random.choice(np.arange(0.072, 0.08, 0.002))
#     num_leaves = random.choice(np.arange(60,90,10))
#     max_depth = random.choice(np.arange(60,90,10))
#     min_child_weight = random.choice(np.arange(0.001,0.003,0.0005))
#     n_estimators = random.choice(np.arange(90,110,10))

In [27]:


for i in range(300):
    learning_rate = random.choice(np.arange(0.08, 0.2, 0.005))
    num_leaves = random.choice(np.arange(50,150,20))
    max_depth = random.choice(np.arange(50,150,20))
    min_child_weight = random.choice(np.arange(0.001,0.003,0.0005))
    n_estimators = random.choice(np.arange(50,300,50))
    boosting_type = 'gbdt'
#     n_estimators = random.choice(np.arange(100,200,10))


    model = LGBMRegressor(learning_rate = learning_rate,
                          num_leaves = num_leaves,
                          max_depth =max_depth ,
                         n_estimators = n_estimators,
                          min_child_weight=min_child_weight,
                          boosting_type= boosting_type)
    mae =  cross_validation('lgbm',model, X_train[cols_selected], y_train)
    
#     print(mae)
    if i%10==0:
        print(i+1, 100)
    if mae<min_mae:
        
        min_mae = mae
        best_param['learning_rate']= learning_rate
        best_param['num_leaves']= num_leaves
        best_param['max_depth']= max_depth
        best_param['n_estimators']= n_estimators
        best_param['min_child_weight'] = min_child_weight
        best_param['boosting_type'] = boosting_type
        
        print(f'get better: MAE {mae}')
        print(learning_rate, num_leaves, max_depth, n_estimators)


1 100
get better: MAE 25288.12
0.12000000000000004 130 50 150
get better: MAE 24966.77
0.10000000000000002 70 110 50
11 100
get better: MAE 24920.83
0.09000000000000001 130 50 50
21 100
31 100
41 100
51 100
61 100


KeyboardInterrupt: 

In [30]:
X_train[cols_selected_selected]

Unnamed: 0,versao,cambio,marca,odometro,tipo,ano_modelo,ano_de_fabricacao,modelo
3627,11.018349,0,11.386366,0.186768,11.787826,0.846154,0.891892,11.090824
10864,11.081735,0,11.775875,0.184668,11.574008,0.769231,0.864865,11.476776
36152,12.324877,2,11.801713,0.222466,11.787826,0.769231,0.864865,12.234758
38144,11.860511,2,12.097786,0.194243,11.574008,0.653846,0.783784,11.768316
37361,11.446993,2,11.801713,0.048368,11.574008,0.923077,0.945946,11.386258
...,...,...,...,...,...,...,...,...
18110,11.065291,0,11.549883,0.092808,11.300422,0.884615,0.918919,11.318405
51,11.910565,2,11.549883,0.173251,11.300422,0.769231,0.837838,11.725100
27194,11.708650,2,11.635663,0.238224,11.574008,0.653846,0.756757,11.681362
25804,10.780624,1,11.215537,0.207042,11.300422,0.769231,0.837838,10.954536


In [None]:
cross_validation('', ExtraTreesRegressor(n_estimators=500), X_train[cols_selected], y_train)

In [596]:
simple_model_test(ExtraTreesRegressor(), '')


 MAE: 79395.54
 MAPE: 0.76
 RMSE: 109894.8


79395.54

In [520]:
best_param

{'learning_rate': 0.074,
 'num_leaves': 80,
 'max_depth': 70,
 'n_estimators': 100,
 'min_child_weight': 0.001,
 'boosting_type': 'gbdt'}

In [21]:
model = LGBMRegressor(learning_rate = 0.074,num_leaves = 80,max_depth = 70 ,
                         n_estimators = 100, boosting_type='gbdt', min_child_weight = 0.001)

# send_model(model)

In [522]:
# sns.histplot(pp.RobustScaler().fit_transform(df_raw[['odometro']].values))

In [23]:
cols_selected = ['versao', 'cambio', 'marca', 'odometro', 'tipo', 'ano_modelo',
       'ano_de_fabricacao',  'modelo']

In [24]:
cross_validation('lgbm',model, X_train[cols_selected], y_train)

24941.47

In [397]:
cols_selected

['versao',
 'modelo',
 'cambio',
 'odometro',
 'tipo',
 'ano_modelo',
 'marca',
 'ano_de_fabricacao']

In [583]:
send_model(RandomForestRegressor(min_samples_split= 3,
 max_depth= None,
 min_samples_leaf= 4,
 n_estimators= 1500))

Model submited


In [582]:
cols_selected

['versao',
 'cambio',
 'marca',
 'odometro',
 'tipo',
 'ano_modelo',
 'ano_de_fabricacao',
 'modelo']