## 0.0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor



pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
df_raw = pd.read_csv('../data/treino.csv')
df_test = pd.read_csv('../data/teste.csv')

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


### 1.0. Data Description

In [2]:
df1 = df_raw.copy()

# rename columns

df1.columns = ['id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
       'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
       'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
       'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
       'aceita_troca', 'dono_unico',
       'todas_revisoes_concessionaria',
       'ipva_pago', 'licenciado',
       'garantia_de_fabrica',
       'todas_revisoes_agenda',
       'alienado', 'preco']

In [3]:
#fill na

df1['num_fotos'] = df1['num_fotos'].fillna(0)

df1 = df1.drop('alienado', axis=1)

na_cols = df1.columns[-8:-1]


for col in na_cols:
    df1[col] = np.where(df1[col].isna(), 0, 1)

df1[df1.T.tail(8).index] = df1[df1.T.tail(8).index].astype('int64')

df1['ano_modelo'] = df1['ano_modelo'].astype('int64')
df1['num_fotos'] = df1['num_fotos'].astype('int64')

## 2.0. Data Filtering

In [4]:
import sweetviz as sv
# my_report = sv.analyze(df1, target_feat='preco')
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

df1 =df1.drop(['elegivel_revisao'], axis=1)

In [7]:
# plt.figure(figsize=(15,7))
# sns.barplot(data=df1[df1['cor'].isin(['Branco', 'Preto', 'Cinza', 'Prata'])], y='preco', x='estado_vendedor', hue='cor', estimator=np.median);

In [6]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='num_portas', estimator=np.mean);

KeyboardInterrupt: 

In [None]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='ipva_pago', estimator=np.mean);

## 3.0. Data Transformation


In [8]:
num_attributes = df1.select_dtypes(exclude='object')



In [9]:
# 'num_fotos',   --min max
#  'ano_de_fabricacao',   --min max  e feature >2015 e <2015
#  'ano_modelo',           --min max  e feature >2015 e <2015
#  'odometro',              --min max 
#  'num_portas',               --min max
#  'entrega_delivery',              0 e  1
#  'troca',                         0 e  1
#  'aceita_troca',            
#  'dono_unico',
#  'todas_revisoes_concessionaria',
#  'ipva_pago',
#  'licenciado',
#  'garantia_de_fabrica',
#  'todas_revisoes_agenda',
#  'preco'                   -- log1p

### 3.1. Rescaling

In [10]:
min_max_cols = ['num_fotos',  
 'ano_de_fabricacao',   
 'ano_modelo',          
 'odometro',            
 'num_portas']

mms = pp.MinMaxScaler()

for column in min_max_cols:

    df1[column] = mms.fit_transform(df1[[column]].values)
    pickle.dump(mms, open(f'../parameters/{column}_scaler.pkl', 'wb'))



df1['preco'] = np.log1p(df1['preco'])

### 3.2. Encoding

In [11]:
df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

#get uf
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

map_cor = {'Preto':'preto', 'Branco':'branco', 'Prata':'prata', 'Cinza':'cinza', 'Dourado':'outros', 'Vermelho':'outros', 'Azul':'outros',
       'Verde':'outros'}

map_regiao={'SP':'sudeste','RS':'sul','MG':'sudeste','PR':'sul','RJ':'sudeste','MA':'nordeste','SC':'sul','AL':'nordeste','BA':'nordeste','GO':'centro_oeste','RN':'nordeste','PE':'nordeste','MT':'centro_oeste','PA':'norte','CE':'nordeste','AM':'nordeste','ES':'sudeste','RO':'norte','PB':'nordeste','TO':'norte','AC':'norte','SE':'nordeste','MS':'centro_oeste','RR':'norte','PI':'nordeste'}


map_cambio = {'Automática': 2, 'Manual' :0, 'CVT' :2, 'Automatizada': 2, 'Semi-automática': 1,
       'Automatizada DCT' : 2, 'Automática Sequencial' : 2}


# target encoders
for column in ['marca', 'modelo', 'versao', 'tipo']:
    target = df1.groupby(column)['preco'].mean()
    df1[column] = df1[column].map(target)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))

# binary
df1['blindado'] = np.where(df1['blindado']=='N', 0, 1)

#map
df1['cor'] = df1['cor'].map(map_cor)
df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# one hot encoding
df1 = pd.get_dummies(df1, columns=['cor', 'estado_vendedor',  'tipo_vendedor'])


for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

## 4.0. Feature Selection

In [12]:
df2 = df1.drop('id', axis=1).copy()

In [13]:
y = df2['preco']
X = df2.drop('preco', axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [15]:
# #training and test dataset for Boruta
X_train_n = X_train.values
y_train_n = y_train.values.ravel()

In [16]:
# #define model
# rf = RandomForestRegressor(n_jobs=-1, n_estimators=1000)

# #define boruta
# boruta= BorutaPy(rf, n_estimators='auto', verbose=2, random_state=41).fit(X_train_n, y_train_n)

### Selected cols from boruta

In [17]:
# cols_selected = boruta.support_.tolist()

#  #best features
# X_train_fs = X_train
# cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()

# #not selected boruta
# cols_not_selected_boruta = list(np.setdiff1d(X_train_fs.columns , cols_selected_boruta))

# cols_selected_boruta

### Extra trees feature importancia

In [18]:
# # #model definition
# forest = ExtraTreesRegressor(n_estimators=250, random_state=3, n_jobs=-1)


# # #data preparation
# x_train_n = X_train 
# y_train_n = y_train.values
# forest.fit(X_train, y_train_n)



# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# indices = np.argsort(importances)[::-1]

# #print the feature ranking
# print('Feature ranking:')
# df = pd.DataFrame()
# for i, j in zip(x_train_n, forest.feature_importances_):
#     aux = pd.DataFrame({'feature':i, 'importance': j},index=[0])
#     df = pd.concat([df,aux],axis=0)
    
# print(df.sort_values('importance', ascending=False))

# #Plot the impurity-based feature importances of the forest
# plt.figure()
# plt.title('Feature Importances')
# plt.bar(range(x_train_n.shape[1]),importances[indices], color='r', yerr=std[indices], align='center')
# plt.xticks(range(x_train_n.shape[1]),indices)
# plt.xlim([-1, x_train_n.shape[1]])
# plt.show();

# cols_selected = df.sort_values('importance', ascending=False).head(8)['feature'].values
cols_selected = ['versao', 'modelo', 'cambio', 'odometro', 'tipo', 'ano_modelo',
       'marca', 'ano_de_fabricacao']

## Model Train

### Linear Regression

In [28]:
y_test_ = np.expm1(y_test)


def simple_model_test(model, nome):
    #model definition

    # model fit
    model.fit(X_train, y_train)

    #model predict
    y_hat = model.predict(X_test)
    y_hat_ = np.expm1(y_hat)
    
    MAPE = np.round(mean_absolute_percentage_error( y_test_, y_hat_ ), 2)
    MAE = np.round(mean_absolute_error(y_test_, y_hat_), 2)
    RMSE = np.round(np.sqrt(mean_squared_error(y_test_, y_hat_)), 2)
    
    
    print('{}\n MAE: {}\n MAPE: {}\n RMSE: {}'.format(nome, MAE, MAPE, RMSE))
    return MAE

In [20]:
simple_model_test(LinearRegression(), 'Linear Regressor')
simple_model_test(RandomForestRegressor(), 'RandomForestRegressor')
simple_model_test(LGBMRegressor(), 'LGBMRegressor')
simple_model_test(Lasso(), 'Lasso')
simple_model_test(XGBRegressor(), 'XGBRegressor')

Linear Regressor
 MAE: 26085.97
 MAPE: 0.21
 RMSE: 41451.71


KeyboardInterrupt: 

## Cross Validation

In [35]:
# stratified cross validation
def cross_validation(model_name, model, x_train, y_train):

    rmse_list = []
    mae_list = []
    mape_list = []

    skf = KFold(n_splits=5, shuffle=True, random_state=5)

    for train_index, test_index in skf.split(x_train, y_train):

        x_train_cv = x_train.iloc[train_index]
        y_train_cv = y_train.iloc[train_index]

        x_test_cv = x_train.iloc[test_index]
        y_test_cv = y_train.iloc[test_index]


        # model training
        model.fit(x_train_cv, y_train_cv)

        # prediction
        y_hat = model.predict(x_test_cv)

        y_hat_ = np.expm1(y_hat)
        y_test_cv = np.expm1(y_test_cv)

        # metrics
        MAPE = np.round(mean_absolute_percentage_error( y_test_cv, y_hat_ ), 2)
        MAE = np.round(mean_absolute_error(y_test_cv, y_hat_), 2)
        RMSE = np.round(np.sqrt(mean_squared_error(y_test_cv, y_hat_)), 2)
#         print(MAPE)
        # append to list
        rmse_list.append(RMSE)
        mae_list.append(MAE) 
        mape_list.append(MAPE)

    avg_mae, stf_mae = np.round( np.mean( mae_list ), 2 ), np.round( np.std( mae_list ), 2 )
    avg_mape, stf_mape = np.round( np.mean( mape_list ), 2 ), np.round( np.std( mape_list ), 2 )
    avg_rmse, stf_rmse = np.round( np.mean( rmse_list ), 2 ), np.round( np.std( rmse_list ), 2 )

    return avg_mae

In [39]:
def send_model(model):
     # model fit
    model.fit(X_train[cols_selected], y_train)
    pickle.dump(model, open('../parameters/model.pkl', 'wb'))
    print('Model submited')
    return None

In [None]:
for num in np.arange(50, 200, 50):

    model = RandomForestRegressor(n_estimators =num,)
    print(num, cross_validation('LGBM',model, X_train[cols_selected], y_train))

50 26697.87
100 26516.5


In [102]:
model = RandomForestRegressor(n_estimatores)

In [40]:
send_model(model)

Model submited


In [29]:
parameters = pd.DataFrame()


## Hyperparameter Tuning

In [43]:
 param = {'n_estimators': [500, 1000, 1500, 1300, 1200, 1400],
'eta': [0.01, 0.03, 0.02],
'max_depth': [ 5, 9, 7, 8],
'subsample': [0.3, 0.5, 0.7],
'colsample_bytree': [0.3, 0.7, 0.9],
'min_child_weight': [3, 8, 15]
}

MAX_EVAL = 5

min_=30000
import random
final_result = pd.DataFrame()

for i in range( MAX_EVAL ):
    # choose values for parameters randomly
    hp = { k: random.sample(  v, 1 )[0] for k, v in param.items() }
    print( hp )

    # model
    model_xgb = XGBRegressor( objective='reg:squarederror',
    n_estimators=hp['n_estimators'],
    eta=hp['eta'],
    max_depth=hp['max_depth'],
    subsample=hp['subsample'],
    colsample_bytree=hp['colsample_bytree'],
    min_child_weight=hp['min_child_weight'] )
    
    #Performance
#     result = cross_validation(x_training, 5, 'XGBoost Regressor', model_xgb, verbose=True)
    MAE = simple_model_test(model_xgb, 'xgboost')
    if MAE < min_:
        min_ = MAE
        fine_params = hp
        
    parameters['param'] = str(hp)
    parameters['mae'] = MAE
#     final_result = pd.concat([final_result, result])



{'n_estimators': 1000, 'eta': 0.02, 'max_depth': 8, 'subsample': 0.3, 'colsample_bytree': 0.7, 'min_child_weight': 3}
xgboost
 MAE: 23866.37
 MAPE: 0.19
 RMSE: 37896.71
{'n_estimators': 1300, 'eta': 0.03, 'max_depth': 9, 'subsample': 0.3, 'colsample_bytree': 0.9, 'min_child_weight': 8}
xgboost
 MAE: 24207.06
 MAPE: 0.19
 RMSE: 38478.44
{'n_estimators': 1200, 'eta': 0.02, 'max_depth': 9, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min_child_weight': 3}
xgboost
 MAE: 23774.06
 MAPE: 0.19
 RMSE: 37803.94
{'n_estimators': 1500, 'eta': 0.01, 'max_depth': 5, 'subsample': 0.3, 'colsample_bytree': 0.9, 'min_child_weight': 3}
xgboost
 MAE: 24250.14
 MAPE: 0.19
 RMSE: 38225.28
{'n_estimators': 1500, 'eta': 0.02, 'max_depth': 5, 'subsample': 0.3, 'colsample_bytree': 0.7, 'min_child_weight': 8}
xgboost
 MAE: 24098.34
 MAPE: 0.19
 RMSE: 38164.98


In [37]:
model = XGBRegressor(n_estimators=1000, eta=0.01, max_depth=9, subsample=0.7, colsample_bytree=0.7,min_child_weight=15)
cross_validation('xgb',model, X_train[cols_selected], y_train)

25031.65

In [None]:
send_mode

In [None]:
# {'n_estimators': 1000, 'eta': 0.01, 'max_depth': 9, 'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 15}
# xgboost
#  MAE: 23804.12
#  MAPE: 0.19
#  RMSE: 37775.52

