In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from category_encoders.count import CountEncoder
from pycaret.regression import *

## Data ingest (Carregar dados)

In [2]:
df_train = pd.read_csv('datasets/treino.csv')
df_test = pd.read_csv('datasets/teste.csv')

In [3]:
y = df_train['preco']
X = df_train.drop(columns=['ID', 'preco'])


In [4]:
cat_cols = df_train.select_dtypes(include=['object']).columns[1:]

ce = CountEncoder(cols=cat_cols, return_df=True)
X_pp = ce.fit_transform(df_train.drop(columns=['ID', 'preco']), df_train['preco'])

X_pp = X_pp.fillna(0)

In [5]:
X_pp = X_pp.fillna(0)
df_pp = pd.concat([X_pp,y], axis=1)

In [14]:
exp_name = setup(data = df_pp,  target = 'preco', fold_shuffle=True, session_id=2)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,preco
2,Original Data,"(39446, 28)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(27612, 50)"


In [15]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,28083.6385,1952002342.4,44099.6988,0.7161,0.2962,0.2389,6.835
rf,Random Forest Regressor,28001.4835,2012815441.5896,44768.4246,0.7074,0.2933,0.2411,11.53
lightgbm,Light Gradient Boosting Machine,28978.1248,2015031954.8897,44796.8344,0.707,0.3025,0.2524,0.422
et,Extra Trees Regressor,30194.2276,2281966289.0653,47672.5955,0.6682,0.3112,0.2593,11.77
gbr,Gradient Boosting Regressor,34308.4395,2615181159.6129,51067.7008,0.6193,0.3525,0.3082,3.795
lasso,Lasso Regression,43684.757,3835906534.4,61857.7023,0.4415,0.4799,0.4116,1.449
ridge,Ridge Regression,43683.4285,3835677465.6,61855.523,0.4415,0.4819,0.4116,0.048
llar,Lasso Least Angle Regression,43625.651,3836819380.1452,61865.426,0.4414,0.477,0.4105,0.049
lr,Linear Regression,44352.7109,3908158054.4,62444.207,0.4307,0.4892,0.418,1.462
lar,Least Angle Regression,43895.53,3916963215.8089,62463.2294,0.4301,0.4816,0.4141,0.058


# Random Grid Search

https://medium.com/@sergei740/hyperparameter-tuning-lightgbm-using-random-grid-search-dc11c2f8c805

In [17]:
#Set the minimum error arbitrarily large
min = 99999999999999999999999 
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 5
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count
    try:
        d_train = lgb.Dataset(X_pp, label=y) #Load in data
        params = {} #initialize parameters
        params['learning_rate'] = np.random.uniform(0, 1)
        params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
        params['objective'] = 'regression'
        params['metric'] = 'mae'
        params['sub_feature'] = np.random.uniform(0, 1)
        params['num_leaves'] = np.random.randint(20, 300)
        params['min_data'] = np.random.randint(10, 100)
        params['max_depth'] = np.random.randint(5, 200)
        iterations = np.random.randint(10, 10000)
        print(params, iterations)
        #Train using selected parameters
        clf = lgb.train(params, d_train, iterations)
        y_pred=clf.predict(x_test) #Create predictions on test set
        mae=mean_absolute_error(y_pred,y_test)
        print('MAE:', mae)
        if mae < min:
            min = mae
            pp = params 

    except: #in case something goes wrong
        print('failed with')
        print(params)
print("*" * 5)
print('Minimum is: ', min)
print('Used params', pp)

iteration number 0
{'learning_rate': 0.43599490214200376, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9315408658211117, 'num_leaves': 95, 'min_data': 17, 'max_depth': 167} 443
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 39446, number of used features: 25
[LightGBM] [Info] Start training from score 133385.874346
failed with
{'learning_rate': 0.43599490214200376, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9315408658211117, 'num_leaves': 95, 'min_data': 17, 'max_depth': 167}
iteration number 1
{'learning_rate': 0.2046486340378425, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.11995054259638982, 'num_leaves': 238, 'min_data': 30, 'max_depth': 129} 3710
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wis

In [8]:
lg = LGBMRegressor( boosting_type = 'gbdt', 
                    learning_rate = 0.5, 
                    objective = 'regression',
                    metric = 'mae',
                    sub_feature = '1',
                    min_data = 50,
                    num_leaves = 100,
                    max_depth = 100,
                    )


In [9]:
lg

LGBMRegressor(learning_rate=0.5, max_depth=100, metric='mae', min_data=50,
              num_leaves=100, objective='regression', sub_feature='1')

In [10]:
lg.fit(X_pp,y)



LGBMRegressor(learning_rate=0.5, max_depth=100, metric='mae', min_data=50,
              num_leaves=100, objective='regression', sub_feature='1')

In [11]:
cross_validate(lg,X_pp,y,cv=5, scoring = 'neg_mean_absolute_error')



{'fit_time': array([3.31708789, 2.80212402, 2.82961988, 2.8332479 , 2.6179018 ]),
 'score_time': array([0.1187799 , 0.16395307, 0.11828399, 0.10785317, 0.11069012]),
 'test_score': array([-29892.37072991, -29440.65063701, -29180.58118895, -30046.27720409,
        -29376.2427185 ])}

In [15]:
from xgboost import XGBRegressor

In [16]:
xgb = XGBRegressor()

In [17]:
cross_validate(xgb, X_pp, y, cv=5 , scoring = 'neg_mean_absolute_error')

{'fit_time': array([8.69047904, 7.93983102, 8.19033098, 7.887043  , 7.65695572]),
 'score_time': array([0.0628109 , 0.07897496, 0.09733987, 0.06109405, 0.05985594]),
 'test_score': array([-27447.44848654, -27776.9627502 , -27419.58262113, -28078.66509503,
        -27644.96926975])}

# Target Encoding

In [12]:
from category_encoders import TargetEncoder

In [15]:
te = TargetEncoder(cols=cat_cols, return_df=True)
X_te = te.fit_transform(df_train.drop(columns=['ID', 'preco']), df_train['preco'])
X_te

Unnamed: 0,num_fotos,marca,modelo,versao,ano_de_fabricacao,ano_modelo,odometro,cambio,num_portas,tipo,blindado,cor,tipo_vendedor,cidade_vendedor,estado_vendedor,tipo_anuncio,entrega_delivery,troca,elegivel_revisao,attr_veiculo_aceita_troca,attr_veiculo_único_dono,attr_veiculo_todas_as_revisões_feitas_pela_concessionária,attr_veiculo_ipva_pago,attr_veiculo_licenciado,attr_veiculo_garantia_de_fábrica,attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro,attr_veiculo_alienado
0,8.0,118864.653876,182866.374487,131535.788104,2017,2017.0,55672.0,146579.444142,4,159605.703146,132761.368548,137370.648821,139405.670277,126858.181814,130172.754418,139388.581834,False,False,False,137292.198975,128855.668778,155175.790175,132517.360059,131832.036820,127083.012803,152716.869063,
1,8.0,223453.245531,265969.455964,260124.440884,2017,2017.0,47858.0,146579.444142,4,225908.426740,132761.368548,130548.873396,139405.670277,132172.449766,142683.743177,139388.581834,False,False,False,137292.198975,128855.668778,155175.790175,132517.360059,135200.506546,127083.012803,152716.869063,
2,14.0,93135.094963,88113.881275,60509.440259,2012,2013.0,122323.0,146579.444142,4,118647.979849,132761.368548,95447.326849,124173.335282,132172.449766,130172.754418,122900.784683,True,True,False,137292.198975,128855.668778,123583.735935,135108.255819,135200.506546,127083.012803,128542.554079,
3,8.0,316500.644566,404300.455891,302878.465349,2020,2021.0,14207.0,146579.444142,2,159605.703146,132761.368548,130548.873396,139405.670277,126858.181814,130172.754418,139388.581834,False,False,False,137292.198975,128855.668778,155175.790175,135108.255819,135200.506546,127083.012803,128542.554079,
4,8.0,197789.362549,258674.002124,173976.599553,2015,2016.0,43760.0,146579.444142,4,225908.426740,132761.368548,137370.648821,139405.670277,163260.666402,130172.754418,139388.581834,False,False,False,137292.198975,128855.668778,155175.790175,135108.255819,135200.506546,127083.012803,152716.869063,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39441,12.0,118864.653876,54820.282462,64365.126207,2015,2015.0,73755.0,80328.343354,4,118647.979849,132761.368548,95447.326849,124173.335282,101398.919243,100976.198417,122900.784683,False,False,False,137292.198975,141713.525225,123583.735935,132517.360059,131832.036820,127083.012803,128542.554079,
39442,17.0,200118.573156,110893.345690,115389.974308,2016,2017.0,78859.0,146579.444142,4,92666.521106,132761.368548,130548.873396,124173.335282,161197.162613,130172.754418,122900.784683,True,True,False,137292.198975,128855.668778,123583.735935,135108.255819,135200.506546,127083.012803,128542.554079,
39443,8.0,122361.749137,124756.346147,127504.577807,2021,2022.0,9793.0,146579.444142,4,118647.979849,132761.368548,198371.819072,139405.670277,126858.181814,130172.754418,139388.581834,False,False,False,122209.105666,141713.525225,155175.790175,132517.360059,131832.036820,170037.863615,152716.869063,
39444,15.0,118864.653876,182866.374487,179221.915854,2018,2019.0,51592.0,146579.444142,4,118647.979849,132761.368548,130548.873396,124173.335282,137959.439901,130172.754418,131115.176021,True,False,False,137292.198975,128855.668778,123583.735935,135108.255819,135200.506546,127083.012803,128542.554079,


## PyCaret

In [18]:
X_te = X_te.fillna(0)
df_te = pd.concat([X_te,y], axis=1)

5m 12.4s

In [19]:
exp_name_te = setup(data = df_te,  target = 'preco', fold_shuffle=True, session_id=2)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,preco
2,Original Data,"(39446, 28)"
3,Missing Values,False
4,Numeric Features,14
5,Categorical Features,13
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(27612, 29)"


5m 38.1s

In [20]:
best_model_te = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,25538.9489,1664572422.3627,40704.208,0.7581,0.2624,0.2128,0.362
rf,Random Forest Regressor,25635.8181,1692767830.7693,41051.8367,0.754,0.2629,0.2121,9.97
xgboost,Extreme Gradient Boosting,25724.8305,1704413875.2,41187.1457,0.7524,0.2655,0.2129,4.662
gbr,Gradient Boosting Regressor,26352.9111,1745853318.2725,41720.2796,0.7459,0.2727,0.2224,2.734
et,Extra Trees Regressor,26351.7143,1822271824.7481,42598.8777,0.735,0.2698,0.2167,9.303
lasso,Lasso Regression,28008.2072,1865505945.6,43141.0719,0.7283,0.3314,0.2444,0.703
ridge,Ridge Regression,28008.5926,1865521779.2,43141.2504,0.7283,0.3299,0.2444,0.032
lar,Least Angle Regression,28009.2771,1865537823.9487,43141.4378,0.7283,0.3308,0.2444,0.03
llar,Lasso Least Angle Regression,27957.2009,1866108641.7837,43147.976,0.7282,0.3213,0.2436,0.035
lr,Linear Regression,28024.201,1869958400.0,43193.6527,0.7277,0.3218,0.2439,1.102


In [21]:
best_model_te

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Model development (treino do modelo)

In [32]:
lgb_te = LGBMRegressor(random_state=42)
lgb_te.fit(X_te, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [33]:
cross_validate(lgb_te, X_te, y, cv=5 , scoring = 'neg_mean_absolute_error')

{'fit_time': array([2.1498692 , 1.41065621, 1.12146378, 1.11571002, 1.12784386]),
 'score_time': array([0.06631017, 0.07766986, 0.07091117, 0.06538701, 0.06647801]),
 'test_score': array([-25640.3945284 , -25493.68995401, -24812.6449792 , -25509.34482423,
        -25284.30173385])}

In [46]:
cross_validate(lgb_te, X_te, y, cv=5 , scoring = 'r2')


{'fit_time': array([3.78340578, 1.47349882, 1.23296404, 1.39820004, 1.12553692]),
 'score_time': array([0.08545709, 0.08214307, 0.07252979, 0.07718396, 0.08854103]),
 'test_score': array([0.73701339, 0.78081417, 0.77081394, 0.75765325, 0.75265389])}

## Feature Eng. (Aplicar mesmas transformações no dataset de teste)

In [37]:
X_test_te = te.transform(df_test.drop(columns=['ID']))
X_test_te = X_test_te.fillna(0)

## Predict the test dataset (Prever preços) 

In [38]:
pred = lgb_te.predict(X_test_te)

## Final solution (gerar base final para envio no kaggle)
25951.79698

In [39]:
df_final = pd.concat([df_test[['ID']], pd.Series(pred, name='preco')], axis=1)
df_final.head()

Unnamed: 0,ID,preco
0,24813264385557040124808779273028388499,58839.064853
1,295636316453795508942188530111300065666,104275.398884
2,101258309166227950735244624080888109884,91361.110385
3,28348734455782469411126661985772047409,77564.661804
4,193163160502972147671913739170248305797,107246.815874


In [40]:
df_final.to_csv('submission_lgb_te.csv', index=False) #Arquivo de submissão pronto para ser enviado.

## Aplicando Logaritmo no Preço
Kaggle: 25472.58227

In [41]:
y_log = np.log(y)

In [42]:
lgb_te_log = LGBMRegressor(random_state=42)
lgb_te_log.fit(X_te, y_log)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [43]:
cross_validate(lgb_te_log, X_te, y_log, cv=5 , scoring = 'neg_mean_absolute_error')


{'fit_time': array([2.10160017, 1.54752111, 1.40292501, 1.20840478, 1.14586973]),
 'score_time': array([0.10065198, 0.062006  , 0.05999422, 0.06848311, 0.05977321]),
 'test_score': array([-0.19282887, -0.19275201, -0.18955417, -0.19244424, -0.19413899])}

In [45]:
cross_validate(lgb_te_log, X_te, y_log, cv=5 , scoring = 'r2')


{'fit_time': array([1.6535759 , 1.25450611, 1.17486787, 1.18305302, 1.15058208]),
 'score_time': array([0.07926488, 0.07475781, 0.07008886, 0.07551694, 0.07213116]),
 'test_score': array([0.79699405, 0.80686166, 0.80571425, 0.79931438, 0.79466372])}

In [47]:
pred = lgb_te_log.predict(X_test_te)


In [48]:
df_final = pd.concat([df_test[['ID']], pd.Series(np.exp(pred), name='preco')], axis=1)
df_final.head()

Unnamed: 0,ID,preco
0,24813264385557040124808779273028388499,60994.236623
1,295636316453795508942188530111300065666,102218.210059
2,101258309166227950735244624080888109884,91039.625178
3,28348734455782469411126661985772047409,72737.379436
4,193163160502972147671913739170248305797,104864.091051


In [49]:
df_final.to_csv('submission_lgb_te_log.csv', index=False) #Arquivo de submissão pronto para ser enviado.


## Pycaret com logaritmo no preço
Melhor modelo: LightGBM

In [51]:
df_te_log = pd.concat([X_te,np.log(y)], axis=1)

26m 57.3s

In [53]:
exp_name_te_log = setup(data = df_te_log,  target = 'preco', fold_shuffle=True, session_id=2)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,preco
2,Original Data,"(39446, 28)"
3,Missing Values,False
4,Numeric Features,14
5,Categorical Features,13
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(27612, 29)"


5m 37.5s

In [54]:
best_model_te_log = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1935,0.065,0.2548,0.7996,0.0203,0.0167,0.317
xgboost,Extreme Gradient Boosting,0.1949,0.0657,0.2563,0.7972,0.0205,0.0169,4.401
rf,Random Forest Regressor,0.1946,0.0665,0.2579,0.7947,0.0206,0.0168,9.861
gbr,Gradient Boosting Regressor,0.2005,0.0698,0.2642,0.7846,0.0211,0.0173,2.64
et,Extra Trees Regressor,0.1996,0.0705,0.2654,0.7825,0.0212,0.0172,9.307
knn,K Neighbors Regressor,0.2165,0.0814,0.2853,0.7488,0.0228,0.0187,0.381
ada,AdaBoost Regressor,0.2254,0.0869,0.2948,0.7319,0.0235,0.0195,2.571
lar,Least Angle Regression,0.2271,0.0886,0.2976,0.7267,0.0237,0.0196,0.044
br,Bayesian Ridge,0.2271,0.0886,0.2976,0.7267,0.0237,0.0196,0.127
ridge,Ridge Regression,0.2271,0.0886,0.2976,0.7267,0.0237,0.0196,0.023
