In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from category_encoders.count import CountEncoder
from pycaret.regression import *

## Data ingest (Carregar dados)

In [5]:
df_train = pd.read_csv('datasets/treino.csv')
df_test = pd.read_csv('datasets/teste.csv')

In [6]:
y = df_train['preco']
X = df_train.drop(columns=['ID', 'preco'])


In [7]:
cat_cols = df_train.select_dtypes(include=['object']).columns[1:]

te = CountEncoder(cols=cat_cols, return_df=True)
X_pp = te.fit_transform(df_train.drop(columns=['ID', 'preco']), df_train['preco'])

X_pp = X_pp.fillna(0)

In [8]:
df_pp = pd.concat([X_pp,y], axis=1)

In [14]:
exp_name = setup(data = df_pp,  target = 'preco', fold_shuffle=True, session_id=2)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,preco
2,Original Data,"(39446, 28)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(27612, 50)"


In [15]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,28083.6385,1952002342.4,44099.6988,0.7161,0.2962,0.2389,6.835
rf,Random Forest Regressor,28001.4835,2012815441.5896,44768.4246,0.7074,0.2933,0.2411,11.53
lightgbm,Light Gradient Boosting Machine,28978.1248,2015031954.8897,44796.8344,0.707,0.3025,0.2524,0.422
et,Extra Trees Regressor,30194.2276,2281966289.0653,47672.5955,0.6682,0.3112,0.2593,11.77
gbr,Gradient Boosting Regressor,34308.4395,2615181159.6129,51067.7008,0.6193,0.3525,0.3082,3.795
lasso,Lasso Regression,43684.757,3835906534.4,61857.7023,0.4415,0.4799,0.4116,1.449
ridge,Ridge Regression,43683.4285,3835677465.6,61855.523,0.4415,0.4819,0.4116,0.048
llar,Lasso Least Angle Regression,43625.651,3836819380.1452,61865.426,0.4414,0.477,0.4105,0.049
lr,Linear Regression,44352.7109,3908158054.4,62444.207,0.4307,0.4892,0.418,1.462
lar,Least Angle Regression,43895.53,3916963215.8089,62463.2294,0.4301,0.4816,0.4141,0.058


# Random Grid Search

In [9]:
import lightgbm as lgb

https://medium.com/@sergei740/hyperparameter-tuning-lightgbm-using-random-grid-search-dc11c2f8c805

In [17]:
#Set the minimum error arbitrarily large
min = 99999999999999999999999 
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 5
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count
    try:
        d_train = lgb.Dataset(X_pp, label=y) #Load in data
        params = {} #initialize parameters
        params['learning_rate'] = np.random.uniform(0, 1)
        params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
        params['objective'] = 'regression'
        params['metric'] = 'mae'
        params['sub_feature'] = np.random.uniform(0, 1)
        params['num_leaves'] = np.random.randint(20, 300)
        params['min_data'] = np.random.randint(10, 100)
        params['max_depth'] = np.random.randint(5, 200)
        iterations = np.random.randint(10, 10000)
        print(params, iterations)
        #Train using selected parameters
        clf = lgb.train(params, d_train, iterations)
        y_pred=clf.predict(x_test) #Create predictions on test set
        mae=mean_absolute_error(y_pred,y_test)
        print('MAE:', mae)
        if mae < min:
            min = mae
            pp = params 

    except: #in case something goes wrong
        print('failed with')
        print(params)
print("*" * 5)
print('Minimum is: ', min)
print('Used params', pp)

iteration number 0
{'learning_rate': 0.43599490214200376, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9315408658211117, 'num_leaves': 95, 'min_data': 17, 'max_depth': 167} 443
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 39446, number of used features: 25
[LightGBM] [Info] Start training from score 133385.874346
failed with
{'learning_rate': 0.43599490214200376, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9315408658211117, 'num_leaves': 95, 'min_data': 17, 'max_depth': 167}
iteration number 1
{'learning_rate': 0.2046486340378425, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.11995054259638982, 'num_leaves': 238, 'min_data': 30, 'max_depth': 129} 3710
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wis

In [18]:
lg = lgb.LGBMRegressor(boosting_type = 'gbdt', 
                       learning_rate = 0.5, 
                       objective = 'regression',
                       metric = 'mae',
                       sub_feature = '1',
                       min_data = 50,
                       num_leaves = 100,
                       max_depth = 100,
                       n_estimators = 1000,
                       )


In [19]:
lg

LGBMRegressor(learning_rate=0.5, max_depth=100, metric='mae', min_data=50,
              n_estimators=1000, num_leaves=100, objective='regression',
              sub_feature='1')

In [13]:
lg.fit(X_pp,y)



LGBMRegressor(learning_rate=0.5, max_depth=100, metric='mae', min_data=50,
              num_leaves=100, objective='regression', sub_feature='1')

In [20]:
cross_validate(lg,X_pp,y,cv=5, scoring = 'neg_mean_absolute_error')



{'fit_time': array([26.79805398, 14.14591837, 13.36167073, 13.26213288, 19.3214519 ]),
 'score_time': array([1.38829088, 1.16934967, 1.07487488, 1.04356313, 1.08997202]),
 'test_score': array([-32378.78653794, -32272.68584767, -31872.20036036, -32705.78488292,
        -32210.45789616])}

In [15]:
from xgboost import XGBRegressor

In [16]:
xgb = XGBRegressor()

In [17]:
cross_validate(xgb, X_pp, y, cv=5 , scoring = 'neg_mean_absolute_error')

{'fit_time': array([8.69047904, 7.93983102, 8.19033098, 7.887043  , 7.65695572]),
 'score_time': array([0.0628109 , 0.07897496, 0.09733987, 0.06109405, 0.05985594]),
 'test_score': array([-27447.44848654, -27776.9627502 , -27419.58262113, -28078.66509503,
        -27644.96926975])}