In [1]:
# Importing libraries
import joblib

import numpy as np # linear algebra
import pandas as pd # data processing/analys
import lightgbm as lgb # gradient boosting framework that uses tree based learning algorithms
import warnings

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings("ignore")

In [2]:
# Read the train, test and sample-submission CSV files into pandas DataFrames.
train, test, sample_submission = map(
    pd.read_csv,
    ('DATA/train.csv', 'DATA/test.csv', 'DATA/sample_submission.csv'),
)

In [3]:
# Features: every column of `train` except the "revenue" label (dropped before scaling).
X = train.drop(columns=["revenue"])
# Label: "revenue" transformed element-wise with log1p, i.e. log(1 + revenue).
y = train['revenue'].pipe(np.log1p)

# expm1 is the inverse of log1p; predictions made on the log scale have to be
# transformed back this way before they are scored as revenue.
print(np.expm1(y))

In [9]:
# Minimal preprocessing pipeline: a single RobustScaler step, chosen (per the
# original author's note) because it copes with outliers.
pipeline = Pipeline(steps=[("Rbt_scaler", RobustScaler())])

# Fit the scaler on the feature matrix, then scale that same matrix for training.
prepared = pipeline.fit(X).transform(X)


In [10]:
# Serialize the fitted preprocessing pipeline to disk (compression level 1).
joblib.dump(value=pipeline, filename="../model/pipeline.joblib", compress=1)

['../model/pipeline.joblib']

In [11]:
# Baseline LightGBM hyperparameters, written with LightGBM's native parameter names.
# They are forwarded to the scikit-learn wrapper below as keyword arguments.
params = {
    'num_leaves': 30,
    'min_data_in_leaf': 20,
    'objective': 'regression',
    'max_depth': 5,
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
    "bagging_seed": 11,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "verbosity": -1,
}

# `nthread` and `n_jobs` are aliases of the same LightGBM setting (num_threads).
# Passing both with different values (4 and -1) is contradictory, so the thread
# count is given exactly once, through the scikit-learn-style `n_jobs` argument.
lgb_model = lgb.LGBMRegressor(**params, n_estimators=10000, n_jobs=4)


In [12]:
# Candidate values for the randomized search. Every list holds a single value, so
# the search space contains exactly one parameter combination.
Random_Search_Params = {
    "max_depth": [4],
    "min_data_in_leaf": [15],
    'learning_rate': [0.01],
    'num_leaves': [40],
    'boosting_type': ['gbdt'],
    'objective': ['regression'],
    'random_state': [501],
}

# Upper bound on the number of sampled combinations. When all parameters are given
# as lists, RandomizedSearchCV samples without replacement, so at most the number
# of distinct combinations (one, for the grid above) is actually evaluated.
n_HP_points_to_test = 50

random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=Random_Search_Params,
    n_iter=n_HP_points_to_test,
    cv=3,           # 3-fold cross-validation
    refit=True,     # refit the best combination on the full data -> best_estimator_
    random_state=314,
    verbose=True,
)

random_search.fit(prepared, y)

# Write the tuned tree hyperparameters back into the shared `params` dict.
# `min_data_in_leaf` is part of the search as well, so it is copied too; leaving it
# out would keep `params` at the stale baseline value and out of sync with the
# refit best estimator.
params.update({
    tuned_key: random_search.best_params_[tuned_key]
    for tuned_key in ('learning_rate', 'max_depth', 'num_leaves', 'min_data_in_leaf')
})


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.5min finished




In [14]:
# Display the estimator that the search refit (refit=True) with the best parameters found.
random_search.best_estimator_

LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, bagging_seed=11,
              boosting='gbdt', feature_fraction=0.9, lambda_l1=0.2,
              learning_rate=0.01, max_depth=4, metric='rmse',
              min_data_in_leaf=15, n_estimators=10000, nthread=4, num_leaves=40,
              objective='regression', random_state=501, verbosity=-1)

In [20]:
# Serialize the refit best estimator from the search to disk (compression level 1).
joblib.dump(value=random_search.best_estimator_, filename="../model/model.joblib", compress=1)

['../model/model.joblib']