<a href="https://colab.research.google.com/github/kyokicchi/ggl_an_codes/blob/master/trainLGBmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train model

In [0]:
!pip install lightgbm

In [0]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
import pickle

In [0]:
# Mount Google Drive under /content/gdrive so the project folder used by the
# following cells is reachable from this Colab runtime.
drive.mount('/content/gdrive')

In [0]:
%cd ../content/gdrive/My Drive/projects/ggl_an/
%ls

In [0]:
# CSV file holding the training data.
s_data = 'user_train.csv'
# File the fitted model is pickled to.
filename = 'model_LGB.sav'
# Name of the target column.
s_col_tgt = 'transactionRevenue'
# Columns removed from the feature matrix; currently only the target itself.
l_col_drop = [s_col_tgt]

In [0]:
%%time
# Load the training table; the first CSV column becomes the DataFrame index.
# NOTE(review): the prediction section reads this same file with
# dtype={'fullVisitorId': 'str'} while this read does not — confirm the two
# loads are meant to parse that column differently.
df_train = pd.read_csv(s_data, index_col = 0)

In [0]:
%%time
df_y = df_train[s_col_tgt]
df_x = df_train.drop(l_col_drop, axis=1)

In [0]:
%%time
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y)



In [0]:
%%time
from sklearn.model_selection import GridSearchCV

def applyGSCV(model, param, X, Y, cv=3):
    """Run an exhaustive grid search over ``param`` and return the fitted search.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        param: Parameter grid to search (passed through unchanged).
        X: Training features.
        Y: Training target.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults
            to 3, the value that used to be hard-coded here.

    Returns:
        The ``GridSearchCV`` object after ``fit(X, Y)`` has been called on it.
    """
    res = GridSearchCV(model, param, cv=cv)
    res.fit(X, Y)
    return res


# Baseline LightGBM regressor; the grid below supplies the searched values.
model = lgb.LGBMRegressor()

# 2 x 2 x 2 = 8 hyperparameter combinations.
param = dict(
    learning_rate=[0.1, 0.15],
    n_estimators=[200, 500],
    num_leaves=[50, 100],
)

# Fit the grid search on the training split only.
model_LGB = applyGSCV(model, param, x_train, y_train)



In [0]:
# Show the best cross-validation score, then the estimator that achieved it.
print(model_LGB.best_score_, model_LGB.best_estimator_, sep='\n')


In [0]:
%%time
model = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=400, n_jobs=-1, num_leaves=50, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

model.fit(x_train, 
          y_train,
          eval_set=[(x_train, y_train),(x_test, y_test)],
          eval_metric='rmse',
          early_stopping_rounds = 300)

In [0]:
%%time
pickle.dump(model_LGB, open(filename, 'wb'))


# Predict / Output

In [0]:
%%time

model = pickle.load(open('model_LGB.sav', 'rb'))

In [0]:
%%time
# Load the user-level train and test tables. The first CSV column becomes the
# index and fullVisitorId is read as a string.
df_user_train = pd.read_csv('user_train.csv', index_col = 0, dtype={'fullVisitorId': 'str'})
df_user_test = pd.read_csv('user_test.csv', index_col = 0, dtype={'fullVisitorId': 'str'})


In [0]:

# Split the training table into features and target, and strip the target
# column from the test table to obtain the frame to predict on.
# Note: df_x / df_y are rebound here, replacing the frames of the training section.
df_x = df_user_train.drop(columns=['transactionRevenue'])
df_y = df_user_train.loc[:, 'transactionRevenue']
df_tgt = df_user_test.drop(columns=['transactionRevenue'])

# Display the three shapes as a quick sanity check.
(df_x.shape, df_y.shape, df_tgt.shape)


In [0]:
%time

from sklearn.metrics import mean_squared_error

pred = model.predict(df_x)
mse = mean_squared_error(df_y, pred)
rmse = np.sqrt(mse)
print(rmse)

In [0]:
%%time
# Predict the target for every row of the test feature frame.
pred_tgt = model.predict(df_tgt)


In [0]:
# Sanity check: shape of the prediction array.
pred_tgt.shape

In [0]:
# Frequency of each training-target value that is <= 10.0.
# NOTE(review): presumably used to choose the 5.0 cut-off applied to the
# predictions in the next cell — confirm.
df_y[df_y<=10.0].value_counts()

In [0]:
# Zero out every prediction below 5.0 (in place), then wrap the predictions in a
# single-column frame that shares the test table's index.
pred_tgt[pred_tgt < 5.0] = 0
df_out = pd.DataFrame({'PredictedLogRevenue': pred_tgt}, index=df_tgt.index)


In [0]:
# Preview the first rows of the output frame.
df_out.head()

In [0]:
# Summary statistics of the thresholded predictions.
df_out.describe()

In [0]:
%%time
# Write the predictions (index plus PredictedLogRevenue column) to CSV in the
# current working directory.
df_out.to_csv('output_LGB.csv')