# EDA

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [2]:
# Directory layout, relative to the notebook: inputs, scratch space, generated outputs.
input_dir, working_dir, output_dir = '../input/', '../working/', '../output/'

In [6]:
# Load the pickled train/test frames and the sample submission from input_dir.
train, test = (
    pd.read_pickle(os.path.join(input_dir, pkl_name))
    for pkl_name in ('train.pkl', 'test.pkl')
)
sample_submission = pd.read_csv(
    os.path.join(input_dir, 'sample_submission.csv'),
    # Read visitor ids as strings instead of letting pandas infer a dtype.
    dtype={'fullVisitorId': 'str'},
)

In [8]:
# Columns left out of the feature matrix (reused later when scoring the test set).
drop_cols = ['date', 'fullVisitorId', 'sessionId', 'visitId', 'visitStartTime', 'gclId', 'source']

# Target: transactionRevenue, with missing values replaced by 0.
y = train['transactionRevenue'].fillna(0)

# Features: everything in train except the dropped columns and the target itself.
X = train.drop(columns=drop_cols + ['transactionRevenue'])

In [9]:
# Search grid: a single learning rate, n_estimators swept over 20, 40, ..., 180.
gbm_params = {
    'learning_rate': [0.2],
    'n_estimators': list(range(20, 200, 20)),
    # 'num_leaves': [100],
}

# 5-fold grid search over gbm_params, wrapping a seeded LightGBM regressor.
gbm = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=0),
    param_grid=gbm_params,
    cv=5,
)

In [10]:
# Run the grid search on the full training features and target.
# Left as the cell's last expression so the fitted search object is displayed.
gbm.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=0,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.2], 'n_estimators': [20, 40, 60, 80, 100, 120, 140, 160, 180]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# In-sample predictions of the fitted grid search on the training features.
# Carry over X's index: a bare pd.Series(array) would get a fresh 0..n-1 index,
# and a column-wise concat with y (which keeps train's index) would then pair
# rows by label and silently misalign whenever train is not indexed 0..n-1.
pred = pd.Series(data=gbm.predict(X), index=X.index, name='pred')

In [12]:
# Put the target and the in-sample prediction side by side as two columns.
pred_df = pd.concat([y, pred], axis='columns')

# Show the first rows whose actual revenue is positive.
has_revenue = pred_df['transactionRevenue'] > 0
pred_df.loc[has_revenue].head()

Unnamed: 0,transactionRevenue,pred
752,37860000.0,1792615.0
753,306670000.0,15922030.0
799,68030000.0,5635665.0
802,26250000.0,5907846.0
859,574150000.0,36728220.0


In [14]:
# Score the test set with the fitted grid search.
# Select exactly the training feature columns, in training order, instead of
# relying on test happening to share train's column layout after the drop.
# A feature missing from test now fails loudly with a KeyError.
pred_test = gbm.predict(test.drop(drop_cols, axis=1).loc[:, X.columns])

In [15]:
# Pair every test row's visitor id with its prediction.
# The column is already called 'PredictedLogRevenue', but at this point it still
# holds the model's output for the untransformed transactionRevenue target.
pred_test_df = test[['fullVisitorId']].assign(PredictedLogRevenue=pred_test)

In [116]:
# Sum the per-row predictions into one total per visitor (index = fullVisitorId).
submission = pred_test_df.groupby('fullVisitorId')['PredictedLogRevenue'].sum().to_frame()

# Floor negative totals at 0, then move to the log scale.
# Series.clip replaces the former chained-indexing assignment
# (frame[col][mask] = 0), which pandas may apply to a temporary copy: it emits
# SettingWithCopyWarning and does nothing under Copy-on-Write.
# np.log1p(x) computes log(x + 1).
submission['PredictedLogRevenue'] = np.log1p(submission['PredictedLogRevenue'].clip(lower=0))

# NOTE(review): sample_submission reads fullVisitorId as str; confirm test.pkl
# stores ids the same way so the ids written here match the expected format.
submission.to_csv(os.path.join(output_dir, 'submission_20181001.csv'))

In [115]:
# Preview the first five rows of the per-visitor submission frame.
submission.head(5)

Unnamed: 0_level_0,PredictedLogRevenue
fullVisitorId,Unnamed: 1_level_1
259678714014,13.041229
49363351866189,12.83303
53049821714864,11.734423
59488412965267,12.83303
85840370633780,11.734423


In [74]:
# For each test column, count the distinct values that appear in only one of
# train / test (size of the symmetric difference of the two value sets).
col_diff_dict = {
    col: len(set(train[col].unique()).symmetric_difference(set(test[col].unique())))
    for col in test.columns
}
col_diff_dict

{'channelGrouping': 0,
 'date': 638,
 'fullVisitorId': 1316051,
 'sessionId': 1706608,
 'visitId': 1665797,
 'visitNumber': 84,
 'visitStartTime': 1667423,
 'browser': 95,
 'deviceCategory': 0,
 'isMobile': 0,
 'operatingSystem': 6,
 'city': 531,
 'continent': 0,
 'country': 15,
 'metro': 43,
 'networkDomain': 30150,
 'region': 214,
 'subContinent': 0,
 'bounces': 0,
 'hits': 91,
 'newVisits': 0,
 'pageviews': 89,
 'adContent': 57,
 'adNetworkType': 1,
 'gclId': 58925,
 'isVideoAd': 0,
 'page': 4,
 'slot': 1,
 'campaign': 29,
 'isTrueDirect': 0,
 'keyword': 4710,
 'medium': 0,
 'referralPath': 2720,
 'source': 296}