# Ref: Kernels


https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields

https://www.kaggle.com/artgor/nn-baseline

https://www.kaggle.com/ogrellier/user-level-lightgbm-lb-1-4480

# setup and import

In [29]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

path = '../data/ggl-rev-pred/'
print(os.listdir(path))

['train.csv', 'test_wip.csv', 'train_wip.csv', 'test.csv', 'sample_submission.csv']


# Load data

In [6]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, 
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [7]:
%%time
df_train_raw = load_df(path + 'train.csv')

Loaded train.csv. Shape: (903653, 55)
CPU times: user 1min 49s, sys: 3.27 s, total: 1min 52s
Wall time: 1min 53s


In [8]:
%%time
df_test_raw = load_df(path + 'test.csv')

Loaded test.csv. Shape: (804684, 53)
CPU times: user 1min 57s, sys: 14.8 s, total: 2min 12s
Wall time: 3min 50s


##  keep _raw for resuming point

In [14]:
%%time
df_train = df_train_raw.copy()

CPU times: user 1.42 s, sys: 3.8 s, total: 5.22 s
Wall time: 23.8 s


In [15]:
%%time
df_test = df_test_raw.copy()

CPU times: user 2.17 s, sys: 7.94 s, total: 10.1 s
Wall time: 1min 3s


# Clean up datasets

## apply adjustments to both train and test datasets

In [16]:
def applyEdits(df):
    # fill NA's
    df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True)
    df['trafficSource.isTrueDirect'].fillna(False, inplace=True)

    l_fillwithzero = ['totals.bounces','totals.newVisits','totals.pageviews',
    'trafficSource.adContent','trafficSource.keyword','trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.adNetworkType','trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.gclId','trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot']
    for x in l_fillwithzero:
        df[x] = df[x].fillna(0)

    # add / edit columns
    df['browser_category'] = df['device.browser'] + '_' + df['device.deviceCategory']
    df['browser_operatingSystem'] = df['device.browser'] + '_' + df['device.operatingSystem']
    df['source_country'] = df['trafficSource.source'] + '_' + df['geoNetwork.country']                                                                   

    
    # user access stat data column
    df['dummy'] = 1
    df['user_cumcnt_per_day'] = (df[['fullVisitorId','date', 'dummy']].groupby(['fullVisitorId','date'])['dummy'].cumcount()+1)
    df['user_sum_per_day'] = df[['fullVisitorId','date', 'dummy']].groupby(['fullVisitorId','date'])['dummy'].transform(sum)
    df['user_cumcnt_sum_ratio_per_day'] = df['user_cumcnt_per_day'] / df['user_sum_per_day'] 

    # fix and breakdown date info
    df['date'] = pd.to_datetime(df['date'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:]))
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df['weekofyear'] = df['date'].dt.weekofyear

    #convert to number
    l_toNum = [ 'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews',]
    for x in l_toNum:
        df[x] = df[x].values.astype(np.int64)

    l_bools = ['trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect' ]
    for x in l_bools:
        df[x] = df[x].values.astype(np.int64) * 1

    #delete unecessary columns
    l_drop = ['dummy','trafficSource.referralPath','device.browser','device.deviceCategory','device.operatingSystem','trafficSource.source',
             'geoNetwork.country']
    for x in l_drop:
        df.drop(x, axis=1, inplace=True)
        
    return df

In [17]:
%%time
df_train = applyEdits(df_train)

CPU times: user 25.2 s, sys: 5.92 s, total: 31.2 s
Wall time: 52.2 s


In [18]:
%%time
df_test = applyEdits(df_test)

CPU times: user 23.3 s, sys: 10.6 s, total: 33.9 s
Wall time: 1min 25s


## apply only for train dataset

In [19]:
# to float and log
df_train['totals.transactionRevenue'] = df_train['totals.transactionRevenue'].astype(float)
df_train['totals.transactionRevenue'] = np.log1p(df_train['totals.transactionRevenue'].fillna(0))

## delete columns with no valid info

In [20]:
%%time
cols_to_drop = [col for col in df_train.columns if df_train[col].nunique() == 1]
df_train.drop(cols_to_drop, axis=1, inplace=True)
df_test.drop([col for col in cols_to_drop if col in df_test.columns], axis=1, inplace=True)


CPU times: user 6.56 s, sys: 1.13 s, total: 7.69 s
Wall time: 11.7 s


## category to number labels

In [21]:
no_use = ['fullVisitorId', 'sessionId', 'visitId', 'visitStartTime', 'totals.transactionRevenue']

cat_cols = [x for x in df_train.columns if x not in no_use and type(df_train[x][0]) == str]

num_cols = [x for x in df_train.columns if x not in no_use and x not in cat_cols and type(df_train[x][0]) != str and type(df_train[x][0]) != bool]


In [22]:
%%time
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(df_train[col].values.astype('str')) + list(df_test[col].values.astype('str')))
    df_train[col] = lbl.transform(list(df_train[col].values.astype('str')))
    df_test[col] = lbl.transform(list(df_test[col].values.astype('str')))


CPU times: user 1min 33s, sys: 22.1 s, total: 1min 55s
Wall time: 3min 10s


## export editted CSVs

In [23]:
%%time
#export
df_train.to_csv(path +'train_wip.csv')
df_test.to_csv(path +'test_wip.csv')

CPU times: user 36.5 s, sys: 819 ms, total: 37.3 s
Wall time: 46.6 s


## save editted CSVs

In [25]:
%%time
df_train_forReset = df_train.copy()
df_test_forReset = df_test.copy()

CPU times: user 147 ms, sys: 88.7 ms, total: 236 ms
Wall time: 254 ms


## load editted CSVs

In [26]:
#df_train = df_train_forReset.copy()
#df_test = df_test_forReset.copy()

# create new dataset: aggregated by users

In [27]:

def aggregate_by_users(df, cat_cols):
    aggs = {
        'date': ['min', 'max'],
        'totals.transactionRevenue': ['sum', 'size'],
        'totals.hits': ['sum', 'min', 'max', 'mean', 'median'],
        'totals.pageviews': ['sum', 'min', 'max', 'mean', 'median'],
        'totals.bounces': ['sum', 'mean', 'median'],
        'totals.newVisits': ['sum', 'mean', 'median']
    }

    for f in cat_cols + ['weekday', 'day', 'month', 'weekofyear']:
        aggs[f] = ['min', 'max', 'mean', 'median', 'var', 'std']

    users = df.groupby('fullVisitorId').agg(aggs)

    new_columns = [
        k + '_' + agg for k in aggs.keys() for agg in aggs[k]
    ]
    users.columns = new_columns

    users['date_diff'] = (users.date_max - users.date_min).astype(np.int64) // (24 * 3600 * 1e9)
    
    return users

In [28]:
%%time
df_user_train = aggregate_by_users(df_train, cat_cols)

df_test['totals.transactionRevenue'] = 0
df_user_test = aggregate_by_users(df_test, cat_cols)



CPU times: user 35.8 s, sys: 18.1 s, total: 53.9 s
Wall time: 2min 42s


## export aggregated csv

In [30]:
%%time
df_user_train.to_csv(path +'user_train_wip.csv')
df_user_test.to_csv(path +'user_test_wip.csv')

CPU times: user 1min 40s, sys: 408 ms, total: 1min 41s
Wall time: 1min 43s


## Load saved CSV

In [31]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

path = '../data/ggl-rev-pred/'
print(os.listdir(path))

['user_train_wip.csv', 'user_test_wip.csv', 'train.csv', 'test_wip.csv', 'train_wip.csv', 'test.csv', 'sample_submission.csv']


In [None]:
%%time
df_user_train = pd.read_csv(path +'user_train_wip.csv', index_col = 0)
df_user_test = pd.read_csv(path +'user_test_wip.csv', index_col = 0)

# Light GB model

In [33]:
%%time
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df_y = df_user_train['totals.transactionRevenue_sum']
df_x = df_user_train.drop(['date_min', 'date_max', 'totals.transactionRevenue_sum'], axis=1)
df_tgt = df_user_test.drop(['date_min', 'date_max', 'totals.transactionRevenue_sum'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, random_state=1)
x_eval, x_valid, y_eval, y_valid = train_test_split(x_test, y_test, random_state=1)

lgb_params = {
    'learning_rate': 0.03,
    'n_estimators': 2000,
    'num_leaves': 128,
    'subsample': 0.2217,
    'colsample_bytree': 0.6810,
    'min_split_gain': np.power(10.0, -4.9380),
    'reg_alpha': np.power(10.0, -3.2454),
    'reg_lambda': np.power(10.0, -4.8571),
    'min_child_weight': np.power(10.0, 2),
    'silent': True
}

model = lgb.LGBMRegressor(**lgb_params)


CPU times: user 2.18 s, sys: 1.55 s, total: 3.73 s
Wall time: 8.81 s


In [34]:
%%time
model.fit(x_train, 
          y_train,
          eval_set=[(x_train, y_train),(x_eval, y_eval)],
          eval_metric='rmse',
          early_stopping_rounds=10)

[1]	training's rmse: 2.87483	training's l2: 8.26466	valid_1's rmse: 2.79533	valid_1's l2: 7.81385
Training until validation scores don't improve for 10 rounds.
[2]	training's rmse: 2.8429	training's l2: 8.08208	valid_1's rmse: 2.7649	valid_1's l2: 7.64466
[3]	training's rmse: 2.81263	training's l2: 7.9109	valid_1's rmse: 2.73627	valid_1's l2: 7.48716
[4]	training's rmse: 2.78349	training's l2: 7.74783	valid_1's rmse: 2.70816	valid_1's l2: 7.33414
[5]	training's rmse: 2.75584	training's l2: 7.59465	valid_1's rmse: 2.68194	valid_1's l2: 7.19278
[6]	training's rmse: 2.72903	training's l2: 7.44759	valid_1's rmse: 2.65643	valid_1's l2: 7.05663
[7]	training's rmse: 2.70391	training's l2: 7.31113	valid_1's rmse: 2.63275	valid_1's l2: 6.93137
[8]	training's rmse: 2.67988	training's l2: 7.18173	valid_1's rmse: 2.61013	valid_1's l2: 6.81276
[9]	training's rmse: 2.65692	training's l2: 7.05924	valid_1's rmse: 2.58924	valid_1's l2: 6.70417
[10]	training's rmse: 2.63533	training's l2: 6.94496	valid_

[84]	training's rmse: 2.11795	training's l2: 4.48571	valid_1's rmse: 2.14095	valid_1's l2: 4.58365
[85]	training's rmse: 2.11603	training's l2: 4.47757	valid_1's rmse: 2.1404	valid_1's l2: 4.5813
[86]	training's rmse: 2.1139	training's l2: 4.46858	valid_1's rmse: 2.13924	valid_1's l2: 4.57636
[87]	training's rmse: 2.11198	training's l2: 4.46046	valid_1's rmse: 2.13857	valid_1's l2: 4.57347
[88]	training's rmse: 2.11021	training's l2: 4.453	valid_1's rmse: 2.13784	valid_1's l2: 4.57036
[89]	training's rmse: 2.10769	training's l2: 4.44237	valid_1's rmse: 2.13611	valid_1's l2: 4.56299
[90]	training's rmse: 2.10592	training's l2: 4.43489	valid_1's rmse: 2.13548	valid_1's l2: 4.56029
[91]	training's rmse: 2.1041	training's l2: 4.42724	valid_1's rmse: 2.13505	valid_1's l2: 4.55842
[92]	training's rmse: 2.10217	training's l2: 4.41912	valid_1's rmse: 2.13427	valid_1's l2: 4.55509
[93]	training's rmse: 2.09967	training's l2: 4.40861	valid_1's rmse: 2.13277	valid_1's l2: 4.54869
[94]	training's 

[168]	training's rmse: 1.97809	training's l2: 3.91283	valid_1's rmse: 2.08449	valid_1's l2: 4.3451
[169]	training's rmse: 1.97726	training's l2: 3.90954	valid_1's rmse: 2.08458	valid_1's l2: 4.34549
[170]	training's rmse: 1.9758	training's l2: 3.90379	valid_1's rmse: 2.08393	valid_1's l2: 4.34275
[171]	training's rmse: 1.97496	training's l2: 3.90045	valid_1's rmse: 2.08376	valid_1's l2: 4.34206
[172]	training's rmse: 1.97412	training's l2: 3.89715	valid_1's rmse: 2.08368	valid_1's l2: 4.34172
[173]	training's rmse: 1.97333	training's l2: 3.89402	valid_1's rmse: 2.08354	valid_1's l2: 4.34115
[174]	training's rmse: 1.97249	training's l2: 3.89073	valid_1's rmse: 2.08339	valid_1's l2: 4.34051
[175]	training's rmse: 1.97164	training's l2: 3.88736	valid_1's rmse: 2.08348	valid_1's l2: 4.3409
[176]	training's rmse: 1.97068	training's l2: 3.88357	valid_1's rmse: 2.08339	valid_1's l2: 4.34053
[177]	training's rmse: 1.96986	training's l2: 3.88034	valid_1's rmse: 2.08338	valid_1's l2: 4.34049
[17

[251]	training's rmse: 1.90278	training's l2: 3.62059	valid_1's rmse: 2.07235	valid_1's l2: 4.29464
[252]	training's rmse: 1.90218	training's l2: 3.61827	valid_1's rmse: 2.07242	valid_1's l2: 4.29491
[253]	training's rmse: 1.9014	training's l2: 3.61531	valid_1's rmse: 2.0723	valid_1's l2: 4.29442
[254]	training's rmse: 1.90087	training's l2: 3.6133	valid_1's rmse: 2.07234	valid_1's l2: 4.2946
[255]	training's rmse: 1.89991	training's l2: 3.60967	valid_1's rmse: 2.07155	valid_1's l2: 4.29131
[256]	training's rmse: 1.89872	training's l2: 3.60515	valid_1's rmse: 2.07125	valid_1's l2: 4.29008
[257]	training's rmse: 1.89797	training's l2: 3.60228	valid_1's rmse: 2.07107	valid_1's l2: 4.28933
[258]	training's rmse: 1.89741	training's l2: 3.60018	valid_1's rmse: 2.07113	valid_1's l2: 4.28958
[259]	training's rmse: 1.89677	training's l2: 3.59772	valid_1's rmse: 2.07121	valid_1's l2: 4.2899
[260]	training's rmse: 1.89623	training's l2: 3.5957	valid_1's rmse: 2.07122	valid_1's l2: 4.28997
[261]	

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.681,
       importance_type='split', learning_rate=0.03, max_depth=-1,
       min_child_samples=20, min_child_weight=100.0,
       min_split_gain=1.1534532578210929e-05, n_estimators=2000, n_jobs=-1,
       num_leaves=128, objective=None, random_state=None,
       reg_alpha=0.00056832923913833827, reg_lambda=1.3896326196366016e-05,
       silent=True, subsample=0.2217, subsample_for_bin=200000,
       subsample_freq=0)

## モデル保存

In [35]:
%%time
import pickle
filename = 'lgb_model.sav'
pickle.dump(model, open(path +filename, 'wb'))

CPU times: user 423 ms, sys: 4.12 ms, total: 427 ms
Wall time: 144 ms


## モデル読み込み

In [None]:
import pickle
filename = 'lgb_model.sav'
model = pickle.load(open(path +filename, 'rb'))

## モデル評価

In [36]:
y_pred = model.predict(x_valid)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)
print(rmse)

2.10716869906


# 予測Outputファイル作成

In [37]:
%%time
pred_tgt = model.predict(df_tgt)

CPU times: user 24.5 s, sys: 1.27 s, total: 25.7 s
Wall time: 9.25 s


In [38]:
pred_tgt[pred_tgt<0] = 0

In [40]:
df_tgt['PredictedLogRevenue'] = pred_tgt
df_tgt[['PredictedLogRevenue']].to_csv(path + "output.csv", index=True)