# Ref: Kernels


https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields

https://www.kaggle.com/artgor/nn-baseline

https://www.kaggle.com/ogrellier/user-level-lightgbm-lb-1-4480

https://www.kaggle.com/dimitreoliveira/deep-learning-keras-ga-revenue-prediction


# setup and import

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

path = '../data/ggl/'
print(os.listdir(path))

['train.csv', 'train_parsed.csv', 'test.csv']


# Parse data

In [2]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, 
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df

In [None]:
%%time
df_train = load_df(path + 'train.csv')

In [3]:
%%time
df_test = load_df(path + 'test.csv')

CPU times: user 1min 35s, sys: 2.73 s, total: 1min 38s
Wall time: 1min 38s


# delete columns with no valid info

In [14]:
%%time
cols_to_drop = [col for col in df_train.columns if df_train[col].nunique() == 1]
df_train.drop(cols_to_drop, axis=1, inplace=True)
df_test.drop([col for col in cols_to_drop if col in df_test.columns], axis=1, inplace=True)


CPU times: user 3.33 s, sys: 35.9 ms, total: 3.37 s
Wall time: 3.11 s


# apply adjustments to both train and test datasets

In [31]:
def applyEdits(df):
    
    # fill NA's

    l_fillwithzero = ['adContent','keyword','adwordsClickInfo.adNetworkType','adwordsClickInfo.adNetworkType',
                      'adwordsClickInfo.gclId','adwordsClickInfo.page','adwordsClickInfo.slot']
    for x in l_fillwithzero:
        df[x] = df[x].fillna(0)

    # add / edit columns
    df['browser_category'] = df['browser'] + '_' + df['deviceCategory']
    df['browser_operatingSystem'] = df['browser'] + '_' + df['operatingSystem']
    df['source_country'] = df['source'] + '_' + df['country']
    
    # user access stat data column
    df['dummy'] = 1
    df['user_cumcnt_per_day'] = (df[['fullVisitorId','date', 'dummy']].groupby(['fullVisitorId','date'])['dummy'].cumcount()+1)
    df['user_sum_per_day'] = df[['fullVisitorId','date', 'dummy']].groupby(['fullVisitorId','date'])['dummy'].transform(sum)
    df['user_cumcnt_sum_ratio_per_day'] = df['user_cumcnt_per_day'] / df['user_sum_per_day'] 

    # fix and breakdown date info
    df['date'] = pd.to_datetime(df['date'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:]))
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df['weekofyear'] = df['date'].dt.weekofyear
    df['weekend'] = df['weekday'].apply(lambda x: 1 if x>=5 else 0)

    #convert to number
    l_toNum = ['hits','pageviews']
    for x in l_toNum:
        df[x] = df[x].values.astype(np.int64)

    #delete unecessary columns
    l_drop = ['dummy','referralPath','browser','deviceCategory','operatingSystem','source',
              'country','date']
    for x in l_drop:
        df.drop(x, axis=1, inplace=True)
        
    return df

In [32]:
%%time
df_train = applyEdits(df_train)

CPU times: user 8.89 s, sys: 1.55 s, total: 10.4 s
Wall time: 8.64 s


In [33]:
%%time
df_test = applyEdits(df_test)

CPU times: user 7.64 s, sys: 1.39 s, total: 9.03 s
Wall time: 7.31 s


# apply only for train dataset

In [37]:
# to float and log
df_train['transactionRevenue'] = df_train['transactionRevenue'].astype(float)
df_train['transactionRevenue'] = np.log1p(df_train['transactionRevenue'].fillna(0))

# category to number labels

In [74]:
no_use = ['fullVisitorId', 'sessionId', 'visitId', 'visitStartTime', 'transactionRevenue']

cat_cols = [x for x in df_train.columns if x not in no_use and type(df_train[x][0]) == str]

num_cols = [x for x in df_train.columns if x not in no_use and x not in cat_cols and type(df_train[x][0]) != str]


In [75]:
%%time
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(df_train[col].values.astype('str')) + list(df_test[col].values.astype('str')))
    df_train[col] = lbl.transform(list(df_train[col].values.astype('str')))
    df_test[col] = lbl.transform(list(df_test[col].values.astype('str')))


CPU times: user 1min 41s, sys: 22.6 s, total: 2min 4s
Wall time: 1min 46s


In [76]:
def toFloat(df):
    num_cols = [x for x in df.columns if type(df_train[x][0]) != 'str']
    for x in num_cols:
        df[x] = (df[x] * 1).astype(float)
    return df


In [77]:
%%time
df_train = toFloat(df_train)
df_test = toFloat(df_test)


CPU times: user 2.31 s, sys: 693 ms, total: 3 s
Wall time: 1.66 s


# resume point

In [78]:
%%time
df_train.to_csv(path + 'train_wip.csv')
df_test.to_csv(path + 'test_wip.csv')

CPU times: user 39.6 s, sys: 352 ms, total: 40 s
Wall time: 40.9 s


In [73]:
%%time

import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

path = '../data/ggl/'

df_train = pd.read_csv(path +'train_wip.csv', index_col = 0, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(path +'test_wip.csv', index_col = 0, dtype={'fullVisitorId': 'str'})

print(os.listdir(path))

['train_wip.csv', 'train.csv', 'test_wip.csv', 'test.csv']
CPU times: user 9.87 s, sys: 503 ms, total: 10.4 s
Wall time: 10 s


# create new dataset: aggregated by users

In [94]:

def aggregate_by_users(df, cat_cols):
    aggs = {
        'transactionRevenue': ['sum', 'size'],
        'hits': ['sum', 'min', 'max', 'mean', 'median'],
        'visitNumber': ['sum', 'min', 'max', 'mean', 'median'],
        'pageviews': ['sum', 'min', 'max', 'mean', 'median'],
        'weekend': ['mean'],
        'year': ['mean'],
        'isMobile': ['mean'],
    }

    for f in cat_cols + ['day', 'month', 'weekofyear']:
        aggs[f] = ['min', 'max', 'mean', 'median', 'var', 'std']

    users = df.groupby('fullVisitorId').agg(aggs)

    new_columns = [
        k + '_' + agg for k in aggs.keys() for agg in aggs[k]
    ]
    users.columns = new_columns

    return users

In [95]:
%%time
no_use = ['fullVisitorId', 'sessionId', 'visitId', 'visitStartTime']
cat_cols = [x for x in df_train.columns if x not in no_use and type(df_train[x][0]) == str]


df_user_train = aggregate_by_users(df_train, cat_cols)

df_test['transactionRevenue'] = 0
df_user_test = aggregate_by_users(df_test, cat_cols)



CPU times: user 7.03 s, sys: 2.66 s, total: 9.69 s
Wall time: 5.04 s


In [100]:
%%time
df_user_train = df_user_train.fillna(0)
df_user_test = df_user_test.fillna(0)

df_user_train.to_csv(path + 'user_train.csv')
df_user_test.to_csv(path + 'user_test.csv')

CPU times: user 30.7 s, sys: 569 ms, total: 31.3 s
Wall time: 31.2 s


# TODO：Normalizeバージョンを作る

In [None]:
normalized_features = ['visitNumber', 'hits', 'pageviews', 
                       'mean_hits_per_day', 'mean_pageviews_per_day', 
                       'sum_hits_per_day', 'sum_pageviews_per_day']

# Normalize using Min-Max scaling
scaler = preprocessing.MinMaxScaler()
X_train[normalized_features] = scaler.fit_transform(X_train[normalized_features])
X_val[normalized_features] = scaler.transform(X_val[normalized_features])
test[normalized_features] = scaler.transform(test[normalized_features])

# 以降　別ファイルで学習済みmodelからOutput作成作業用

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle

from keras.models import load_model


path = '../data/ggl/'
print(os.listdir(path))

Using TensorFlow backend.
  return f(*args, **kwds)


['model_NN.h5', 'user_train_wip.csv', 'output.csv', 'model_XGB.sav', 'user_test_wip.csv', 'train.csv', 'test_wip.csv', 'train_wip.csv', 'model_LGB.sav', 'test.csv', 'sample_submission.csv']


In [2]:
%%time
df_user_train = pd.read_csv(path +'user_train_wip.csv', index_col = 0, dtype={'fullVisitorId': 'str'})
df_user_test = pd.read_csv(path +'user_test_wip.csv', index_col = 0, dtype={'fullVisitorId': 'str'})

df_y = df_user_train['totals.transactionRevenue_sum']
df_x = df_user_train.drop(['date_min', 'date_max', 'totals.transactionRevenue_sum'], axis=1)
df_tgt = df_user_test.drop(['date_min', 'date_max', 'totals.transactionRevenue_sum'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, random_state=1)
x_eval, x_valid, y_eval, y_valid = train_test_split(x_test, y_test, random_state=1)

CPU times: user 16 s, sys: 1.56 s, total: 17.6 s
Wall time: 17.7 s


In [6]:
%%time
models = []

filename = 'model_LGB.sav'
model_LGB = pickle.load(open(path +filename, 'rb'))
models.append(model_LGB)

filename = 'model_XGB.sav'
model_XGB = pickle.load(open(path +filename, 'rb'))
models.append(model_XGB)

filename = 'model_NN.h5'
model_NN = load_model(path + filename)



CPU times: user 445 ms, sys: 20.2 ms, total: 465 ms
Wall time: 454 ms


In [9]:
%%time
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_valid_n = pd.DataFrame(sc.fit_transform(x_valid.fillna(0)),columns=x_valid.columns)
df_tgt_n = pd.DataFrame(sc.fit_transform(df_tgt.fillna(0)),columns=df_tgt.columns)

CPU times: user 1.28 s, sys: 1.34 s, total: 2.63 s
Wall time: 2.62 s


In [10]:
for model in models:
    y_pred = model.predict(x_valid.fillna(0))
    mse = mean_squared_error(y_valid, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)

1.85918849781
1.96902928766


In [11]:
y_pred = model_NN.predict(x_valid_n)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)
print(rmse)

2.22930468212


In [12]:
%%time
preds = []
for model in models:
    pred_tmp = model.predict(df_tgt)
    preds.append(pred_tmp)

CPU times: user 36.7 s, sys: 1.49 s, total: 38.2 s
Wall time: 11.8 s


In [14]:
pred_tmp = model_NN.predict(df_tgt_n)

array([[-0.09956587],
       [ 0.35228747],
       [ 0.29945278],
       ..., 
       [-0.27611756],
       [ 0.49377429],
       [ 0.10136279]], dtype=float32)

In [16]:
preds.append(pred_tmp)

In [17]:
%%time
df_preds = pd.DataFrame(preds).T

CPU times: user 29.6 s, sys: 189 ms, total: 29.8 s
Wall time: 29.8 s


In [22]:
pred_tgt = df_preds[df_preds.columns].sum(axis=1)/len(preds)
pred_tgt[pred_tgt<0] = 0
pred_tgt = pred_tgt.values.tolist()

In [23]:
df_tgt['PredictedLogRevenue'] = pred_tgt

In [24]:
df_tgt[['PredictedLogRevenue']].to_csv(path + "output.csv", index=True)