## Load and test data submission

In [72]:
#IMPORTING REQUIRED LIBRARIES
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
import time
from datetime import datetime

import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import gc
gc.enable()


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [32]:
def load_df(csv_path='./input/train/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

print(os.listdir("./input/"))

['sample_submission.csv.zip', 'test', 'train']


In [53]:
%%time
train_df = load_df()
test_df = load_df("./input/test/test.csv")

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)
Wall time: 11min 58s


In [64]:
target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
target = target.apply(lambda x: np.log1p(x))
del train_df['totals.transactionRevenue']

columns_to_remove = [col for col in train_df.columns if train_df[col].nunique() == 1]
print("Nb. of variables with unique value: {}".format(len(columns_to_remove)))

for col in columns_to_remove:
    if set(['not available in demo dataset']) ==  set(train_df[col].unique()): continue
    print(col, train_df[col].dtypes, train_df[col].unique())
    
train_df['totals.bounces'] = train_df['totals.bounces'].fillna('0')
test_df['totals.bounces'] = test_df['totals.bounces'].fillna('0')

train_df['totals.newVisits'] = train_df['totals.newVisits'].fillna('0')
test_df['totals.newVisits'] = test_df['totals.newVisits'].fillna('0')

train_df['trafficSource.adwordsClickInfo.isVideoAd'] = train_df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)
test_df['trafficSource.adwordsClickInfo.isVideoAd'] = test_df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)

train_df['trafficSource.isTrueDirect'] = train_df['trafficSource.isTrueDirect'].fillna(False)
test_df['trafficSource.isTrueDirect'] = test_df['trafficSource.isTrueDirect'].fillna(False)

columns = [col for col in train_df.columns if train_df[col].nunique() > 1]

train_df = train_df[columns]
test_df = test_df[columns]

trn_len = train_df.shape[0]
merged_df = pd.concat([train_df, test_df])


KeyError: 'totals.transactionRevenue'

## Feature Engineering

In [63]:
merged_df['diff_visitId_time'] = merged_df['visitId'] - merged_df['visitStartTime']
merged_df['diff_visitId_time'] = (merged_df['diff_visitId_time'] != 0).astype(int)
del merged_df['visitId']
del merged_df['sessionId']


KeyError: 'visitId'

In [65]:
## DATES

format_str = '%Y%m%d' 
merged_df['formated_date'] = merged_df['date'].apply(lambda x: datetime.strptime(str(x), format_str))
merged_df['month'] = merged_df['formated_date'].apply(lambda x:x.month)
merged_df['quarter_month'] = merged_df['formated_date'].apply(lambda x:x.day//8)
merged_df['day'] = merged_df['formated_date'].apply(lambda x:x.day)
merged_df['weekday'] = merged_df['formated_date'].apply(lambda x:x.weekday())

del merged_df['date']
del merged_df['formated_date']

KeyError: 'date'

In [62]:
merged_df.columns

Index(['channelGrouping', 'fullVisitorId', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.deviceCategory', 'device.isMobile',
       'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent',
       'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.networkDomain',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.isVideoAd',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source', 'diff_visitId_time', 'month', 'quarter_month',
       'weekday', 'mean_hits_per_day'],
      dtype='object')

In [59]:
merged_df['totals.hits'] = merged_df['totals.hits'].astype(int)
merged_df['mean_hits_per_day'] = merged_df.groupby(['day'])['totals.hits'].transform('mean')
del  merged_df['day']

merged_df['formated_visitStartTime'] = merged_df['visitStartTime'].apply(
    lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x)))
merged_df['formated_visitStartTime'] = pd.to_datetime(merged_df['formated_visitStartTime'])
merged_df['visit_hour'] = merged_df['formated_visitStartTime'].apply(lambda x: x.hour)

del merged_df['visitStartTime']
del merged_df['formated_visitStartTime']


KeyError: 'day'

In [66]:
## ENCODE CATEGORICAL VALUES

for col in merged_df.columns:
    if col in ['fullVisitorId', 'month', 'quarter_month', 'weekday', 'visit_hour', 'WoY']: continue
    if merged_df[col].dtypes == object or merged_df[col].dtypes == bool:
        merged_df[col], indexer = pd.factorize(merged_df[col])
numerics = [col for col in merged_df.columns if 'totals.' in col]
numerics += ['visitNumber', 'mean_hits_per_day', 'fullVisitorId']
categorical_feats =  [col for col in merged_df.columns if col not in numerics]
for col in categorical_feats:
    merged_df[col] = merged_df[col].astype(int)
#merged_df['fullVisitorId'] = merged_df['fullVisitorId'].astype(float)
train_df = merged_df[:trn_len]
test_df = merged_df[trn_len:]

## Train

LGBM

In [68]:
param = {'num_leaves': 300,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 1,
         "verbosity": -1}
         
trn_cols = [col for col in train_df.columns if col not in ['fullVisitorId']]


In [73]:
## CV
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
start = time.time()
features = list(train_df[trn_cols].columns)
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][trn_cols], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train_df.iloc[val_idx][trn_cols], label=target.iloc[val_idx], categorical_feature=categorical_feats)
    
    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][trn_cols], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[trn_cols], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.76495	valid_1's rmse: 1.77149
[200]	training's rmse: 1.63449	valid_1's rmse: 1.67787
[300]	training's rmse: 1.55936	valid_1's rmse: 1.63867
[400]	training's rmse: 1.50931	valid_1's rmse: 1.622
[500]	training's rmse: 1.47129	valid_1's rmse: 1.61593
[600]	training's rmse: 1.4397	valid_1's rmse: 1.61362
[700]	training's rmse: 1.41213	valid_1's rmse: 1.61286
Early stopping, best iteration is:
[680]	training's rmse: 1.41725	valid_1's rmse: 1.61275
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.76497	valid_1's rmse: 1.76302
[200]	training's rmse: 1.63377	valid_1's rmse: 1.6729
[300]	training's rmse: 1.55909	valid_1's rmse: 1.63669
[400]	training's rmse: 1.50886	valid_1's rmse: 1.62274
[500]	training's rmse: 1.47065	valid_1's rmse: 1.61762
[600]	training's rmse: 1.43923	valid_1's rmse: 1.61629
[700]	training's rmse: 1.41142	valid_1's rmse: 1.61548
[800]	training's r

## Submit

In [74]:
submission = test_df[['fullVisitorId']].copy()
submission.loc[:, 'PredictedLogRevenue'] = np.expm1(predictions)
grouped_test = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
grouped_test["PredictedLogRevenue"] = np.log1p(grouped_test["PredictedLogRevenue"])
grouped_test.to_csv('submit.csv',index=False)
