In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline

from pandas.io.json import json_normalize
import json
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_df(csv_path='data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    p = 0.1
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     skiprows=lambda i: i>0 and random.random() > p)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = ["{}.{}".format(column, subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {}. Shape: {}".format(os.path.basename(csv_path), df.shape))
    return df

In [3]:
train_data = load_df()
test_data = load_df('data/test.csv')

Loaded train.csv. Shape: (90059, 54)
Loaded test.csv. Shape: (80313, 53)


In [4]:
train_data.shape

(90059, 54)

In [5]:
shops_or_not = lambda x : x.total

In [6]:
train_data['fullVisitorId'].nunique()

85579

In [7]:
train_data["totals.transactionRevenue"] = train_data["totals.transactionRevenue"].astype('float')
y_clf = (train_data['totals.transactionRevenue'].fillna(0) > 0).astype(np.uint8)
y_reg = train_data['totals.transactionRevenue'].fillna(0)

In [8]:
y_reg.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: totals.transactionRevenue, dtype: float64

In [9]:
y_clf.head()

0    0
1    0
2    0
3    0
4    0
Name: totals.transactionRevenue, dtype: uint8

In [10]:
y_reg.max()

17855500000.0

In [11]:
y_clf.mean()

0.012291941949166657

In [12]:
y_clf.max()

1

In [13]:
y_reg.max()

17855500000.0

In [14]:
def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day
date_format(train_data)
date_format(test_data)

In [15]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80313 entries, 0 to 80312
Data columns (total 57 columns):
channelGrouping                                      80313 non-null object
date                                                 80313 non-null datetime64[ns]
fullVisitorId                                        80313 non-null object
sessionId                                            80313 non-null object
socialEngagementType                                 80313 non-null object
visitId                                              80313 non-null int64
visitNumber                                          80313 non-null int64
visitStartTime                                       80313 non-null int64
device.browser                                       80313 non-null object
device.browserSize                                   80313 non-null object
device.browserVersion                                80313 non-null object
device.deviceCategory                                80313 no

In [16]:
excluded_features = [
    'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 
    'visitId', 'visitStartTime', 'non_zero_proba', 'vis_date'
]

categorical_features = [
    _f for _f in train_data.columns
    if (_f not in excluded_features) & (train_data[_f].dtype == 'object')
]

In [17]:
if 'totals.transactionRevenue' in train_data.columns:
    del train_data['totals.transactionRevenue']

if 'totals.transactionRevenue' in test_data.columns:
    del test_data['totals.transactionRevenue']

In [18]:
for f in categorical_features:
    train_data[f], indexer = pd.factorize(train_data[f])
    test_data[f] = indexer.get_indexer(test_data[f])

In [26]:
import lightgbm as lgb
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss

folds = GroupKFold(n_splits=5)

train_features = [_f for _f in train_data.columns if _f not in excluded_features]
print(train_features)
oof_clf_preds = np.zeros(train_data.shape[0])
sub_clf_preds = np.zeros(test_data.shape[0])
for fold_, (trn_, val_) in enumerate(folds.split(y_clf, y_clf, groups=train_data['fullVisitorId'])):
    trn_x, trn_y = train_data[train_features].iloc[trn_], y_clf.iloc[trn_]
    val_x, val_y = train_data[train_features].iloc[val_], y_clf.iloc[val_]
    
    clf = lgb.LGBMClassifier(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=1000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=50,
        verbose=50
    )
    
    oof_clf_preds[val_] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    print(roc_auc_score(val_y, oof_clf_preds[val_]))
    sub_clf_preds += clf.predict_proba(test_data[train_features], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
roc_auc_score(y_clf, oof_clf_preds)

['channelGrouping', 'socialEngagementType', 'visitNumber', 'device.browser', 'device.browserSize', 'device.browserVersion', 'device.deviceCategory', 'device.flashVersion', 'device.isMobile', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystem', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.city', 'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.networkLocation', 'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews', 'totals.visits', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.adwordsClickInfo.gclId', 'trafficSour

0.9848648871902576