In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline

from pandas.io.json import json_normalize
import json
import random
import warnings
warnings.filterwarnings("ignore")

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def load_df(csv_path='data/train.csv', nrows=None, sample_frac=0.1, seed=None):
    """Load a random sample of a CSV export and flatten its JSON columns.

    Each data row is kept with probability ``sample_frac``; the header row is
    always kept. The four JSON-encoded columns are parsed and expanded into
    flat ``"<column>.<subcolumn>"`` columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Maximum number of (sampled) rows to read; ``None`` reads to the end
        of the file. This argument used to be silently ignored.
    sample_frac : float
        Probability of keeping each data row. ``1.0`` keeps every row.
    seed : int or None
        Seed for the row sampler. ``None`` draws from the module-level
        ``random`` generator (the historical, non-reproducible behaviour).

    Returns
    -------
    pandas.DataFrame
        The sampled frame with the JSON columns replaced by their flattened
        sub-columns.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # A private generator makes seeded runs reproducible without disturbing
    # the global random state that other cells may rely on.
    rng = random if seed is None else random.Random(seed)

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     # Important!! Read the ID as text so it is never parsed
                     # as a number.
                     dtype={'fullVisitorId': 'str'},
                     # i == 0 is the header line and must never be skipped.
                     skiprows=lambda i: i > 0 and rng.random() > sample_frac,
                     nrows=nrows)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = ["{}.{}".format(column, subcolumn) for subcolumn in column_as_df.columns]
        # Both frames carry the same positional index, so an index merge
        # lines the flattened values up with their source rows.
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {}. Shape: {}".format(os.path.basename(csv_path), df.shape))
    return df

In [3]:
# Each call draws its own random sample of rows inside load_df, so the
# shapes printed below change from run to run.
train_data = load_df()
test_data = load_df('data/test.csv')

Loaded train.csv. Shape: (90184, 54)
Loaded test.csv. Shape: (80185, 53)


In [4]:
# (rows, columns) of the sampled, flattened training frame.
train_data.shape

(90184, 54)

In [5]:
# NOTE(review): returns the `total` attribute of its argument and is not
# referenced anywhere in the visible part of the notebook. Looks like an
# unfinished helper — confirm the intended attribute or remove the cell.
shops_or_not = lambda x : x.total

In [6]:
# Number of distinct visitor IDs in the training sample.
train_data['fullVisitorId'].nunique()

85740

In [7]:
# Cast revenue to float in place (presumably it arrives as text from the
# flattened JSON — TODO confirm), then derive both targets from one
# zero-filled series.
train_data["totals.transactionRevenue"] = train_data["totals.transactionRevenue"].astype('float')

# Regression target: revenue per row, with missing revenue counted as 0.
y_reg = train_data['totals.transactionRevenue'].fillna(0)

# Classification target: 1 where the row has positive revenue, else 0.
y_clf = y_reg.gt(0).astype(np.uint8)

In [8]:
# Peek at the first regression targets.
y_reg.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: totals.transactionRevenue, dtype: float64

In [9]:
# Peek at the first 0/1 classification targets.
y_clf.head()

0    0
1    0
2    0
3    0
4    0
Name: totals.transactionRevenue, dtype: uint8

In [10]:
# Largest revenue value in the training sample.
y_reg.max()

5498000000.0

In [11]:
# Mean of the 0/1 indicator = share of rows with positive revenue.
y_clf.mean()

0.012274904639403885

In [12]:
# A max of 1 confirms the sample contains at least one positive-revenue row.
y_clf.max()

1

In [13]:
# NOTE(review): repeats the earlier y_reg.max() cell; candidate for removal.
y_reg.max()

5498000000.0

In [14]:
def date_format(df):
    """Parse the date columns of ``df`` in place and add session-time features.

    ``date`` is parsed as a ``YYYYMMDD`` value and ``visitStartTime`` as POSIX
    seconds (per the Google Analytics export schema). Without an explicit
    format/unit, ``pd.to_datetime`` reads integers as nanoseconds since the
    epoch, which places every row on 1970-01-01 and makes the derived
    features constant.

    Columns written: ``date``, ``vis_date``, ``sess_date_dow`` (Monday=0),
    ``sess_date_hours`` and ``sess_date_dom``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date`` and ``visitStartTime`` columns. Modified in place.

    Returns
    -------
    None
    """
    # Skip the parse when 'date' is already datetime so the cell can be
    # re-run on the same frame without raising.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        # astype(str) covers both the int and the string spelling of YYYYMMDD.
        df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')
    df['vis_date'] = pd.to_datetime(df['visitStartTime'], unit='s')
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day
# date_format mutates its argument and returns None, so both frames gain
# the date-derived columns in place.
date_format(train_data)
date_format(test_data)

In [15]:
# Column dtypes and non-null counts after the date-derived columns were added.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null object
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null object
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null object
device.browserSize                                   80185 non-null object
device.browserVersion                                80185 non-null object
device.deviceCategory                                80185 no

In [16]:
# Identifier, raw-time and target columns that must never be used as
# model inputs.
excluded_features = [
    'date',
    'fullVisitorId',
    'sessionId',
    'totals.transactionRevenue',
    'visitId',
    'visitStartTime',
    'non_zero_proba',
    'vis_date',
]

# Every remaining object-dtype column is treated as categorical.
# NOTE(review): this also picks up numeric-looking string columns such as
# 'totals.hits' and 'totals.pageviews' (they appear in the saved feature
# list) — confirm that encoding them as categories is intended.
categorical_features = [
    col for col, col_dtype in train_data.dtypes.items()
    if col not in excluded_features and col_dtype == 'object'
]

In [17]:
# Remove the target column from both frames in place so it cannot leak
# into the features; errors='ignore' makes this a no-op when the column
# is absent.
train_data.drop('totals.transactionRevenue', axis=1, inplace=True, errors='ignore')
test_data.drop('totals.transactionRevenue', axis=1, inplace=True, errors='ignore')

In [18]:
# Integer-encode every categorical column: the label -> code mapping is
# learned on train and then applied unchanged to test.
# Per the pandas docs, factorize codes missing values as -1 and
# Index.get_indexer returns -1 for labels it does not contain, so NaN and
# test-only categories all end up sharing the code -1.
# NOTE(review): the saved output under this cell (a printed feature list and
# repeated info() dumps) is not produced by this code — it looks stale;
# re-run and clear it.
for f in categorical_features:
    train_data[f], indexer = pd.factorize(train_data[f])
    test_data[f] = indexer.get_indexer(test_data[f])

['channelGrouping', 'socialEngagementType', 'device.browser', 'device.browserSize', 'device.browserVersion', 'device.deviceCategory', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystem', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.city', 'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.networkLocation', 'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews', 'totals.visits', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.adwordsClickInfo.gclId', 'trafficSource.adwordsClickInfo.isVideoAd', 't

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null object
device.browserSize                                   80185 non-null object
device.browserVersion                                80185 non-null object
device.deviceCategory                                80185 non-

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null object
device.deviceCategory                                80185 non-nu

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80185 entries, 0 to 80184
Data columns (total 57 columns):
channelGrouping                                      80185 non-null int64
date                                                 80185 non-null datetime64[ns]
fullVisitorId                                        80185 non-null object
sessionId                                            80185 non-null object
socialEngagementType                                 80185 non-null int64
visitId                                              80185 non-null int64
visitNumber                                          80185 non-null int64
visitStartTime                                       80185 non-null int64
device.browser                                       80185 non-null int64
device.browserSize                                   80185 non-null int64
device.browserVersion                                80185 non-null int64
device.deviceCategory                                80185 non-nul