In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline

from pandas.io.json import json_normalize
import json
import random
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report

In [2]:
def load_df(csv_path='data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    p = 0.1
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     skiprows=lambda i: i>0 and random.random() > p)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = ["{}.{}".format(column, subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print("Loaded {}. Shape: {}".format(os.path.basename(csv_path), df.shape))
    return df

In [3]:
train_data = load_df()
test_data = load_df('data/test.csv')

Loaded train.csv. Shape: (90048, 54)
Loaded test.csv. Shape: (80398, 53)


In [8]:
train_data.shape
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90048 entries, 0 to 90047
Data columns (total 54 columns):
channelGrouping                                      90048 non-null object
date                                                 90048 non-null int64
fullVisitorId                                        90048 non-null object
sessionId                                            90048 non-null object
socialEngagementType                                 90048 non-null object
visitId                                              90048 non-null int64
visitNumber                                          90048 non-null int64
visitStartTime                                       90048 non-null int64
device.browser                                       90048 non-null object
device.browserSize                                   90048 non-null object
device.browserVersion                                90048 non-null object
device.deviceCategory                                90048 non-null ob

In [9]:
shops_or_not = lambda x : x.total

In [10]:
train_data['fullVisitorId'].nunique()

85530

In [11]:
train_data["totals.transactionRevenue"] = train_data["totals.transactionRevenue"].astype('float')
y_clf = (train_data['totals.transactionRevenue'].fillna(0) > 0).astype(np.uint8)
y_reg = train_data['totals.transactionRevenue'].fillna(0)

In [12]:
y_reg.info()

AttributeError: 'Series' object has no attribute 'info'

In [16]:
y_clf.describe()
y_clf.value_counts()

0    88938
1     1110
Name: totals.transactionRevenue, dtype: int64

In [20]:
y_reg.describe()
# y_reg.value_counts()

count    9.004800e+04
mean     1.586129e+06
std      3.874210e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      6.826960e+09
Name: totals.transactionRevenue, dtype: float64

In [22]:
def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day
date_format(train_data)
date_format(test_data)

In [23]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80398 entries, 0 to 80397
Data columns (total 57 columns):
channelGrouping                                      80398 non-null object
date                                                 80398 non-null datetime64[ns]
fullVisitorId                                        80398 non-null object
sessionId                                            80398 non-null object
socialEngagementType                                 80398 non-null object
visitId                                              80398 non-null int64
visitNumber                                          80398 non-null int64
visitStartTime                                       80398 non-null int64
device.browser                                       80398 non-null object
device.browserSize                                   80398 non-null object
device.browserVersion                                80398 non-null object
device.deviceCategory                                80398 no

In [24]:
excluded_features = [
    'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 
    'visitId', 'visitStartTime', 'non_zero_proba', 'vis_date'
]

categorical_features = [
    _f for _f in train_data.columns
    if (_f not in excluded_features) & (train_data[_f].dtype == 'object')
]

In [25]:
if 'totals.transactionRevenue' in train_data.columns:
    del train_data['totals.transactionRevenue']

if 'totals.transactionRevenue' in test_data.columns:
    del test_data['totals.transactionRevenue']

In [26]:
for f in categorical_features:
    train_data[f], indexer = pd.factorize(train_data[f])
    test_data[f] = indexer.get_indexer(test_data[f])

In [42]:
import lightgbm as lgb
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss

folds = GroupKFold(n_splits=5)

train_features = [_f for _f in train_data.columns if _f not in excluded_features]
print(train_features)
oof_clf_preds = np.zeros(train_data.shape[0])
sub_clf_preds = np.zeros(test_data.shape[0])
for fold_, (trn_, val_) in enumerate(folds.split(y_clf, y_clf, groups=train_data['fullVisitorId'])):
    trn_x, trn_y = train_data[train_features].iloc[trn_], y_clf.iloc[trn_]
    val_x, val_y = train_data[train_features].iloc[val_], y_clf.iloc[val_]
    
    clf = lgb.LGBMClassifier(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=1000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=50,
        verbose=50
    )
    
    oof_clf_preds[val_] = clf.predict(val_x, num_iteration=clf.best_iteration_)[:]
    print(roc_auc_score(val_y, oof_clf_preds[val_]))
    sub_clf_preds += clf.predict_proba(test_data[train_features], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
roc_auc_score(y_clf, oof_clf_preds)

['channelGrouping', 'socialEngagementType', 'visitNumber', 'device.browser', 'device.browserSize', 'device.browserVersion', 'device.deviceCategory', 'device.flashVersion', 'device.isMobile', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystem', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.city', 'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.networkLocation', 'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews', 'totals.visits', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.adwordsClickInfo.gclId', 'trafficSour

  if diff:


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.034484
[100]	valid_0's binary_logloss: 0.032615
[150]	valid_0's binary_logloss: 0.0323188
[200]	valid_0's binary_logloss: 0.0324743
Early stopping, best iteration is:
[159]	valid_0's binary_logloss: 0.0322834


  if diff:


0.5764893456073178
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.0306835
[100]	valid_0's binary_logloss: 0.0285514
[150]	valid_0's binary_logloss: 0.028174
[200]	valid_0's binary_logloss: 0.0282323
Early stopping, best iteration is:
[154]	valid_0's binary_logloss: 0.0281618


  if diff:


0.5545862932427644
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.0321343
[100]	valid_0's binary_logloss: 0.0299342
[150]	valid_0's binary_logloss: 0.0299024
Early stopping, best iteration is:
[113]	valid_0's binary_logloss: 0.0297884
0.5641698712799712


  if diff:


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.0320731
[100]	valid_0's binary_logloss: 0.029862
[150]	valid_0's binary_logloss: 0.0295526
Early stopping, best iteration is:
[144]	valid_0's binary_logloss: 0.0295296
0.5800068847299499


  if diff:


0.5685373290716338

In [47]:
oof_clf_preds.shape

(90048,)

In [48]:
sum(oof_clf_preds)

302.0

In [51]:
print(classification_report(y_clf, oof_clf_preds))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99     88938
          1       0.51      0.14      0.22      1110

avg / total       0.98      0.99      0.98     90048



In [34]:
from scipy import stats

In [35]:
print(stats.describe(oof_clf_preds))

DescribeResult(nobs=90048, minmax=(0.00011981964195201018, 0.7890583085190511), mean=0.01207915160177811, variance=0.003526780826070587, skewness=6.568133342324496, kurtosis=48.59116723131517)


In [36]:
oof_clf_preds.min()

0.00011981964195201018

In [37]:
oof_clf_preds.max()

0.7890583085190511

In [38]:
from sklearn.metrics import precision_recall_curve

In [40]:
precision, recall, thresholds = precision_recall_curve(y_clf, oof_clf_preds)