In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb

In [2]:
# Load the pre-parsed train and test frames from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted step of this project.
df_train_full, df_test_full = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_parsed_nojson.pickle', 'test_parsed_nojson.pickle')
)

In [3]:
# Distinct visitor ids that appear in the test frame.
visitor_set = set(df_test_full['fullVisitorId'])

In [6]:
# Peek at one element to see how visitor ids are represented.
# Set iteration order is arbitrary; `break` stops after the first element.
for v in visitor_set:
    print(v)
    print(type(v))
    break

6454274553732454521
<class 'str'>


In [7]:
# Number of distinct fullVisitorId values in the test frame.
len(visitor_set)

296530

In [8]:
# Sorted copy of the visitor ids; show the ten smallest.
visitor_list = sorted(visitor_set)
visitor_list[:10]

['0000018966949534117',
 '0000039738481224681',
 '0000073585230191399',
 '0000087588448856385',
 '0000149787903119437',
 '0000196310838896290',
 '00001995526696366',
 '0000255704530917106',
 '0000268499301061358',
 '0000276747989270229']

In [12]:
# Boolean flag per test row: True when the transaction revenue exceeds 0.01.
# Missing revenue is treated as 0.0 and therefore flagged False.
df_has_revenue = (
    df_test_full['totals_transactionRevenue']
    .astype(np.float64)
    .fillna(0.0)
    .gt(0.01)
)

In [20]:
# Length of the revenue flag series (one entry per row of df_test_full);
# the int32 cast has no effect on the length.
len(df_has_revenue.astype(np.int32))

401589

In [22]:
# Attach the flag as a 0/1 int32 column named `has_revenue`.
# `assign` returns a new frame, which is rebound to `df_test_full`.
df_test_full = df_test_full.assign(has_revenue=df_has_revenue.astype(np.int32))

In [23]:
# Preview the first five rows, transposed so the columns read vertically.
df_test_full.head().T

Unnamed: 0,0,1,2,3,4
channelGrouping,Organic Search,Direct,Organic Search,Direct,Organic Search
date,20180511,20180511,20180511,20180511,20180511
fullVisitorId,7460955084541987166,460252456180441002,3461808543879602873,975129477712150630,8381672768065729990
socialEngagementType,Not Socially Engaged,Not Socially Engaged,Not Socially Engaged,Not Socially Engaged,Not Socially Engaged
visitId,1526099341,1526064483,1526067157,1526107551,1526060254
visitNumber,2,166,2,4,1
visitStartTime,1526099341,1526064483,1526067157,1526107551,1526060254
rowId,0,1,2,3,4
device_browser,Chrome,Chrome,Chrome,Chrome,Internet Explorer
device_browserVersion,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset


In [24]:
# Two-column working frame: visitor id plus the 0/1 revenue flag, renamed
# to reflect that it is about to be summed into a count.
df = (
    df_test_full[['fullVisitorId', 'has_revenue']]
    .rename(columns={'has_revenue': 'has_revenue_count'})
)
# One row per visitor: number of sessions flagged as having revenue.
df_revcount = df.groupby('fullVisitorId').sum()

In [34]:
import seaborn as sns

# "Power users": visitors with at least two revenue-generating sessions.
df_power_users = df_revcount.loc[df_revcount['has_revenue_count'] >= 2]

In [35]:
# Preview the first few power users and their revenue-session counts.
df_power_users.head()

Unnamed: 0_level_0,has_revenue_count
fullVisitorId,Unnamed: 1_level_1
10364132187346780,4
135135903467220901,2
24858640770880554,2
86996858439022801,3
136175134016205768,2


In [39]:
# Visitor ids of the power users (the groupby index), kept as a set for
# the membership tests in later cells.
power_user_set = set(df_power_users.index)

In [42]:
# All test sessions that belong to a power user.
df_test_power_users = df_test_full.loc[
    df_test_full['fullVisitorId'].isin(power_user_set)
]

In [48]:
# Power-user sessions in the five-month window 2018-05-01 .. 2018-09-30.
# Assumes `date` holds yyyymmdd integers, as the original integer comparison
# did. The upper bound used to be 20180931, which is not a calendar date
# (September has 30 days); `between` is inclusive on both ends.
df_test_power_users_5m = df_test_power_users[
    df_test_power_users.date.between(20180501, 20180930)]

In [70]:
# Power-user sessions in the four-month window 2018-06-01 .. 2018-09-30.
# Assumes `date` holds yyyymmdd integers, as the original integer comparison
# did. The upper bound used to be 20180931, which is not a calendar date
# (September has 30 days); `between` is inclusive on both ends.
df_test_power_users_4m = df_test_power_users[
    df_test_power_users.date.between(20180601, 20180930)]

In [49]:
# Per-visitor revenue over the five-month window, with missing revenue
# counted as 0.0, then scaled by 2/5 (two months' worth of a five-month total).
df = (
    df_test_power_users_5m[['fullVisitorId', 'totals_transactionRevenue']]
    .rename(columns={'totals_transactionRevenue': 'revenue'})
    .astype({'revenue': np.float64})
    .fillna({'revenue': 0.0})
    .groupby('fullVisitorId')
    .sum()
    .mul(2. / 5.)
)
df.head()

Unnamed: 0_level_0,revenue
fullVisitorId,Unnamed: 1_level_1
10364132187346780,518160000.0
135135903467220901,0.0
24858640770880554,126976000.0
86996858439022801,36460000.0
136175134016205768,146616000.0


In [50]:
# Convert the scaled per-visitor revenue to log(1 + x).
# NOTE(review): `df` is rebound to its own transform, so re-running this cell
# without first re-running the cell that builds `df` applies log1p twice.
df = np.log1p(df)
df.head()

Unnamed: 0_level_0,revenue
fullVisitorId,Unnamed: 1_level_1
10364132187346780,20.065795
135135903467220901,0.0
24858640770880554,18.659509
86996858439022801,17.411726
136175134016205768,18.803327


In [53]:
# Read the sample submission, keeping fullVisitorId as Python objects
# (strings) rather than letting pandas parse the ids as numbers.
# `np.object` was only an alias for the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
df_ss = pd.read_csv('sample_submission_v2.csv.zip', dtype={'fullVisitorId': object})

In [55]:
# Number of rows in the sample submission.
len(df_ss)

296530

In [56]:
# Left-join the log revenue estimates onto the sample submission, so every
# submission row is kept; visitors without an estimate get NaN in `revenue`.
df_submit = pd.merge(df_ss, df, how='left', on='fullVisitorId')
df_submit.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue,revenue
0,18966949534117,0.0,
1,39738481224681,0.0,
2,73585230191399,0.0,
3,87588448856385,0.0,
4,149787903119437,0.0,


In [58]:
# Preview only the submission rows that received a revenue estimate.
df_submit.dropna(subset=['revenue']).head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue,revenue
319,10364132187346780,0.0,20.065795
404,135135903467220901,0.0,0.0
762,24858640770880554,0.0,18.659509
2530,86996858439022801,0.0,17.411726
3989,136175134016205768,0.0,18.803327


In [59]:
# Use the merged estimate as the prediction; visitors without one predict 0.0.
df_submit['PredictedLogRevenue'] = df_submit['revenue'].fillna(0.0)
df_submit.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue,revenue
0,18966949534117,0.0,
1,39738481224681,0.0,
2,73585230191399,0.0,
3,87588448856385,0.0,
4,149787903119437,0.0,


In [62]:
# The helper column has been copied into PredictedLogRevenue; remove it.
df_submit = df_submit.drop(labels=['revenue'], axis='columns')

In [63]:
# Sanity check: total predicted log revenue across power users only.
df_submit.loc[
    df_submit['fullVisitorId'].isin(power_user_set), 'PredictedLogRevenue'
].sum()

5570.549252891604

In [64]:
# Sanity check: total predicted log revenue across all submission rows.
df_submit.PredictedLogRevenue.sum()

5570.549252891602

In [67]:
# Sanity check: total predicted log revenue for everyone who is not a power user.
df_submit.loc[
    ~df_submit['fullVisitorId'].isin(power_user_set), 'PredictedLogRevenue'
].sum()

0.0

In [69]:
# Write the submission file, omitting the DataFrame index.
df_submit.to_csv('submission2_adhoc.csv', index=False)

In [72]:
# Power-user sessions in the three-month window 2018-07-01 .. 2018-09-30.
# Assumes `date` holds yyyymmdd integers, as the original integer comparison
# did. The upper bound used to be 20180931, which is not a calendar date
# (September has 30 days); `between` is inclusive on both ends.
df_test_power_users_3m = df_test_power_users[
    df_test_power_users.date.between(20180701, 20180930)]

In [73]:
# Per-visitor revenue over the three-month window, with missing revenue
# counted as 0.0, scaled by 2/3 and then converted to log(1 + x).
# NOTE(review): the saved output under this cell shows values in the hundreds
# of millions, so it appears to predate the log1p step; re-run to refresh it.
df_revenue_3m = np.log1p(
    df_test_power_users_3m[['fullVisitorId', 'totals_transactionRevenue']]
    .rename(columns={'totals_transactionRevenue': 'revenue'})
    .astype({'revenue': np.float64})
    .fillna({'revenue': 0.0})
    .groupby('fullVisitorId')
    .sum()
    .mul(2. / 3.)
)
df_revenue_3m.head()

Unnamed: 0_level_0,revenue
fullVisitorId,Unnamed: 1_level_1
10364132187346780,863600000.0
135135903467220901,0.0
24858640770880554,211626700.0
86996858439022801,11726670.0
136175134016205768,172706700.0


In [80]:
# Build the second submission from the three-month estimates: left-join onto
# the sample submission, predict 0.0 where no estimate exists, and drop the
# helper column.
df_submit2 = (
    pd.merge(df_ss, df_revenue_3m, how='left', on='fullVisitorId')
    .assign(PredictedLogRevenue=lambda merged: merged['revenue'].fillna(0.0))
    .drop(columns=['revenue'])
)
df_submit2.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,18966949534117,0.0
1,39738481224681,0.0
2,73585230191399,0.0
3,87588448856385,0.0
4,149787903119437,0.0


In [90]:
# Write the second submission file, omitting the DataFrame index.
df_submit2.to_csv('submission3_adhoc.csv', index=False)

In [81]:
# Row count of the second submission, for comparison with the sample submission.
len(df_submit2)

296530

In [82]:
# Number of power users (visitors with two or more revenue sessions).
len(power_user_set)

318

In [89]:
# Distribution of revenue-session counts across visitors.
# df_revcount already holds one row per fullVisitorId (it is the result of
# groupby('fullVisitorId').sum()), so grouping and summing again was redundant.
df_revcount.has_revenue_count.value_counts()

0     292374
1       3838
2        254
3         39
4         16
5          4
14         1
10         1
9          1
8          1
6          1
Name: has_revenue_count, dtype: int64