In [1]:
import feather
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [2]:
train = feather.read_dataframe('../data/interim/train.ftr')
test = feather.read_dataframe('../data/interim/test.ftr')
train.shape, test.shape

((903653, 55), (804684, 53))

In [3]:
train[["fullVisitorId", "sessionId", "visitId"]].head(5)

Unnamed: 0,fullVisitorId,sessionId,visitId
0,1131660440785968503,1131660440785968503_1472830385,1472830385
1,377306020877927890,377306020877927890_1472880147,1472880147
2,3895546263509774583,3895546263509774583_1472865386,1472865386
3,4763447161404445595,4763447161404445595_1472881213,1472881213
4,27294437909732085,27294437909732085_1472822600,1472822600


In [4]:
# sessionId = fullVisitorId _ visitId
def check_sessionId(data):
    data['make_sessionId'] = f'{data["fullVisitorId"]}_data["visitId"]'
    sum_ = (data['make_sessionId'] == data['sessionId']).sum()
    return sum_

check_sessionId(train), check_sessionId(test)

(0, 0)

In [5]:
print('Number of unique visitor in train set:', train["sessionId"].nunique(), \
      'out of rows:', len(train), \
      'and ratio is:', train['sessionId'].nunique()/len(train))

print('Number of unique visitor in test set:', test["sessionId"].nunique(), \
      'out of rows:', len(test), \
      'and ratio is:', test['sessionId'].nunique()/len(test))

id_train_test = set(train.sessionId.unique()).intersection(set(test.sessionId.unique()))
print("Number of common visitors in train and test set:", len(id_train_test))

Number of unique visitor in train set: 902755 out of rows: 903653 and ratio is: 0.9990062557198394
Number of unique visitor in test set: 803863 out of rows: 804684 and ratio is: 0.9989797237176332
Number of common visitors in train and test set: 5


In [7]:
summary_train = train.groupby('sessionId').count()['date']
summary_train[summary_train>1].head()

sessionId
0011338928267756760_1471848731    2
0014884852016449602_1483257533    2
0019612470994343873_1496731675    2
0031542961723593073_1495261490    2
0039798900163330977_1485331011    2
Name: date, dtype: int64

### revenueとの関係

In [22]:
notunique_train = summary_train[summary_train>1].index.tolist()
print(len(notunique_train))
n_notnullrevenue_train = train.query('sessionId in @notunique_train')['totals.transactionRevenue'].notnull().sum()
print(n_notnullrevenue_train)
print(n_notnullrevenue_train/len(notunique_train))

898
37
0.04120267260579064


### trainとtestで重複しているsessionId

In [8]:
train.query('sessionId in @id_train_test')[['sessionId', 'visitId', 'visitStartTime']].sort_values('visitId')

Unnamed: 0,sessionId,visitId,visitStartTime
63799,0167247604162700002_1501656404,1501656404,1501656404
63105,9945591060914032258_1501656843,1501656843,1501656843
62993,8775495552152201078_1501657013,1501657013,1501657013
64767,9952572636180683443_1501657166,1501657166,1501657166
65054,3800266955937177725_1501657186,1501657186,1501657186


In [9]:
test.query('sessionId in @id_train_test')[['sessionId', 'visitId', 'visitStartTime']].sort_values('visitId')

Unnamed: 0,sessionId,visitId,visitStartTime
614228,0167247604162700002_1501656404,1501656404,1501657280
614398,9945591060914032258_1501656843,1501656843,1501657213
613010,8775495552152201078_1501657013,1501657013,1501657239
613406,9952572636180683443_1501657166,1501657166,1501657216
613297,3800266955937177725_1501657186,1501657186,1501657203


### visitStartTimeを加えたらユニークになる？

- ユニークになる！

In [49]:
train_tmp = train.copy()
train_tmp['data_type'] = 'train'
test_tmp = test.copy()
test_tmp['data_type'] = 'test'

total = pd.concat([train_tmp, test_tmp], axis=0, sort=True).reset_index(drop=True)
total.shape

(1708337, 57)

In [59]:
total['unique_key'] = total.apply(lambda x: x['sessionId']+'_'+str(x['visitStartTime']), axis=1)
total['unique_key'].head()

0    1131660440785968503_1472830385_1472830385
1     377306020877927890_1472880147_1472880147
2    3895546263509774583_1472865386_1472865386
3    4763447161404445595_1472881213_1472881213
4      27294437909732085_1472822600_1472822600
Name: unique_key, dtype: object

In [60]:
print('Number of unique visitor in total set:', total["unique_key"].nunique(), \
      'out of rows:', len(total), \
      'and ratio is:', total['unique_key'].nunique()/len(total))

Number of unique visitor in total set: 1708337 out of rows: 1708337 and ratio is: 1.0


### 特徴量案

In [120]:
total = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)
total.shape

(1708337, 58)

In [121]:
train_range = range(0, len(train))
test_range = range(len(train), len(train)+len(test))

In [122]:
summary_total = total.groupby('sessionId').count()['date']
display(summary_total.sort_values(ascending=False).head())
sessionId_overlap = summary_total[summary_total>1].index.tolist()
len(sessionId_overlap)

sessionId
2975660081934341523_1470462204    2
2048375829958304395_1507445996    2
6238226081716604469_1479196770    2
5150779880512338975_1510559969    2
8688273984148160393_1516952524    2
Name: date, dtype: int64

1724

In [123]:
total['sessionId_overlap_flg'] = 0
total['sessionId_overlap_flg'] = total['sessionId_overlap_flg'].where(~total['sessionId'].isin(sessionId_overlap), 1)
(total['sessionId_overlap_flg']==1).sum()

3448

In [132]:
total.iloc[train_range][['sessionId', 'sessionId_overlap_flg']]

Unnamed: 0,sessionId,sessionId_overlap_flg
0,1131660440785968503_1472830385,0
1,377306020877927890_1472880147,0
2,3895546263509774583_1472865386,0
3,4763447161404445595_1472881213,0
4,27294437909732085_1472822600,0
5,2938943183656635653_1472807194,0
6,1905672039242460897_1472817241,0
7,537222803633850821_1472812602,0
8,4445454811831400414_1472805784,0
9,9499785259412240342_1472812272,0


In [131]:
train['sessionId']

0         1131660440785968503_1472830385
1          377306020877927890_1472880147
2         3895546263509774583_1472865386
3         4763447161404445595_1472881213
4           27294437909732085_1472822600
5         2938943183656635653_1472807194
6         1905672039242460897_1472817241
7          537222803633850821_1472812602
8         4445454811831400414_1472805784
9         9499785259412240342_1472812272
10        0523069750702990437_1472834967
11         982320996976275749_1472849434
12         357659889600827884_1472839882
13        1438082600262726746_1472803483
14        3531015320757710684_1472868337
15        9638207207743070032_1472824614
16        9876750586615598787_1472801099
17        2222266935962032743_1472826820
18        9674781571160116268_1472804607
19        3696906537737368442_1472856874
20        4478318070775453050_1472826420
21        6098154234696452861_1472863754
22        3323434834508685818_1472872530
23        3053576296023059465_1472808484
24         70273