In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import pickle
import random
import numpy as np
import sys
import glob
import pandas as pd

In [2]:
# Column names, in file order, applied to the CSV files when they are read
# (the read below passes header=None together with this list).
wepick_data_header = [
    'v', 'u', 'seq', 'rgtme', 'dt', 'label',
    'av', 'bq', 'dn', 'dot', 'dv', 'dvcid', 'g',
    'lid0', 'lid1', 'lid2',
    's', 'ci', 'dgid', 'ef', 'ls', 'pe', 'po', 'pot', 'ps', 'set', 'sst', 'st',
    'ti1', 'ti2', 'ti3', 'ti4', 'ti5',
    'tn1', 'tn2', 'tn3', 'tn4', 'tn5',
]

In [3]:
# Directory that holds the raw CSV files (machine-specific absolute path).
data_dir = r'c:\Users\wmp\TensorFlow\DIN_tf_eager'

# Read every CSV in data_dir into its own DataFrame, keyed by file path.
# The files are read with header=None, so column names come from
# wepick_data_header.
# glob.glob returns matches in arbitrary order; iterating in sorted order makes
# the load order (and, on insertion-ordered dicts, the row order after the
# later concat) the same on every run.
dic = {}
for fname in sorted(glob.glob(os.path.join(data_dir, '*.csv'))):
    df = pd.read_csv(fname, header=None, names=wepick_data_header)
    dic[fname] = df

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Stack all loaded files into a single frame with a fresh 0..n-1 row index.
frames = list(dic.values())
x = pd.concat(frames, ignore_index=True)

In [5]:
# Keep only the columns the rest of the notebook uses.
keep_cols = ['v', 'u', 'seq', 'rgtme', 'dt', 'label', 'ti1', 'ti2']
x = x[keep_cols]

In [6]:
# ti1 / ti2 may contain NA (possibly related to "Delivery 2.0"?).
# Renumber the rows, then discard every row that has a missing value in any
# of the remaining columns.
x = x.reset_index(drop=True).dropna(how='any')

In [7]:
# Cast both category columns to 64-bit integers.
for col in ('ti1', 'ti2'):
    x[col] = x[col].astype('int64')

In [8]:
def build_map(df, col_name):
    """Re-encode ``df[col_name]`` in place as dense integer ids.

    The distinct values of the column are sorted and numbered ``0..n-1``; the
    column is then overwritten with those numbers.

    Args:
        df: DataFrame to modify in place.
        col_name: Name of the column to re-encode.

    Returns:
        A ``(m, key)`` tuple. ``m`` is a dict from each original value to its
        id, and ``key`` is the sorted list of original values, so
        ``key[m[v]] == v``.
    """
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(len(key))))
    # Every value in the column is a key of ``m`` by construction, so mapping
    # through the dict gives the same ids as a per-element lookup while
    # avoiding one Python function call per row.
    df[col_name] = df[col_name].map(m)
    return m, key

In [9]:
# Keep a copy with the original values: the build_map calls that follow
# overwrite columns of x in place with integer ids.
origin_x = x.copy()

In [10]:
# Re-encode column 'v' in place as 0..n-1.
# deal_map: original value -> id; deal_key[id] -> original value.
deal_map, deal_key = build_map(x, 'v')

In [11]:
# Re-encode column 'u' in place as 0..n-1.
# user_map: original value -> id; user_key[id] -> original value.
user_map, user_key = build_map(x, 'u')

In [12]:
# Re-encode both category columns in place as 0..n-1, each with its own
# independent id space (value -> id dict, plus the sorted list of values).
ti1_map, ti1_key = build_map(x, 'ti1')
ti2_map, ti2_key = build_map(x, 'ti2')

In [13]:
# Order the rows by 'u' and then 'rgtme', and renumber them 0..n-1.
x = x.sort_values(['u', 'rgtme']).reset_index(drop=True)

In [14]:
# Per-deal category tables: entry i holds the encoded ti1 / ti2 of the deal
# whose encoded id ('v') is i, so the arrays can be indexed directly by deal id.
#
# The rows of x are log rows sorted by 'u' and 'rgtme', so row i is NOT deal i;
# reading x['ti1'][i] would return the category of whichever row happens to sit
# at position i. Instead take one row per deal id and order those rows by id.
# If a deal id occurs with more than one category, the first one in row order
# is used.
_deal_rows = x.drop_duplicates(subset='v').set_index('v').sort_index()
# Deal ids were built from x itself, so they must cover exactly 0..n-1.
assert list(_deal_rows.index) == list(range(len(deal_map)))
ti1_list = np.array(_deal_rows['ti1'].values, dtype=np.int32)
ti2_list = np.array(_deal_rows['ti2'].values, dtype=np.int32)

In [15]:
# Remove the three columns that are not needed from here on.
x = x.drop(['ti1', 'ti2', 'dt'], axis=1)

In [16]:
# Split the rows by label value: 1 -> pos, 0 -> neg.
labels = x['label']
pos = x[labels == 1]
neg = x[labels == 0]

In [17]:
# Pair each label-1 row with the label-0 rows that share the same 'u' and
# 'rgtme'. Columns present on both sides get the _x (pos) / _y (neg) suffixes.
x = pos.merge(neg, on=['u', 'rgtme'])

In [18]:
# Preview the first rows of the merged frame.
x.head()

Unnamed: 0,v_x,u,seq_x,rgtme,label_x,v_y,seq_y,label_y
0,360,0,2,1523423608843,1.0,129,64,0.0
1,386,0,11,1523424295459,1.0,76,56,0.0
2,311,0,18,1523424419739,1.0,381,24,0.0
3,376,0,15,1523431012195,1.0,33,14,0.0
4,393,0,33,1523432005678,1.0,313,4,0.0


In [19]:
# Bundle the merged frame with every id mapping and both category tables so
# they can be pickled together.
wepick_data = dict(
    data=x,
    deal_map=deal_map, deal_key=deal_key,
    user_map=user_map, user_key=user_key,
    ti1_map=ti1_map, ti1_key=ti1_key,
    ti2_map=ti2_map, ti2_key=ti2_key,
    ti1_list=ti1_list,
    ti2_list=ti2_list,
)

In [20]:
# Persist the bundle to wepick_data.pkl inside data_dir.
# No explicit f.close() is needed: the with-block closes the file on exit.
with open(os.path.join(data_dir, 'wepick_data.pkl'), 'wb') as f:
    pickle.dump(wepick_data, f, pickle.HIGHEST_PROTOCOL)

In [21]:
# Build the samples user by user from the paired rows in x.
#
# For a user with paired values p[0..n-1] (column v_x) and q[0..n-1] (column
# v_y), every position i >= 1 uses the prefix p[:i] as the history:
#   * i < n-1  -> two training samples: (u, history, p[i], 1) and
#                 (u, history, q[i], 0)
#   * i == n-1 -> one test sample: (u, history, (p[i], q[i]))
# A user with a single row therefore contributes nothing.
#
# The loop uses its own variable names so it does not overwrite the `pos` and
# `neg` DataFrames that the merge cell reads; rebinding them to lists here
# would break re-running that cell.
train_set = []
test_set = []
for u, user_rows in x.groupby('u'):
    pos_deals = user_rows['v_x'].tolist()
    neg_deals = user_rows['v_y'].tolist()
    last_idx = len(pos_deals) - 1
    for i in range(1, len(pos_deals)):
        history = pos_deals[:i]
        if i != last_idx:
            train_set.append((u, history, pos_deals[i], 1))
            train_set.append((u, history, neg_deals[i], 0))
        else:
            test_set.append((u, history, (pos_deals[i], neg_deals[i])))

In [33]:
# Fix the seed of the stdlib RNG so the shuffles that follow are repeatable.
random.seed(1234)

In [34]:
# Shuffle both sample lists in place (train first, then test).
for samples in (train_set, test_set):
    random.shuffle(samples)


In [40]:
# Number of training samples and number of test samples.
len(train_set), len(test_set)

(924410, 240012)

In [37]:
# Write four consecutive pickles into one file: train_set, test_set, ti1_list,
# and the (user, deal, ti1) vocabulary sizes. A reader has to call pickle.load
# four times, in that order. ti2_list is not written here.
dataset_path = os.path.join(data_dir, 'wepick_dataset.pkl')
vocab_sizes = (len(user_map), len(deal_map), len(ti1_map))
with open(dataset_path, 'wb') as f:
    for payload in (train_set, test_set, ti1_list, vocab_sizes):
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)


## logs from training

```
Epoch 16 DONE	Cost time: 5413.38
Epoch 17 Global_step 492000	Train_loss: 0.4010	Eval_GAUC: 0.8195	Eval_AUC: 0.8254
Epoch 17 Global_step 493000	Train_loss: 0.4505	Eval_GAUC: 0.8212	Eval_AUC: 0.8271
Epoch 17 Global_step 494000	Train_loss: 0.4460	Eval_GAUC: 0.8204	Eval_AUC: 0.8266
Epoch 17 Global_step 495000	Train_loss: 0.4463	Eval_GAUC: 0.8206	Eval_AUC: 0.8270
Epoch 17 Global_step 496000	Train_loss: 0.4467	Eval_GAUC: 0.8210	Eval_AUC: 0.8273
Epoch 17 Global_step 497000	Train_loss: 0.4469	Eval_GAUC: 0.8189	Eval_AUC: 0.8247
Epoch 17 Global_step 498000	Train_loss: 0.4487	Eval_GAUC: 0.8219	Eval_AUC: 0.8267
Epoch 17 Global_step 499000	Train_loss: 0.4497	Eval_GAUC: 0.8199	Eval_AUC: 0.8254
Epoch 17 Global_step 500000	Train_loss: 0.4481	Eval_GAUC: 0.8252	Eval_AUC: 0.8307
Epoch 17 Global_step 501000	Train_loss: 0.4459	Eval_GAUC: 0.8220	Eval_AUC: 0.8282
Epoch 17 Global_step 502000	Train_loss: 0.4459	Eval_GAUC: 0.8233	Eval_AUC: 0.8285
Epoch 17 Global_step 503000	Train_loss: 0.4456	Eval_GAUC: 0.8232	Eval_AUC: 0.8291
Epoch 17 Global_step 504000	Train_loss: 0.4470	Eval_GAUC: 0.8216	Eval_AUC: 0.8282
Epoch 17 Global_step 505000	Train_loss: 0.4465	Eval_GAUC: 0.8194	Eval_AUC: 0.8251
Epoch 17 Global_step 506000	Train_loss: 0.4467	Eval_GAUC: 0.8216	Eval_AUC: 0.8274
Epoch 17 Global_step 507000	Train_loss: 0.4446	Eval_GAUC: 0.8191	Eval_AUC: 0.8250
Epoch 17 Global_step 508000	Train_loss: 0.4439	Eval_GAUC: 0.8185	Eval_AUC: 0.8246
Epoch 17 Global_step 509000	Train_loss: 0.4446	Eval_GAUC: 0.8180	Eval_AUC: 0.8234
Epoch 17 Global_step 510000	Train_loss: 0.4450	Eval_GAUC: 0.8208	Eval_AUC: 0.8268
Epoch 17 Global_step 511000	Train_loss: 0.4440	Eval_GAUC: 0.8204	Eval_AUC: 0.8264
```

In [60]:
# Spot check: all merged rows of the user with encoded id 319894, ordered by 'rgtme'.
x[x['u']==319894].sort_values('rgtme')

Unnamed: 0,v_x,u,seq_x,rgtme,label_x,v_y,seq_y,label_y
896407,372,319894,1,1523447596572,1.0,336,44,0.0
896408,372,319894,1,1523447740829,1.0,354,38,0.0
896409,313,319894,4,1523447753500,1.0,354,38,0.0


In [64]:
# Spot check on the un-encoded copy: all rows whose original 'u' value is
# 17739649, ordered by 'rgtme'.
origin_x[origin_x['u']==17739649].sort_values('rgtme')

Unnamed: 0,v,u,seq,rgtme,dt,label,ti1,ti2
4904456,3525364,17739649,10,1523421973269,2018-04-11 13,1.0,243030109,243030100
4904457,3525364,17739649,10,1523422520864,2018-04-11 13,1.0,243030109,243030100
3205176,3527645,17739649,1,1523447596572,2018-04-11 20,1.0,243030109,243030100
3563165,3522402,17739649,44,1523447596572,2018-04-11 20,0.0,235050109,235050100
3205175,3527645,17739649,1,1523447740829,2018-04-11 20,1.0,243030109,243030100
4247512,3525124,17739649,38,1523447740829,2018-04-11 20,0.0,241020104,241020100
3832669,3521050,17739649,4,1523447753500,2018-04-11 20,1.0,243030103,243030100
4247513,3525124,17739649,38,1523447753500,2018-04-11 20,0.0,241020104,241020100


In [48]:
# Scratch check: wraps `a` in a one-row 2-D structure, flips it left-to-right
# and takes the row back out, i.e. `a` reversed.
# NOTE(review): `a` is not defined in any visible cell, so this fails on a
# fresh kernel -- define it here or remove the cell.
np.fliplr([a])[0]

array([4, 3, 2, 1])

In [67]:
# Look up the original 'v' value behind encoded id 334.
deal_key[334]

3522378

In [57]:
# Display the whole original-value -> id mapping for column 'v'.
# NOTE(review): this prints every entry; consider showing only a slice.
deal_map

{1285223: 0,
 1371699: 1,
 1432589: 2,
 1432649: 3,
 1438471: 4,
 1468919: 5,
 1470891: 6,
 1523317: 7,
 1534369: 8,
 1855957: 9,
 1942015: 10,
 2040590: 11,
 2235951: 12,
 2258611: 13,
 2271029: 14,
 2388501: 15,
 2409745: 16,
 2539866: 17,
 2815821: 18,
 2827960: 19,
 2828078: 20,
 2833857: 21,
 2847768: 22,
 2991058: 23,
 3009333: 24,
 3044938: 25,
 3061867: 26,
 3068897: 27,
 3119252: 28,
 3119538: 29,
 3119665: 30,
 3182979: 31,
 3198909: 32,
 3207879: 33,
 3210699: 34,
 3238108: 35,
 3308451: 36,
 3312415: 37,
 3320424: 38,
 3337685: 39,
 3345369: 40,
 3346365: 41,
 3360824: 42,
 3395172: 43,
 3407791: 44,
 3419534: 45,
 3419838: 46,
 3424756: 47,
 3433098: 48,
 3435709: 49,
 3435712: 50,
 3435713: 51,
 3435715: 52,
 3449366: 53,
 3454797: 54,
 3460191: 55,
 3461951: 56,
 3464089: 57,
 3464309: 58,
 3466191: 59,
 3466341: 60,
 3466753: 61,
 3471308: 62,
 3471409: 63,
 3471757: 64,
 3472653: 65,
 3477356: 66,
 3478330: 67,
 3478607: 68,
 3479353: 69,
 3480103: 70,
 3480437: 71,
 3

In [None]:
# user_map has one entry per distinct 'u' value, which is far too many to
# print in full; show its size and the first few (value, id) pairs instead.
len(user_map), list(user_map.items())[:5]