In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import pickle
import random
import numpy as np
import sys
import glob
import pandas as pd

In [2]:
# Column names, in file order, applied to the raw CSV files when they are read.
wepick_data_header = (
    "v u seq rgtme dt label av bq dn dot dv dvcid g lid0 "
    "lid1 lid2 s ci dgid ef ls pe po pot ps set sst st "
    "ti1 ti2 ti3 ti4 ti5 tn1 tn2 tn3 tn4 tn5"
).split()

In [3]:
# Directory holding the raw CSV files. The default is the original author's
# local path; set the WEPICK_DATA_DIR environment variable to point the
# notebook somewhere else without editing this cell.
data_dir = os.environ.get('WEPICK_DATA_DIR', r'/Users/jangmino/tensorflow/DIN_tf_eager')

# Read every header-less CSV in data_dir into its own DataFrame, keyed by path.
# glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to make the read order reproducible between runs and machines.
dic = {}
for fname in sorted(glob.glob(os.path.join(data_dir, '*.csv'))):
    df = pd.read_csv(fname, header=None, names=wepick_data_header)
    dic[fname] = df

In [4]:
# Stack all per-file frames into one table with a fresh 0..n-1 index.
x = pd.concat([frame for frame in dic.values()], ignore_index=True)

In [5]:
# Keep only the columns used in the rest of the notebook.
x = x.loc[:, ['v', 'u', 'seq', 'rgtme', 'dt', 'label', 'ti1', 'ti2']]

In [6]:
# ti1 / ti2 may contain NA values (possibly related to "Delivery 2.0"?).
# Renumber the index, then discard every row with a missing value in any column.
x = x.reset_index(drop=True).dropna(how='any')

In [7]:
# Cast both category-id columns to int64 in one astype call that returns a new
# frame. Assigning the columns one by one onto the frame returned by dropna()
# can raise SettingWithCopyWarning on older pandas versions.
# NOTE(review): presumably these columns are float only because of the NA
# values dropped earlier — confirm.
x = x.astype({'ti1': 'int64', 'ti2': 'int64'})

In [8]:
def build_map(df, col_name):
    """Re-index the values of ``df[col_name]`` to dense integers, in place.

    The distinct values of the column are sorted and numbered ``0..n-1``;
    the column in ``df`` is then overwritten with those numbers.

    Args:
        df: DataFrame to modify in place.
        col_name: Name of the column to re-index.

    Returns:
        A ``(m, key)`` tuple. ``m`` is a dict mapping each original value to
        its integer code, and ``key`` is the sorted list of original values,
        so ``key[code]`` recovers the original value.
    """
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(len(key))))
    # Map through the dict directly rather than a per-element Python lambda;
    # every value in the column is a key of ``m`` by construction.
    df[col_name] = df[col_name].map(m)
    return m, key

In [9]:
# Keep an independent deep copy of the table as it is at this point.
origin_x = x.copy(deep=True)

In [10]:
# Replace the values in column 'v' with dense integer ids (modifies x in place).
deal_map, deal_key = build_map(df=x, col_name='v')

In [11]:
# Replace the values in column 'u' with dense integer ids (modifies x in place).
user_map, user_key = build_map(df=x, col_name='u')

In [12]:
# Replace the values in columns 'ti1' and 'ti2' with dense integer ids
# (each call modifies x in place).
ti1_map, ti1_key = build_map(df=x, col_name='ti1')
ti2_map, ti2_key = build_map(df=x, col_name='ti2')

In [13]:
# Order rows by 'u' and then by 'rgtme', and renumber the index 0..n-1.
x = x.sort_values(by=['u', 'rgtme']).reset_index(drop=True)

In [14]:
# Per-deal category lookup tables: entry i holds the re-indexed ti1 / ti2 code
# of the deal whose re-indexed id (column 'v') is i.
#
# The previous version read x['ti1'][i] for i in range(len(deal_map)), i.e. the
# category of the i-th *row* of the (u, rgtme)-sorted table, which has nothing
# to do with deal id i.
#
# NOTE(review): assumes every deal has a single (ti1, ti2) pair; the first
# occurrence of each deal is used — confirm.
deal_categories = (
    x[['v', 'ti1', 'ti2']]
    .drop_duplicates(subset='v')
    .sort_values('v')
)
# Row k of deal_categories must describe deal id k, otherwise the positional
# lookup tables below would be misaligned.
assert deal_categories['v'].tolist() == list(range(len(deal_map))), \
    "deal ids in x do not cover 0..len(deal_map)-1"
ti1_list = deal_categories['ti1'].values.astype(np.int32)
ti2_list = deal_categories['ti2'].values.astype(np.int32)

In [15]:
# Remove the 'ti1', 'ti2' and 'dt' columns from the table.
x = x.drop(['ti1', 'ti2', 'dt'], axis=1)

In [16]:
# Split the rows by label: label == 1 goes to pos, label == 0 goes to neg.
pos = x.loc[x['label'].eq(1)]
neg = x.loc[x['label'].eq(0)]

In [17]:
# Inner-join positive and negative rows that share the same ('u', 'rgtme').
# Overlapping column names get the suffix '_x' (from pos) or '_y' (from neg).
x = pos.merge(neg, how='inner', on=['u', 'rgtme'], suffixes=('_x', '_y'))

In [18]:
# Preview the first five rows of the merged table.
x.head(n=5)

Unnamed: 0,v_x,u,seq_x,rgtme,label_x,v_y,seq_y,label_y
0,360,0,2,1523423608843,1,129,64,0
1,386,0,11,1523424295459,1,76,56,0
2,311,0,18,1523424419739,1,381,24,0
3,376,0,15,1523431012195,1,33,14,0
4,393,0,33,1523432005678,1,313,4,0


In [19]:
# Bundle the merged table together with every id mapping, key list and
# category lookup array into a single dict.
wepick_data = dict(
    data=x,
    deal_map=deal_map,
    deal_key=deal_key,
    user_map=user_map,
    user_key=user_key,
    ti1_map=ti1_map,
    ti1_key=ti1_key,
    ti2_map=ti2_map,
    ti2_key=ti2_key,
    ti1_list=ti1_list,
    ti2_list=ti2_list,
)

In [20]:
# Write the bundle to wepick_data.pkl inside data_dir.
# The with-statement closes the file on exit, so no explicit f.close() is needed.
with open(os.path.join(data_dir, 'wepick_data.pkl'), 'wb') as f:
    pickle.dump(wepick_data, f, pickle.HIGHEST_PROTOCOL)

In [26]:
# Build train / test samples from each user's rows of the merged table.
#
# For a user with positive deals p[0..n-1] and paired negative deals n[0..n-1]
# (in row order), every position i >= 1 uses p[:i] as the history:
#   * i <  n-1 -> two training samples: (u, history, p[i], 1) and (u, history, n[i], 0)
#   * i == n-1 -> one test sample:      (u, history, (p[i], n[i]))
# Users with a single row contribute no samples.
#
# Loop temporaries have their own names so the cell neither rebinds its own
# loop variable nor overwrites the `pos` / `neg` DataFrames created by the
# earlier split cell (which would break re-running the merge cell).
train_set = []
test_set = []
for u, user_rows in x.groupby('u'):
    pos_deals = user_rows['v_x'].tolist()
    neg_deals = user_rows['v_y'].tolist()
    last_idx = len(pos_deals) - 1
    for i in range(1, len(pos_deals)):
        history = pos_deals[:i]
        if i != last_idx:
            train_set.append((u, history, pos_deals[i], 1))
            train_set.append((u, history, neg_deals[i], 0))
        else:
            test_set.append((u, history, (pos_deals[i], neg_deals[i])))

In [27]:
print(len(train_set))

924410


In [28]:
print(len(test_set))


240012
