In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

  from ._conv import register_converters as _register_converters


# Loading data

In [2]:
direc=os.path.dirname(os.path.realpath('__file__'))
os.path.join(direc)
table = pd.read_csv("category_prediction.csv", names=['userId','age','sex','date','dayofweek','amount','geoId','catId','title'])
table = table.sort_values(by='date')
table['catId'].astype(int)
table['date'] = pd.to_datetime(table['date'], format='%Y-%m-%d')
table = table.set_index(['date'])
#print table

# Category filtering:: 외식,유흥,문화예술,뷰티

In [3]:
#외식/유흥/문화예술/뷰티
table = table.query('amount <= 5000000')
table = table.query('(catId >= 220000 and catId <=240000) or (catId>=260000 and catId<=270000) or (catId>=560000 and catId<=570000) or (catId>=460000 and catId<= 470000) or (catId >= 241114 and catId <= 241114 )')
print table

                     userId  age  sex  dayofweek  amount       geoId   catId  \
date                                                                           
2018-01-01 00:02:11  534536   47    0          2   22200   서울특별시 노원구  221210   
2018-01-01 00:03:14  449134   36    0          2    7000   서울특별시 은평구  561110   
2018-01-01 00:09:12  501328   31    0          2   79000   서울특별시 중랑구  221111   
2018-01-01 00:09:13  491320   50    1          2   38000   서울특별시 중랑구  261310   
2018-01-01 00:09:21  143856   36    1          2    8000   서울특별시 용산구  221810   
2018-01-01 00:10:58  539205   28    1          2   25500   서울특별시 종로구  221815   
2018-01-01 00:11:32  423819   30    1          2    5500   서울특별시 강남구  261310   
2018-01-01 00:12:53  218341   23    0          2   16000   서울특별시 송파구  222113   
2018-01-01 00:13:37  489516   32    1          2   18000   서울특별시 성동구  561110   
2018-01-01 00:13:58  421620   37    0          2    8000   서울특별시 마포구  261310   
2018-01-01 00:14:24  354398   34    1   

# Input scaling

In [4]:
#min-max scaling
epsilon = 0.00000000001
print table['amount'].min()
print table['amount'].max()

table['age'] =    ( table['age'] - table['age'].min() ) /( table['age'].max()-table['age'].min()+epsilon)
table['amount'] = (table['amount'] - table['amount'].min() ) / (table['amount'].max() - table['amount'].min() + epsilon )

1
5000000


# Hyper parameters

In [5]:
factorized_cat = pd.factorize(np.array(table['catId']))
factorized_geo = pd.factorize(np.array(table['geoId']))

original_cat={}
leng = len(factorized_cat[0])
catList = table['catId']

for i in range(0,leng):
    original_cat[ factorized_cat[0][i] ] = catList[i]
    
#params
table['catId'] = factorized_cat[0] 
catMin = int(table.agg({'catId':'min'}))
catMax = int(table.agg({'catId':'max'}))

table['geoId'] = factorized_geo[0]
geoMin = int(table.agg({'geoId':'min'}))
geoMax = int(table.agg({'geoId':'max'}))
sssssss

params={}
params['catCount'] = catMax+1
params['geoCount'] = geoMax+1
params['maxLen'] = 31

params['input_size'] = 1+2+(7)+ params['catCount'] + params['geoCount']

print "Category minmax     : [%s,%s]"% (catMin,catMax)
print "Geo minmax          : [%s,%s]"% (geoMin,geoMax)
print "user Sequence       : [%s~%s]"%(0,params['maxLen']-1)



Category minmax     : [0,140]
Geo minmax          : [0,27]
user Sequence       : [0~30]


# Time window duration

In [6]:
def timerange(mode):

    if mode is 'train':
        input_s = pd.date_range(start='2018-01-01 00:00:00',periods=4,freq='10d').astype(str)
        input_e = pd.date_range(start='2018-01-31 23:59:59',periods=4,freq='10d').astype(str)
        target_s = pd.date_range(start='2018-02-01 00:00:00',periods=4,freq='10d').astype(str)
        target_e = pd.date_range(start='2018-02-10 23:59:59',periods=4,freq='10d').astype(str)
        return zip(input_s,input_e,target_s,target_e)
    
    elif mode is 'test':
        input_s = pd.date_range(start='2018-03-01 00:00:00',periods=1,freq='1d').astype(str)
        input_e = pd.date_range(start='2018-03-31 23:59:59',periods=1,freq='1d').astype(str)
        target_s = pd.date_range(start='2018-04-01 00:00:00',periods=1,freq='1d').astype(str)
        target_e = pd.date_range(start='2018-04-10 23:59:59',periods=1,freq='1d').astype(str)
        return zip(input_s,input_e,target_s,target_e)
    


# Data preprocessing with time window

In [7]:
import time
def df2seq(df,params):
    
    #dataframe to Sequence
    seq_input = []  # (batch,maxLen,input_size)
    seq_label = []  # (batch,maxLen,output_size)
    seq_mask = [] # (batch,maxLen,1)
    seq_len = []    # (batch)
    
    maxLen = params['maxLen']
    categoryCount = params['catCount']
    geoCount = params['geoCount']
    
    cutBound = params['cutBound']
    
    tp = timerange( params['mode'] )
    
    
    for s,e,ts,te in tp:

        userHistories = df.loc[s:e].groupby(['userId'])
        userFutures = df.loc[ts:te].groupby(['userId'])
        
        user_future = userFutures.groups.keys()
        user_history= userHistories.groups.keys()
           
        print s,e
        
        sss = time.time()
        for user in user_future:
        
            if user not in user_history:
                continue
            
            userData = userHistories.get_group((user))
            userTarget = userFutures.get_group((user))
            
            example = userData.loc[s:e]
            answer  = userTarget.loc[ts:te]
            
            if(len(example) < cutBound): continue
            
            start_time = pd.to_datetime(s)
            end_time = pd.to_datetime(e)
            
            #user features
            sex_features = np.zeros((maxLen,2))
            age_features = np.zeros((maxLen,1))
            
            #time features
            week_features = np.zeros((maxLen,7))
            
            #transaction features
            geo_features = np.zeros((maxLen,geoCount))
            cat_features = np.zeros((maxLen,categoryCount))
            
            label = np.zeros((maxLen,categoryCount))
            mask = np.ones((maxLen,1))*0.1
            
            
            dateList = example.index
            ageList = example['age']
            amoList = example['amount']
            sexList = example['sex']
            weekList= example['dayofweek']
            catList = example['catId']
            geoList = example['geoId']
             
            #user features
            sex_features[:,sexList[0]]=1
            age_features[:,0] = ageList[0]
            
            #time feature
            dayofweek_s = start_time.dayofweek
            for day in range(maxLen):
                week_features[day][(dayofweek_s + day)%7]=1
            
            #transaction features
            length = len(dateList)
            for t in range(length):
                today = (dateList[t]-start_time).days
                past = range( max(0, today-10),today)
                label[past , [ catList[t] ] * len(past) ]=1 
                mask[past]=1
                
                geo_features[today][geoList[t]]=1
                cat_features[today][catList[t]]=1
                         
            temp = np.zeros((maxLen,0))
            temp = np.concatenate((temp,age_features),axis=1)
            temp = np.concatenate((temp,sex_features),axis=1)
            temp = np.concatenate((temp,week_features),axis=1)
            temp = np.concatenate((temp,geo_features),axis=1)
            temp = np.concatenate((temp,cat_features),axis=1)
            
            seq_input.append(temp)
            seq_label.append(label)
            seq_len.append(length)  
            seq_mask.append(mask)
            
        print("--- %s seconds ---" %(time.time() - sss))
    return np.array(seq_input),np.array(seq_label),np.array(seq_mask)


In [8]:
params['mode']='train'
params['cutBound']=5
seq_train_input, seq_train_label, seq_train_mask = df2seq(table,params)

print seq_train_input.shape,seq_train_label.shape ,seq_train_mask.shape

2018-01-01 2018-01-31 23:59:59
--- 63.8730020523 seconds ---
2018-01-11 2018-02-10 23:59:59
--- 63.6697821617 seconds ---
2018-01-21 2018-02-20 23:59:59
--- 74.6590688229 seconds ---
2018-01-31 2018-03-02 23:59:59
--- 59.9870588779 seconds ---
(28210, 31, 179) (28210, 31, 141) (28210, 31, 1)


In [9]:
params['mode']='test'
params['cutBound']=1
seq_test_input, seq_test_label, seq_test_mask= df2seq(table,params)

print seq_test_input.shape, seq_test_label.shape ,seq_test_mask.shape

2018-03-01 2018-03-31 23:59:59
--- 71.7370831966 seconds ---
(16548, 31, 179) (16548, 31, 141) (16548, 31, 1)


In [10]:
def converter(userHistories,s,e,userId,params):
    
    maxLen = params['maxLen']
    categoryCount = params['catCount']
    geoCount = params['geoCount']
     
    seq_input=[]
    userData = userHistories.get_group(userId)
    example = userData.loc[s:e]
    
    start_time = pd.to_datetime(s)
    end_time = pd.to_datetime(e)
            
    #user features
    sex_features = np.zeros((maxLen,2))
    age_features = np.zeros((maxLen,1))
            
    #time features
    week_features = np.zeros((maxLen,7))
            
    #transaction features
    geo_features = np.zeros((maxLen,geoCount))
    cat_features = np.zeros((maxLen,categoryCount))
            
    label = np.zeros((maxLen,categoryCount))
            
    dateList = example.index
    ageList = example['age']
    amoList = example['amount']
    sexList = example['sex']
    weekList= example['dayofweek']
    catList = example['catId']
    geoList = example['geoId']
             
    #user features
    sex_features[:,sexList[0]]=1
    age_features[:,0] = ageList[0]
            
    #time feature
    dayofweek_s = start_time.dayofweek
    for day in range(maxLen):
        week_features[day][(dayofweek_s + day)%7]=1
            
    #transaction features
    length = len(dateList)
    for t in range(length):
        today = (dateList[t]-start_time).days
        geo_features[today][geoList[t]]=1
        cat_features[today][catList[t]]=1
                            
    temp = np.zeros((maxLen,0))
    temp = np.concatenate((temp,age_features),axis=1)
    temp = np.concatenate((temp,sex_features),axis=1)
    temp = np.concatenate((temp,week_features),axis=1)
    temp = np.concatenate((temp,geo_features),axis=1)
    temp = np.concatenate((temp,cat_features),axis=1)
            
    seq_input.append(temp)
    
    return np.array(seq_input)

In [12]:
def lstmcell(size, prob_tensor):
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm_cell,output_keep_prob=prob_tensor)
    
    return lstm_dropout

max_len = params['maxLen']
input_size = params['input_size']
output_size= params['catCount']

tf.reset_default_graph()

#define lstm
sequence = tf.placeholder( tf.float32, [None, max_len, input_size ])
labels = tf.placeholder( tf.float32, [None,max_len,output_size])
masks = tf.placeholder(tf.float32, [None,max_len,1])

keep_prob = tf.placeholder(tf.float32)

dynamic_length= tf.placeholder(tf.int32, [None])


#Model
rnn_layers = [ lstmcell(size,keep_prob) for size in [50]]
multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
outputs,_ = tf.nn.dynamic_rnn(multi_rnn_cell, sequence , dtype=tf.float32)

#predictions = outputs
predictions = (tf.contrib.layers.fully_connected(outputs, output_size))


#print predictions.shape
topk = tf.nn.top_k(predictions,k=10,sorted=True).indices
tops = tf.nn.top_k(predictions,k=10,sorted=True).values
#
loss = tf.reduce_mean(tf.square(labels-predictions)*masks)

train = tf.train.AdamOptimizer(learning_rate= 0.0001).minimize(loss)

epochs=3000
#4000
batchSize = 1024
#70069
trainSize,_,_ = seq_train_input.shape
testSize,_,_ = seq_test_input.shape

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
    
for it in range(epochs):
    
    st=time.time()
    for b in range((trainSize/batchSize)):
        batch_s = b*batchSize
        batch_e = min(trainSize,batch_s + batchSize)
        l,_ = sess.run([loss,train],feed_dict={sequence: seq_train_input[batch_s:batch_e],
                                              labels:seq_train_label[batch_s:batch_e],
                                                  masks:seq_train_mask[batch_s:batch_e],keep_prob:0.5} )
    closs=0
    cnt=0;
    for b in range((testSize/batchSize)):
        batch_s = b*batchSize
        batch_e = min(testSize,batch_s + batchSize)
        l2= sess.run(loss,feed_dict={sequence: seq_test_input[batch_s:batch_e],
                                              labels: seq_test_label[batch_s:batch_e],
                                              masks:seq_test_mask[batch_s:batch_e],keep_prob:1.0})
        cnt+=1.0
        closs+=l2;
        
    closs/=cnt;
    
    
    
    print ("Loss[%s/%s]:%s  %s"%(it,epochs,closs, time.time() -st))
    if(it%10 ==0):
        userHistories = table['2018-03-01 00:00:00' : '2018-03-31 23:59:59' ].groupby(['userId'])
        test_users_20 = [329439,463618, 536284,551211,453264,420850,166536,434771,391746,503676,178238,567126,291711]
        test_users_30 = [441638,414176,554917,295,533903,44447,469767,445193,441080]

        
        cnt=0
        for user in test_users_20:
            ex = converter(userHistories,'2018-03-01 00:00:00', '2018-03-31 23:59:59',user,params)
        
            R=sess.run(topk,feed_dict={sequence: ex,keep_prob:1.0})
            cnt+=1
            print ("[Test 20-%s]"%(cnt))
            print R[0][-1]
        
        cnt=0
        for user in test_users_30:
            ex = converter(userHistories, '2018-03-01 00:00:00', '2018-03-31 23:59:59',user,params)    
            R=sess.run(topk, feed_dict={sequence: ex,keep_prob:1.0})
            cnt+=1
            print ("[Test 30-%s]"%(cnt))
            print R[0][-1]
        


Loss[0/3000]:0.0066664737241808325  1.85569310188
[Test 20-1]
[ 20   1 100  99  90 112  29  81  95  59]
[Test 20-2]
[ 20   1 100  90  99  29 104 112  82  81]
[Test 20-3]
[ 27 109  92  80  77 110  48 111   1  13]
[Test 20-4]
[109  92  48 110   4  80 111  16 118  13]
[Test 20-5]
[109 110  92  27  48  77  80 111   1 118]
[Test 20-6]
[109 110   4 111  92  48  77 118  80  16]
[Test 20-7]
[ 92 110 109  48 111  80  27  13  77  99]
[Test 20-8]
[ 92  27  13  80  84 110  99   4  74  48]
[Test 20-9]
[ 20   1  99 100 112  90  81  29  82  92]
[Test 20-10]
[ 20   1  99 100 112  90  81  29  95  92]
[Test 20-11]
[ 20   1  99  90 100 112  81  29 104  95]
[Test 20-12]
[ 20   1  99  90 100 112  81  29 104  95]
[Test 20-13]
[ 92  80 109  48  99   4 110  27  13  74]
[Test 30-1]
[ 20   1  99 112  29  90 100  95  82 133]
[Test 30-2]
[ 92 110   4  74  80 109  51 111  48  27]
[Test 30-3]
[ 92  80  13  99  74 111  48  16  84   4]
[Test 30-4]
[110  48  92 109  80 111   4 118  99  77]
[Test 30-5]
[ 13  92  99 110

[Test 30-2]
[20 17 23 34  0 11 21 14  4 50]
[Test 30-3]
[17 20 23 34 21  0 11 14  4  1]
[Test 30-4]
[20 17 23  0 34 21 11  1  9 14]
[Test 30-5]
[17  0 20 23 29  8 34  1 11 31]
[Test 30-6]
[ 0 20 23 34 17  8 31 11  1 21]
[Test 30-7]
[17  0 20 31 11 40 28 21 18 14]
[Test 30-8]
[17 34 20 23 21 31 14 11 40 18]
[Test 30-9]
[17 20 23  0 34 11 21  1 14 31]
Loss[51/3000]:0.005830977781442925  1.47990393639
Loss[52/3000]:0.00582927608047612  1.47953987122
Loss[53/3000]:0.005827677989145741  1.4767780304
Loss[54/3000]:0.005825634056236595  1.48004484177
Loss[55/3000]:0.0058232193114236  1.48894095421
Loss[56/3000]:0.005821157130412757  1.49318909645
Loss[57/3000]:0.0058191683201584965  1.49207210541
Loss[58/3000]:0.005817152763484046  1.48125386238
Loss[59/3000]:0.005815706826979294  1.47043299675
Loss[60/3000]:0.005813848110847175  1.4677169323
[Test 20-1]
[23 17  8 20  0 31 11  1 42 34]
[Test 20-2]
[ 8 23  0 31  1 20 42 17 27 29]
[Test 20-3]
[17  0 20 23 31 11  1 40 21 27]
[Test 20-4]
[ 0 20 1

[Test 30-2]
[20 17 23  0 34 11 21 50 31 14]
[Test 30-3]
[20 17 23 34 21 45 29 31 28 14]
[Test 30-4]
[20 21 34 17  0 23 41  9 14 45]
[Test 30-5]
[29  8 20  0  1 23 44 45 34 17]
[Test 30-6]
[20  0 34 21 31 11 17  1 50 14]
[Test 30-7]
[17  0 20 23 31 28 50 34 14 40]
[Test 30-8]
[17 34 21 20 14 52 45 31 18 11]
[Test 30-9]
[20 17 23  0 34 21 31  4 28 50]
Loss[111/3000]:0.005760567408287898  1.47279882431
Loss[112/3000]:0.005759974359534681  1.47525787354
Loss[113/3000]:0.005759361607488245  1.48196196556
Loss[114/3000]:0.005759147432399914  1.47818803787
Loss[115/3000]:0.0057585283648222685  1.47547793388
Loss[116/3000]:0.005758131679613143  1.47142386436
Loss[117/3000]:0.005758198531111702  1.47150707245
Loss[118/3000]:0.005757644044933841  1.47387504578
Loss[119/3000]:0.005757104052463546  1.4701499939
Loss[120/3000]:0.00575689083780162  1.47303915024
[Test 20-1]
[23 41 20  8 21 17 31  9  4  1]
[Test 20-2]
[ 8 23  1 31 42  0  4 41  6  9]
[Test 20-3]
[17  0  8 23  1 31 20 21 40  9]
[Test 2

Loss[170/3000]:0.005736034887377173  1.46479296684
[Test 20-1]
[21 23 20 41  9 31  8 16  3 28]
[Test 20-2]
[ 8 23 31  1 42  0 41  6  9 40]
[Test 20-3]
[17  0  1 23  8  9 31 40 20  3]
[Test 20-4]
[ 0 20 11 17 21 34 31 40  9 50]
[Test 20-5]
[17 20 34 11 36  4  9 40 14  3]
[Test 20-6]
[ 4  0 20 21  9 23 31 17 11 50]
[Test 20-7]
[ 0 23 20 17  1  8 40  9 31  3]
[Test 20-8]
[ 0 23 17 40 27  1  9  6 50 11]
[Test 20-9]
[ 8 17  0  1 23 42 31  9 40  3]
[Test 20-10]
[ 8  0 23 42  1  4 31 17  9 40]
[Test 20-11]
[17  8 23 42 31  1  9 40 28  6]
[Test 20-12]
[17 21  8 31 20 42  9  1 28 23]
[Test 20-13]
[20  1  8 17  9  3 13  4 40 16]
[Test 30-1]
[17 29 23  9 14 20 34 40 28  3]
[Test 30-2]
[20 23 17  0 11 34 21 50 31 28]
[Test 30-3]
[20 17 23 28 40 50  9  3 72 31]
[Test 30-4]
[20 21  9 34 28 16 14  3 72 40]
[Test 30-5]
[29 20 34 14 31 45  9 40 35  3]
[Test 30-6]
[20  0 34 31 21 50 18 28  9 14]
[Test 30-7]
[17  0 20 23 28 40 31 50  9 32]
[Test 30-8]
[34 21 17 20 14  9 18 39 31 52]
[Test 30-9]
[20 17 23

Loss[221/3000]:0.005730333039537072  1.47627282143
Loss[222/3000]:0.005730160279199481  1.47388195992
Loss[223/3000]:0.005730284901801497  1.47186589241
Loss[224/3000]:0.005730292905354872  1.47684192657
Loss[225/3000]:0.0057305383670609444  1.48180389404
Loss[226/3000]:0.005730521137593314  1.47994399071
Loss[227/3000]:0.0057307232927996665  1.48875308037
Loss[228/3000]:0.005730943317757919  1.48955392838
Loss[229/3000]:0.005730783013859764  1.49668908119
Loss[230/3000]:0.005730877164751291  1.49807596207
[Test 20-1]
[21  9 20 23  3 31 17 16  8 28]
[Test 20-2]
[ 8 31 23  1 42  0  6 40 57 32]
[Test 20-3]
[17  0  9  3 40 31  1 20 16 28]
[Test 20-4]
[20  0 11 31 34 17  9 40 50  3]
[Test 20-5]
[17 11 20 34  9  4 36 40 14 50]
[Test 20-6]
[ 4  9 31  0 20 11  3 50 14 16]
[Test 20-7]
[ 0 23 20 17  1  9 40  8  3 27]
[Test 20-8]
[ 0 23 17 27 40  9 11  6  1 50]
[Test 20-9]
[ 8 17 42  0  1 31  9  3 40 23]
[Test 20-10]
[ 8  0 42  4  1 31  9 23  3 17]
[Test 20-11]
[17  8 42 23  9 31 28 40  1  6]
[T

[Test 30-2]
[20 17 23  0 11 34 31 28 50 27]
[Test 30-3]
[20 17 28 16 40  3  9 72 35 27]
[Test 30-4]
[20  9 28 16 72  3 40 21 50 17]
[Test 30-5]
[29 34 20 45 14 35 16 31  9  3]
[Test 30-6]
[20 34  0 31 17 21  9 28 50 14]
[Test 30-7]
[17  0 20 23 31 40 28 50 32 27]
[Test 30-8]
[34 21 17 20  9 14 45 52 28 18]
[Test 30-9]
[20 17 23 34  0 31 28 50 40  9]
Loss[281/3000]:0.005749893374741077  1.48248314857
Loss[282/3000]:0.00575026313890703  1.47381591797
Loss[283/3000]:0.00575071555795148  1.47084903717
Loss[284/3000]:0.005751385760959238  1.46863603592
Loss[285/3000]:0.005751267628511414  1.47061896324
Loss[286/3000]:0.005751898919697851  1.46802806854
Loss[287/3000]:0.005751807300839573  1.47082495689
Loss[288/3000]:0.005752088996814564  1.47052788734
Loss[289/3000]:0.005751913122367114  1.47058105469
Loss[290/3000]:0.005751514487201348  1.47358417511
[Test 20-1]
[21 20  9 17 23  3 31 28 16 50]
[Test 20-2]
[ 8 31 23  1 42  0 17  6 40 57]
[Test 20-3]
[17  0 31  9 20 40  3  1 16 23]
[Test 20

KeyboardInterrupt: 


- 0: 패스트푸드/일반
- 1: 영화
- 2: 외식/회
- 9: 맥주
- 42: 화장품
- 21: 한식뷔폐
- 17: 한식/일반
- 8 :
- 31: 일식/일반
- 23: 분식/일반
- 50: 일식/돈까스
- 40: 헤어샵

In [71]:
original_cat


{0: 221210,
 1: 561110,
 2: 221111,
 3: 261310,
 4: 221810,
 5: 221815,
 6: 222113,
 7: 261318,
 8: 461312,
 9: 261322,
 10: 261312,
 11: 221410,
 12: 261311,
 13: 561616,
 14: 221114,
 15: 221162,
 16: 561615,
 17: 221110,
 18: 221133,
 19: 221151,
 20: 221510,
 21: 222012,
 22: 221910,
 23: 222110,
 24: 222217,
 25: 261115,
 26: 221122,
 27: 221811,
 28: 221156,
 29: 222114,
 30: 261316,
 31: 221610,
 32: 221914,
 33: 221155,
 34: 222210,
 35: 221157,
 36: 221712,
 37: 221159,
 38: 222218,
 39: 221120,
 40: 461110,
 41: 221148,
 42: 461311,
 43: 561214,
 44: 222211,
 45: 221115,
 46: 221211,
 47: 221113,
 48: 221152,
 49: 221127,
 50: 221621,
 51: 221612,
 52: 221135,
 53: 221137,
 54: 221118,
 55: 222010,
 56: 221620,
 57: 221310,
 58: 222219,
 59: 222112,
 60: 561213,
 61: 221142,
 62: 221820,
 63: 221618,
 64: 221153,
 65: 221130,
 66: 561614,
 67: 221615,
 68: 221121,
 69: 221911,
 70: 561410,
 71: 222014,
 72: 221126,
 73: 221710,
 74: 221821,
 75: 221147,
 76: 561610,
 77: 4612

In [16]:
# test_users_20 = [329439,463618, 536284,551211,453264,420850,166536,434771,391746,503676,178238,567126,291711]
#test_users_30 = [441638,414176,554917,295,533903,44447,469767,445193,441080]

table['2018-03-01':'2018-04-10'].groupby(['userId']).get_group(441080)

Unnamed: 0_level_0,userId,age,sex,dayofweek,amount,geoId,catId,title
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-03-02 12:37:53,441080,0.227723,1,6,0.0013,4,20,홍복
2018-03-06 09:04:15,441080,0.227723,1,3,0.0002,4,0,맥도날드 관훈점
2018-03-06 12:43:10,441080,0.227723,1,3,0.0014,4,45,영주식당
2018-03-07 12:36:33,441080,0.227723,1,4,0.0014,4,20,거구장
2018-03-12 12:48:59,441080,0.227723,1,2,0.0016,4,14,인사동쭈꾸미
2018-03-12 20:49:50,441080,0.227723,1,2,0.0044,15,91,춘천옥닭갈비
2018-03-13 19:22:00,441080,0.227723,1,3,0.0006,15,23,도마김밥
2018-03-13 19:28:19,441080,0.227723,1,3,0.0006,15,23,참맛김밥
2018-03-15 12:03:56,441080,0.227723,1,5,0.0036,4,45,경복궁 종로점
2018-03-19 12:43:19,441080,0.227723,1,2,0.0014,4,20,거구장


[329439,
 463618,
 536284,
 551211,
 453264,
 420850,
 166536,
 434771,
 391746,
 503676,
 178238,
 567126,
 291711]

In [7]:
table['2018-03-01':'2018-04-10'].groupby(['userId']).get_group(461638)

KeyError: 461638

In [None]:
# test_users_20 = [329439,463618, 536284,551211,453264,420850,166536,434771,391746,503676,178238,567126,291711]
#test_users_30 = [441638,414176,554917,295,533903,44447,469767,445193,441080]

# User Sampling


- Sampling based on user group
    - 20대 = [329439,463618, 536284,551211,453264,420850,166536,434771,391746,503676,178238,567126,291711]
        
    - 30대 = [441638,414176,554917,295,533903,44447,469767,445193,441080]
            
 

In [5]:
target_e = pd.date_range(start='2018-02-10 23:59:59',periods=4,freq='10d').astype(str)

In [57]:
masks2 = np.zeros((10,1))

In [59]:
masks2[[1,2]] =1
masks2

array([[0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [46]:
pred = np.ones((2,10,3))*0.1
pred

array([[[0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1]],

       [[0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1]]])

In [40]:
pred*masks

array([[[0., 0., 0.],
        [1., 1., 1.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [1., 1., 1.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [124]:


import tensorflow as tf

num_classes=10
def precision_at_k(pred_top_k,labels,topk):
    #pred_top_k : shape => [batchSize,Numclasses]
    #labels : shape => [batchSize,Numclasses]
    
    hitCounts = tf.reduce_sum(pred_top_k*labels,axis=1)
    return tf.metrics.mean(pred_top_k,labels)

    
logits = tf.placeholder(tf.float32, [None,num_classes])
labels = tf.placeholder(tf.float32, [None,num_classes])

pred_top_k=  tf.reduce_sum ( tf.one_hot( tf.nn.top_k(logits,k=3).indices,
    depth =num_classes),axis=1)


sess= tf.Session()
sess.run(tf.local_variables_initializer())
sess.run(tf.initialize_all_variables())

In [143]:
output1 = [[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
          [0.1,0.2,0.3,0.4,0.5,0.5,0.4,0.3,0.2,0.1]]
output2 = [
    [1,]
]

answer = [
    [0,0,0,0,0,0,0,1,0,0],
    [1.,0,0,0,1,1,0,0,0,1]
]

In [144]:
sess.run(pred_top_k, feed_dict={logits: output} )

array([[0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 0., 0., 0., 0.]], dtype=float32)

In [145]:
sess.run(pred_top_k*labels, feed_dict={logits: output, labels: answer})

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.]], dtype=float32)

In [149]:
sess.run( tf.reduce_sum(pred_top_k*labels,axis=1)/3.0, feed_dict={logits:output,labels:answer})

array([0.33333334, 0.6666667 ], dtype=float32)

In [147]:

a = tf.metrics.mean(tf.cast(tf.reduce_sum(pred_top_k* labels,axis=1),tf.float32)/3.0)

In [150]:
sess.run(tf.local_variables_initializer())
sess.run(a , feed_dict = {logits : output, labels:answer})

sess.run(a, feed_dict = {logits : output, labels:answer})

(0.5, 0.5)