In [32]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import ast
import time
from sklearn import preprocessing

In [71]:
score=pd.read_csv('score.csv',header=None)
job_feature=pd.read_csv('job_feature1.csv',index_col=False)
query_feature=pd.read_csv('query_feature.csv')

In [72]:
score.columns=['jobno','querystring','y']

In [73]:
job_feature=job_feature.iloc[:,1:]

In [74]:
query_feature=query_feature.iloc[:,1:]

In [16]:
with open('train-click.json','r') as f:
    a=f.readlines()
Lis=[ast.literal_eval(i) for i in a]
Lis=[{'joblist':i['joblist'],'querystring':i['querystring']} for i in Lis]

In [33]:

def get_variable(lis,query):
    #1 把list中每個元素都找到job_feature並與query join
    #2 計算出每個query對應到job的rel分數，若沒有值補0
    d = {'jobno': [int(i) for i in lis], 'querystring': [query for i in lis]}
    dat = pd.merge(pd.DataFrame(d), score, how='left', on=['jobno','querystring']).fillna(0)
    #3 merge-job feature
    dat = pd.merge(pd.DataFrame(dat), job_feature, how='left', on=['jobno']).fillna(0)
    #4 merge query feature
    dat = pd.merge(pd.DataFrame(dat), query_feature, how='left', on=['querystring']).fillna(0)
    
    return np.array(dat['y']),np.array(dat.iloc[:,3:])

In [75]:
class ListNet():
    def __init__(self,learning_rate=0.01,list_size=20,query_feature_size=42,job_feature_size=0):
        tf.reset_default_graph()
        self.learning_rate = learning_rate
        self.list_size = list_size 
        self.query_feature_size=query_feature_size
        self.job_feature_size=job_feature_size
        # tf Graph input
        self.x = tf.placeholder(tf.float32, [self.list_size, self.query_feature_size+self.job_feature_size])
        self.y = tf.placeholder(tf.float32, [self.list_size])
        # Create autoencoder network
        self._create_network()
        # Define loss function based variational upper-bound and 
        # corresponding optimizer
        self._create_loss_optimizer()
        
        # Initializing the tensor flow variables
        init = tf.global_variables_initializer()
        
        # Launch the session
        self.sess = tf.InteractiveSession()
        self.sess.run(init)
        self.saver = tf.train.Saver(max_to_keep=100)
    
    def _create_network(self):
        # encoder -> 把x encode成 z ->把z encode成mu,sigma 
        network_weights=self._initialize_weights()
        
        layer1=tf.add(tf.matmul(self.x,network_weights['weights']['Deep1']),network_weights['b']['Deep1'])
        layer1=tf.sigmoid(layer1)
        
        layer2=tf.add(tf.matmul(layer1,network_weights['weights']['Deep2']),network_weights['b']['Deep2'])
        layer2=tf.sigmoid(layer2)
        
        self.t=tf.add(tf.matmul(layer2,network_weights['weights']['out']),network_weights['b']['out'])
        
    def _initialize_weights(self):
        all_weights={
                "weights":{
                    "Deep1":tf.get_variable('C_W_1', shape=(self.query_feature_size+self.job_feature_size,10), initializer=tf.contrib.layers.xavier_initializer()),
                    "Deep2":tf.get_variable('C_W_2', shape=(10,5), initializer=tf.contrib.layers.xavier_initializer()) ,
                    "out":tf.get_variable('out_w', shape=(5,1), initializer=tf.contrib.layers.xavier_initializer()) 
                },
                "b":{
                    "Deep1":tf.get_variable('C_b_1', shape=(10), initializer=tf.contrib.layers.xavier_initializer()),
                    "Deep2":tf.get_variable('C_b_2', shape=(5), initializer=tf.contrib.layers.xavier_initializer()) ,
                    "out":tf.get_variable('out_b', shape=(1), initializer=tf.contrib.layers.xavier_initializer()) 
                }
            }
        return all_weights
    
    def _create_loss_optimizer(self):
        real_score=tf.divide(tf.exp(self.y),tf.reduce_sum(tf.exp(self.y)))
        model_score=tf.divide(tf.exp(self.t),tf.reduce_sum(tf.exp(self.t)))
        self.cost = -tf.reduce_sum(tf.multiply(real_score,tf.log(model_score)))
        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
    
    def partial_fit(self, X,Y):
        """Train model based on mini-batch of input data.

        Return cost of mini-batch.
        """
        #opt, cost = self.sess.run((self.optimizer, self.cost), 
    #                                   feed_dict={self.x: X})
        opt,cost=self.sess.run((self.optimizer,self.cost),{self.x:X,self.y:Y})
        return cost
    
    def restore_model(self,checkpoint):
        self.saver.restore(self.sess,checkpoint)
    
    def predict(self,List,query):
        _,x=get_variable(List,query)
        sort_list=self.sess.run(self.t,{self.x:x})
        sort_list=pd.DataFrame({"index":sort_list.flatten(),"joblist":List})
        return  sort_list.sort_values(by=['index'])
    
    def NDCG(self,List,query):
        y,x=get_variable(List,query)
        sort_list=self.sess.run(self.t,{self.x:x})
        sort_list=pd.DataFrame({"index":sort_list.flatten(),"joblist":List})
        sort_list=sort_list.sort_values(by=['index'])
        sort_true=pd.DataFrame({"index":y,"joblist":List})
        sort_true=sort_true.sort_values(by=['index'])
        sort_true['fake']=sort_list['joblist']
        DCG=pd.merge(sort_list,sort_true,on="joblist",how="left")
        print(DCG['index_y'])
        DCG=self.discount_rel(list(DCG['index_y']))
        IDCG=sorted(y,reverse=True)
        print(IDCG)
        IDCG=self.discount_rel(IDCG)
        return DCG/IDCG
        
    def discount_rel(self,lis):
        return np.sum([lis[i]/np.log2(i+2) for i in range(len(lis))])
        
    
        

In [76]:
def train( learning_rate=0.01,
          batch_size=2000, training_epochs=10, display_step=5):
    listnet = ListNet()
    # Training cycle
    
    for epoch in range(training_epochs):
        # Loop over all batches
        tStart = time.time()
        disp=0
        for i in Lis:
#             print(batch_xs.shape)
            # Fit training using batch data
            Y,X=get_variable(i['joblist'],i['querystring'])
            
            cost = listnet.partial_fit(X,Y)       
            disp+=1
            if disp%display_step*100==0:
                print("Epoch:", '[%04d]' % (epoch+1))
                print("cost=", "{:.9f}".format(cost)) 
                listnet.saver.save(listnet.sess, "checkpoints/i_l.ckpt")
    
                  
#         Display logs per epoch step
        if epoch % display_step*500 == 0:
            tEnd = time.time()
            
            print("Epoch:", '%04d' % (epoch+1), 
                  "cost=", "{:.9f}".format(avg_cost),
                  "time=",tEnd-tStart)
        
    return ListNet

In [77]:
train()

('Epoch:', '[0001]')
('cost=', '59.915786743')
('Epoch:', '[0001]')
('cost=', '59.916046143')


KeyboardInterrupt: 

In [78]:
lisnet=ListNet()

In [79]:
lisnet.restore_model(tf.train.latest_checkpoint('checkpoints'))

INFO:tensorflow:Restoring parameters from checkpoints/i_l.ckpt


In [80]:
lisnet.NDCG(Lis[0]['joblist'],Lis[0]['querystring'])

0     1.0
1     0.0
2     0.0
3     1.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    2.0
19    0.0
Name: index_y, dtype: float64
[2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


0.60475102714555273

In [82]:
for i in Lis[1:10]:
    print("NDCG:",lisnet.NDCG(i['joblist'],i['querystring']))

0     0.0
1     0.0
2     1.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
Name: index_y, dtype: float64
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
('NDCG:', 0.5)
0     0.0
1     0.0
2     1.0
3     0.0
4     1.0
5     1.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
Name: index_y, dtype: float64
[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
('NDCG:', 0.58334161051493316)
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     1.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
Name: index_y, dtype: float64
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0