In [8]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
seed = 1
import numpy as np
import random
np.random.seed(seed)
random.seed(seed)
import os 
import csv
import pickle
import time
import h5py
from collections import defaultdict
import tensorflow as tf
import json
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy.ma as ma

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

In [188]:
from dataloader import load_data, DataLoader, DataLoader_time
from parser import get_parser
from utils import norm, normalize, is_normalized_matrix, extract_data, save_args, load_args, \
    save_embeddings, load_embeddings, DataStruct, save_model_tf, save_best_tf, load_model_tf
from train import get_train_data
from logger import Logger
from model import init_params, crossentropy, choose_emb, choose_geo_loss, STSkipgram
from multiprocess_tools import multiprocess_compute_distance

In [13]:
args = get_parser(['--CITY', 'NYC', '--LOG_DIR', 'log_test', '--normalize_weight', '--WITH_TIME', '--WITH_GPS', '--WITH_TIMESTAMP', 
                   '--geo_reg_type', 'l2'])

In [219]:
origin_data, dicts = load_data(os.path.join(args.ROOT, 'data','{}_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk'.format(args.CITY) ))
args.vocabulary_size = dicts.vocabulary_size
data, idx = extract_data(origin_data, args)

loading data from /home/haibin2/data/checkins/data/NYC_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk
args.pattern: hand
indices setting : WITH_TIME normalize_weight WITH_GPS WITH_TIMESTAMP resume
indices: [0, 2, 3, 4, 5]


In [15]:
train_data = get_train_data(data)

Mode:both, size:(162302, 2, 5) size:(162302, 2, 5) total size:(324604, 2, 5)


In [251]:
def slice_func(mat, sizes):
    from functools import reduce
    indices = [0]+[reduce(lambda x,y:x+y, sizes[:i]) for i in range(1,len(sizes)+1)]
    tmp = list()
    for i in range(len(indices)-1):
        start, end = indices[i], indices[i+1]
        tmp.append(mat[:,start:end])
    return tmp

class STSkipgram(object):
    def __init__(self, args, pretrained_emb=None, pretrained_weight=None, pretrained_time=None):
        self.pretrained_emb   = init_params((args.vocabulary_size, args.sem_dim+args.geo_dim+args.free_dim), pretrained_emb)
        self.pretrained_weight= init_params((args.vocabulary_size, args.sem_dim+args.geo_dim+args.free_dim), pretrained_weight)
        self.pretrained_time  = init_params((args.n_timeslot, args.sem_dim), pretrained_time)
        
        # --Placeholders
        self.center_loc   = tf.placeholder(tf.int32, shape=[None], name='center_loc')
        self.label_loc    = tf.placeholder(tf.int32, shape=[None, 1], name='label_loc') #specific shape for sampled_softamx_loss
        self.weight_decay = tf.placeholder(tf.float32, shape=[None], name='weight_decay')
        self.coor_center  = tf.placeholder(tf.float32, shape=[None,2], name='coor_center')
        self.coor_label   = tf.placeholder(tf.float32, shape=[None,2], name='coor_label')
        self.label_t      = tf.placeholder(tf.int32, shape=[None], name='label_t')
        
        # --Weights
        slice_indices       = [args.sem_dim, args.geo_dim, args.free_dim]
        self.softmax_biases = tf.get_variable('bias', initializer=tf.zeros([args.vocabulary_size]), trainable=False)
        self.softmax_weights= tf.get_variable('weights', initializer=self.pretrained_weight)
        self.embeddings     = tf.get_variable('embeddings', initializer=self.pretrained_emb)
        self.time_embeddings= tf.get_variable('time_embeddings', initializer=self.pretrained_time)
        self.sem_emb, self.geo_emb, self.free_emb = slice_func(self.embeddings, slice_indices)
        self.sem_wht, self.geo_wht, self.free_wht = slice_func(self.softmax_weights, slice_indices)
        
        # --Retrive Embeddings
        self.main_emb, self.context_emb = choose_emb(args, emb=self.embeddings, weight=self.softmax_weights)
        self.emb_from_sem   = tf.nn.embedding_lookup(self.sem_emb, self.center_loc)
        self.emb_from_geo_x = tf.nn.embedding_lookup(self.geo_emb, self.center_loc)
        self.emb_from_geo_y = tf.nn.embedding_lookup(self.geo_emb, tf.squeeze(self.label_loc)) # due to the special shape of label_loc
        self.emb_from_whole = tf.nn.embedding_lookup(self.main_emb, self.center_loc)
        
        # --Skipgram loss
        self.skipgram_loss = tf.nn.sampled_softmax_loss(
            inputs=self.emb_from_whole, 
            weights=self.context_emb, 
            biases=self.softmax_biases, labels=self.label_loc, 
            num_sampled=args.num_negative_sample, num_classes=args.vocabulary_size)
        self.weighted_skipgram_loss = tf.reduce_mean(self.skipgram_loss*self.weight_decay)

        # --GEO regularizer
        self.euclidean_dis = tf.norm(self.coor_center-self.coor_label,ord='euclidean', axis=-1)
        self.euclidean_sim = tf.exp(-1*self.euclidean_dis*args.geo_temp) #rescale it to (0,1)
        cosine_sim = tf.reduce_mean(
            tf.multiply(tf.nn.l2_normalize(self.emb_from_geo_x, axis=-1),
                        tf.nn.l2_normalize(self.emb_from_geo_y, axis=-1)), axis=-1)
        self.cosine_sim = 0.5*(cosine_sim+1) #rescale it to (0,1)
        self.geo_loss_l2 = args.regulation_weight*tf.losses.mean_squared_error(labels=self.euclidean_sim, 
                                        predictions=self.cosine_sim)
        self.geo_loss_xn = args.regulation_weight*crossentropy(y=self.euclidean_sim, 
                                        p=self.cosine_sim)
        self.geo_loss = choose_geo_loss(args.geo_reg_type, loss_l2=self.geo_loss_l2, loss_xn=self.geo_loss_xn)
        
        # --Temporal regularizer
        emb_dot_time = tf.matmul(self.emb_from_sem, tf.transpose(self.time_embeddings))
        self.time_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.label_t, logits=emb_dot_time))
        
        # --Normalization
        self.normalize_geo_emb_op = tf.assign(self.geo_emb, tf.nn.l2_normalize(self.geo_emb, axis=1))
        self.normalize_sem_emb_op = tf.assign(self.sem_emb, tf.nn.l2_normalize(self.sem_emb, axis=1))
        self.normalize_geo_wht_op = tf.assign(self.geo_wht, tf.nn.l2_normalize(self.geo_wht, axis=1))
        self.normalize_sem_wht_op = tf.assign(self.sem_wht, tf.nn.l2_normalize(self.sem_wht, axis=1))
        
        # --Optimization
        global_step_g = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(args.lr, global_step_g, 5000, 0.5, staircase=True) 
        self.optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_t = self.optimizer.minimize(self.time_loss)
        self.train_skipgram = self.optimizer.minimize(self.skipgram_loss)
        self.train_geo = self.optimizer.minimize(self.geo_loss)

        # --Summaries
        self.trainable_params = tf.trainable_variables()
        self.all_params = tf.global_variables()

In [264]:
def update(losses, sk, geo, t):
    assert type(losses) is dict, 'losses is expected to be dict'
    losses['geo'].append(geo)
    losses['skipgram'].append(sk)
    losses['time'].append(t)
    return losses

def compute_weight_decay(t1, t2, temp):
    return np.exp(-1*((t1-t2)/60*temp)**2)

def evaluate(emb, evaluator):
    result = evaluator.evaluate(emb)
    evaluator.update_history(res_dict=result)
    evaluator.save_history()
    return result

def train(graph, sess, model, args, evaluator_emb, evaluator_weight, logger, dataloader, dataloader_time):
    save_args(args)
    losses = {'geo':[], 'skipgram':[], 'time':[]}
    n_batch = 0
    n_epoch = 0
    tick0 = time.time()
    
    best_criteria = BestCriteria(['{}_f1_{}'.format(mode, k) for mode in ['sub', 'root'] for k in [1,5,10]])
    with graph.as_default():
        saver = tf.train.Saver(model.all_params)
        if args.resume:
            sess = load_model_tf(saver, args, sess)
            evaluator.load_history(args)
        else:
            logger.renew_log_file()
            sess.run(tf.global_variables_initializer())
        logger.log('\nStart training')
        
        while dataloader.get_epoch() < args.num_epoch:
            if args.normalize_weight:
                _ = sess.run([model.normalize_geo_emb_op, model.normalize_sem_emb_op,
                              model.normalize_geo_wht_op, model.normalize_sem_wht_op])

            epoch_tick = time.time()
            emb, weight = sess.run([model.sem_emb, model.sem_wht])
            result_emb = evaluate(emb, evaluator_emb)
            result_weight = evaluate(weight, evaluator_weight)
            result = result_emb if args.main_emb == 'emb' else result_weight
            save_model_tf(saver, sess, args)
            if best_criteria.should_save(result):
                tmp = dict(result)
                tmp['epoch'] = n_epoch
                tmp['batch'] = n_batch
                save_best_tf(saver, sess, args, {'args':vars(args), 'result':tmp})
            #-- Optimization steps 
            while n_epoch >= dataloader.get_epoch():
                center, context = next(dataloader.dg)
                sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
                          {model.center_loc:center.ids, 
                           model.label_loc:context.ids.reshape(-1,1),
                           model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
                           model.coor_center:center.coors, 
                           model.coor_label:context.coors})
                
                loc, time_label = next(dataloader_time.dg)
                t_loss, _ = sess.run([model.time_loss, model.train_t],
                         {model.center_loc:loc, model.label_t:time_label})
                
                losses = update(losses, sk=sk_loss, geo=geo_loss, t=t_loss)
                
                if n_batch % 100 == 0:
                    losses = {k:np.mean(v) for k, v in losses.items()}
                    evaluator_emb.update_history(losses=losses)
                    evaluator_weight.update_history(losses=losses)
                    logstr = '[{}] LOSS '.format(n_batch) + "".join(['{} : {:.6f} '.format(k, v) for k, v in losses.items()])
                    losses = {'geo':[], 'skipgram':[], 'time':[]}
                    logger.log(logstr)
                    
                n_batch += 1
            #-----------------------
            n_epoch += 1
            logstr = '#'*50+'\n'
            logstr += 'Ecpoh {}, used time: {}, eval: {}'.format(n_epoch, time.time()-epoch_tick, result)
            logger.log(logstr)
    logger.log('FINISH, USED TIME:{}'.format(time.time()-tick0))
    return sess

In [266]:
# args.time_temp = 0.01
# args.geo_temp = 10
# args.main_emb = 'emb'
# args.regulation_weight = 10
# args.num_epoch = 30
# args.resume = False
# args.n_processes = 3
# args.batch_size = 256

In [255]:
sess.close()

In [265]:
dataloader = DataLoader(train_data, args)
dataloader_time = DataLoader_time(data, args, idx)
evaluator_emb = Evaluator(args, dicts, mode='emb')
evaluator_weight = Evaluator(args, dicts, mode='weight')
logger = Logger(os.path.join(args.LOG_DIR, 'log_txt'))

graph = tf.Graph()
with graph.as_default():
    model = STSkipgram(args)
    sess = tf.Session(graph=graph, config=config)
state = train(graph, sess, model, args, evaluator_emb, evaluator_weight, logger, dataloader, dataloader_time)

Saved args to log_test/args.json

Start training
eval distance
Job Done, used time 2.911085367202759
Job Done, used time 1.6089904308319092
eval translation
saved history to log_test/emb_history.pk
eval distance
Job Done, used time 2.7766852378845215
Job Done, used time 1.6020898818969727
eval translation
saved history to log_test/weight_history.pk
Saved model to log_test/saved/model.ckpt
Saved BEST model to log_test/best/model.ckpt
[0] LOSS geo : 1.463893 time : 1.925902 skipgram : 2.332961 
[100] LOSS geo : 1.407218 time : 26.781513 skipgram : 7.538828 
[200] LOSS geo : 1.415469 time : 25.976307 skipgram : 8.395305 
[300] LOSS geo : 1.404142 time : 25.259361 skipgram : 7.223943 
[400] LOSS geo : 1.408293 time : 22.245857 skipgram : 6.958654 
[500] LOSS geo : 1.399266 time : 19.082109 skipgram : 7.394409 
[600] LOSS geo : 1.392944 time : 16.423002 skipgram : 6.208087 


KeyboardInterrupt: 

In [116]:
# with graph.as_default():
#     sess = tf.Session(graph=graph, config=config)
#     sess.run(tf.global_variables_initializer())
#     center, context = next(dataloader.dg)
#     sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
#                           {model.center_loc:center.ids, 
#                            model.label_loc:context.ids.reshape(-1,1),
#                            model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
#                            model.coor_center:center.coors, 
#                            model.coor_label:context.coors})

#     loc, time_label = next(dataloader_time.dg)
#     t_loss, _ = sess.run([model.time_loss, model.train_t],
#                          {model.center_loc:loc, model.label_t:time_label})