In [8]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import numpy as np
import random

# Single seed shared by every random number generator seeded in this notebook.
seed = 1
# Seed both NumPy's global generator and the stdlib generator.
np.random.seed(seed)
random.seed(seed)
import os 
import csv
import pickle
import time
import h5py
from collections import defaultdict
import tensorflow as tf
import json
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy.ma as ma

# Restrict this process to the GPU with index 1.
os.environ.update(CUDA_VISIBLE_DEVICES='1')
# Session configuration shared by every tf.Session created below:
# allow_growth is switched on in the GPU options.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

In [188]:
from dataloader import load_data, DataLoader, DataLoader_time
from parser import get_parser
from utils import norm, normalize, is_normalized_matrix, extract_data, save_args, load_args, \
    save_embeddings, load_embeddings, DataStruct, save_model_tf, save_best_tf, load_model_tf
from train import get_train_data
from logger import Logger
from model import init_params, crossentropy, choose_emb, choose_geo_loss, STSkipgram
from multiprocess_tools import multiprocess_compute_distance

In [13]:
# Build the run configuration from an explicit argument list: city, log
# directory, the feature flags to enable and the geo regularisation type.
# NOTE(review): the exact meaning of each flag is defined in parser.get_parser — confirm there.
args = get_parser(['--CITY', 'NYC', '--LOG_DIR', 'log_test', '--normalize_weight', '--WITH_TIME', '--WITH_GPS', '--WITH_TIMESTAMP', 
                   '--geo_reg_type', 'l2'])

In [219]:
# Load the preprocessed check-in pickle for the configured city from <ROOT>/data.
origin_data, dicts = load_data(
    os.path.join(
        args.ROOT,
        'data',
        '{}_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk'.format(args.CITY),
    )
)
# Copy the vocabulary size from the loaded dictionaries onto the run configuration
# before extracting the columns selected by `args`.
args.vocabulary_size = dicts.vocabulary_size
data, idx = extract_data(origin_data, args)

loading data from /home/haibin2/data/checkins/data/NYC_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk
args.pattern: hand
indices setting : WITH_TIME normalize_weight WITH_GPS WITH_TIMESTAMP resume
indices: [0, 2, 3, 4, 5]


In [15]:
# Build the training array from the extracted data; it is handed to DataLoader further down.
train_data = get_train_data(data)

Mode:both, size:(162302, 2, 5) size:(162302, 2, 5) total size:(324604, 2, 5)


In [206]:
class BestCriteria:
    """Track the best mean score seen so far over a fixed set of metrics.

    Attributes:
        metrics: iterable of keys looked up in every result mapping.
        best_score: highest mean score observed so far (starts at 0).
    """

    def __init__(self, metrics):
        self.metrics = metrics
        self.best_score = 0

    def should_save(self, result, ):
        """Return True, and remember the score, when `result` beats the best so far.

        The score is the mean of `result[m]` over the tracked metrics; only a
        strictly greater score counts as an improvement.
        """
        score = np.mean([result[name] for name in self.metrics])
        improved = score > self.best_score
        if not improved:
            return False
        self.best_score = score
        return True

In [217]:
def update(losses, sk, geo, t):
    """Append one batch's loss values to the running per-loss lists.

    Args:
        losses: dict (or dict subclass such as defaultdict) holding list-like
            values under the keys 'geo', 'skipgram' and 'time'.
        sk: skip-gram loss value, appended to losses['skipgram'].
        geo: geo loss value, appended to losses['geo'].
        t: time loss value, appended to losses['time'].

    Returns:
        The same `losses` object, mutated in place.

    Raises:
        AssertionError: if `losses` is not a dict.
    """
    # isinstance instead of an exact type check so dict subclasses are accepted too.
    assert isinstance(losses, dict), 'losses is expected to be dict'
    losses['geo'].append(geo)
    losses['skipgram'].append(sk)
    losses['time'].append(t)
    return losses

def compute_weight_decay(t1, t2, temp):
    """Return exp(-((t1 - t2) / 60 * temp) ** 2).

    The weight is 1 when the two timestamps coincide and shrinks towards 0 as
    their gap grows; `temp` scales how quickly it shrinks. Works element-wise
    on NumPy arrays as well as on scalars.
    NOTE(review): the division by 60 looks like a seconds-to-minutes
    conversion — confirm the timestamp unit.
    """
    scaled_gap = (t1 - t2) / 60 * temp
    return np.exp(-1 * scaled_gap ** 2)

def evaluate(emb, evaluator):
    """Score `emb` with `evaluator`, record the scores in its history and persist it.

    Args:
        emb: embeddings passed straight to `evaluator.evaluate`.
        evaluator: object providing `evaluate`, `update_history` and `save_history`.

    Returns:
        Whatever `evaluator.evaluate(emb)` returned.
    """
    metrics = evaluator.evaluate(emb)
    # Record the new scores first, then write the history out.
    evaluator.update_history(res_dict=metrics)
    evaluator.save_history()
    return metrics

def _evaluate_and_checkpoint(sess, model, saver, args, evaluator, evaluator_weight,
                             best_criteria, n_epoch, n_batch):
    """Evaluate the current embeddings, save the latest checkpoint and, on improvement, the best one."""
    if evaluator_weight is None:
        emb = sess.run(model.normalized_emb)
    else:
        emb, weight = sess.run([model.normalized_emb, model.normalized_weight])
    # The result of the main evaluator drives the "best model" decision.
    result = evaluate(emb, evaluator)
    if evaluator_weight is not None:
        evaluate(weight, evaluator_weight)
    save_model_tf(saver, sess, args)
    if best_criteria.should_save(result):
        best_info = dict(result)
        best_info['epoch'] = n_epoch
        best_info['batch'] = n_batch
        save_best_tf(saver, sess, args, {'args': vars(args), 'result': best_info})
    return result


def train(graph, sess, model, args, evaluator, logger, dataloader, dataloader_time,
          evaluator_weight=None):
    """Train `model` epoch by epoch, evaluating and checkpointing before every epoch
    and once more after the last one.

    Args:
        graph: tf.Graph that `model` was built in.
        sess: tf.Session bound to `graph`; replaced by the return value of
            `load_model_tf` when `args.resume` is set.
        model: model object exposing the ops and placeholders used below
            (`all_params`, `normalize_*_op`, `normalized_emb`, losses, train ops, ...).
        args: run configuration; this function reads `resume`, `num_epoch`,
            `normalize_weight` and `time_temp`.
        evaluator: evaluator applied to `model.normalized_emb`; its result decides
            whether the "best" checkpoint is overwritten.
        logger: object providing `log` and `renew_log_file`.
        dataloader: provides `get_epoch()` and a generator `dg` yielding
            (center, context) batches with `ids`, `timestmp` and `coors` attributes.
        dataloader_time: provides a generator `dg` yielding (loc, time_label) batches.
        evaluator_weight: optional second evaluator applied to
            `model.normalized_weight`; skipped when None.
            NOTE(review): earlier revisions read `evaluator_emb`, `evaluator_weight`
            and `result` from notebook globals — confirm that `evaluator` is the
            intended evaluator for the embedding matrix.

    Returns:
        The session used for training.
    """
    save_args(args)
    losses = {'geo':[], 'skipgram':[], 'time':[]}
    n_batch = 0
    n_epoch = 0
    tick0 = time.time()

    # A checkpoint counts as "best" when the mean of sub/root F1 at k = 1, 5, 10 improves.
    best_criteria = BestCriteria(['{}_f1_{}'.format(mode, k) for mode in ['sub', 'root'] for k in [1,5,10]])
    with graph.as_default():
        saver = tf.train.Saver(model.all_params)
        if args.resume:
            sess = load_model_tf(saver, args, sess)
            evaluator.load_history(args)
        else:
            logger.renew_log_file()
            sess.run(tf.global_variables_initializer())
        logger.log('\nStart training')

        while dataloader.get_epoch() < args.num_epoch:
            if args.normalize_weight:
                sess.run([model.normalize_geo_op, model.normalize_sem_op])

            epoch_tick = time.time()
            result = _evaluate_and_checkpoint(sess, model, saver, args, evaluator, evaluator_weight,
                                              best_criteria, n_epoch, n_batch)
            #-- Optimization steps
            # Keep stepping until the dataloader reports that it moved on to the next epoch.
            while n_epoch >= dataloader.get_epoch():
                center, context = next(dataloader.dg)
                sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
                          {model.center_loc:center.ids,
                           model.label_loc:context.ids.reshape(-1,1),
                           model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
                           model.coor_center:center.coors,
                           model.coor_label:context.coors})

                loc, time_label = next(dataloader_time.dg)
                t_loss, _ = sess.run([model.time_loss, model.train_t],
                         {model.center_loc:loc, model.label_t:time_label})

                losses = update(losses, sk=sk_loss, geo=geo_loss, t=t_loss)

                # Every 100 batches: log the mean of the losses collected since the last report.
                if n_batch % 100 == 0:
                    losses = {k:np.mean(v) for k, v in losses.items()}
                    evaluator.update_history(losses=losses)
                    logstr = '[{}] LOSS '.format(n_batch) + "".join(['{} : {:.6f} '.format(k, v) for k, v in losses.items()])
                    losses = {'geo':[], 'skipgram':[], 'time':[]}
                    logger.log(logstr)

                n_batch += 1
            #-----------------------
            n_epoch += 1
            logstr = '#'*50+'\n'
            # NOTE(review): 'Ecpoh' is a typo for 'Epoch'; kept byte-for-byte so existing
            # logs and anything parsing them stay consistent. `result` here is the
            # evaluation taken at the START of this epoch.
            logstr += 'Ecpoh {}, used time: {}, eval: {}'.format(n_epoch, time.time()-epoch_tick, result)
            logger.log(logstr)

        # The loop only evaluates/saves at the top of an epoch, so without this step the
        # parameters learned during the last epoch would never be normalised, scored or saved.
        if n_epoch > 0:
            if args.normalize_weight:
                sess.run([model.normalize_geo_op, model.normalize_sem_op])
            result = _evaluate_and_checkpoint(sess, model, saver, args, evaluator, evaluator_weight,
                                              best_criteria, n_epoch, n_batch)
            logger.log('Final eval after {} epochs: {}'.format(n_epoch, result))
    print('FINISH, USED TIME:{}'.format(time.time()-tick0))
    return sess

In [213]:
# Hyper-parameters for this run, applied on top of the parsed arguments.
for _option, _value in (
    ('time_temp', 0.01),        # passed as `temp` to compute_weight_decay inside train()
    ('geo_temp', 10),
    ('main_emb', 'emb'),
    ('regulation_weight', 10),
    ('num_epoch', 10),          # train() loops until the dataloader reaches this epoch
    ('resume', True),           # train() restores the saved checkpoint instead of re-initialising
    ('n_processes', 3),
    ('batch_size', 256),
):
    setattr(args, _option, _value)
del _option, _value

In [202]:
# Release the TensorFlow session left over from a previous run of the training
# cell. On a fresh kernel `sess` does not exist yet, so there is nothing to close.
if globals().get('sess') is not None:
    sess.close()

In [220]:
# -- Data feeds, evaluation and logging ---------------------------------------
dataloader = DataLoader(train_data, args)
dataloader_time = DataLoader_time(data, args, idx)
# NOTE(review): `Evaluator` is not imported in any visible cell — confirm which module provides it.
evaluator = Evaluator(args, dicts)
logger = Logger(os.path.join(args.LOG_DIR, 'log_txt'))

# -- The model is built in its own graph; the session is bound to that graph ---
graph = tf.Graph()
with graph.as_default():
    model = STSkipgram(args)
sess = tf.Session(graph=graph, config=config)

# train() returns the session it trained with.
state = train(
    graph=graph,
    sess=sess,
    model=model,
    args=args,
    evaluator=evaluator,
    logger=logger,
    dataloader=dataloader,
    dataloader_time=dataloader_time,
)

Saved args to log_test/args.json
INFO:tensorflow:Restoring parameters from log_test/saved/model.ckpt
load history from log_test/history.pk

Start training
eval distance
Job Done, used time 3.0486044883728027
Job Done, used time 1.8326241970062256
eval translation
saved history to log_test/history.pk
Saved model to log_test/saved/model.ckpt
Saved BEST model to log_test/best/model.ckpt
[0] LOSS geo : 1.316595 time : 1.625507 skipgram : 0.740908 
[100] LOSS geo : 1.384676 time : 1.636829 skipgram : 0.559827 
[200] LOSS geo : 1.388073 time : 1.627023 skipgram : 0.546426 
[300] LOSS geo : 1.387686 time : 1.631028 skipgram : 0.540374 
[400] LOSS geo : 1.400061 time : 1.631873 skipgram : 0.567788 
[500] LOSS geo : 1.389661 time : 1.631391 skipgram : 0.551011 
[600] LOSS geo : 1.383934 time : 1.632436 skipgram : 0.569020 
[700] LOSS geo : 1.393086 time : 1.632557 skipgram : 0.562930 
[800] LOSS geo : 1.400099 time : 1.626283 skipgram : 0.547292 
[900] LOSS geo : 1.389113 time : 1.632758 skipgr

[6200] LOSS geo : 1.390137 time : 1.619338 skipgram : 0.546027 
[6300] LOSS geo : 1.393902 time : 1.626609 skipgram : 0.511090 
##################################################
Ecpoh 5, used time: 15.811672449111938, eval: {'sub_precision_5': 0.1194, 'root_recall_5': 0.1342, 'root_accuracy_5': 0.676, 'sub_precision_10': 0.1083, 'root_f1_5': 0.17893333333333333, 'sub_recall_10': 0.1083, 'sub_accuracy_5': 0.346, 'sub_recall_1': 0.0139, 'root_recall_1': 0.0302, 'sub_f1_1': 0.02527272727272727, 'root_accuracy_10': 0.826, 'sub_f1_5': 0.0796, 'root_recall_10': 0.2481, 'sub_f1_10': 0.1083, 'root_f1_1': 0.054909090909090914, 'root_precision_1': 0.302, 'root_accuracy_1': 0.302, 'sub_recall_5': 0.0597, 'sub_accuracy_10': 0.488, 'sub_accuracy_1': 0.139, 't': 5.410040378570557, 'root_f1_10': 0.2481, 'root_precision_5': 0.2684, 'sub_precision_1': 0.139, 'root_precision_10': 0.2481}
eval distance
Job Done, used time 2.992262601852417
Job Done, used time 1.7296369075775146
eval translation
saved hi

[11600] LOSS geo : 1.393987 time : 1.608672 skipgram : 0.510544 
[11700] LOSS geo : 1.400494 time : 1.608647 skipgram : 0.504305 
[11800] LOSS geo : 1.385059 time : 1.606049 skipgram : 0.493587 
[11900] LOSS geo : 1.386119 time : 1.606969 skipgram : 0.511391 
[12000] LOSS geo : 1.389528 time : 1.617905 skipgram : 0.485967 
[12100] LOSS geo : 1.399050 time : 1.612061 skipgram : 0.512836 
[12200] LOSS geo : 1.391217 time : 1.608590 skipgram : 0.506135 
[12300] LOSS geo : 1.391301 time : 1.601239 skipgram : 0.513996 
[12400] LOSS geo : 1.387529 time : 1.612327 skipgram : 0.493376 
[12500] LOSS geo : 1.386926 time : 1.610697 skipgram : 0.489626 
[12600] LOSS geo : 1.390143 time : 1.611825 skipgram : 0.499168 
##################################################
Ecpoh 10, used time: 16.193974018096924, eval: {'sub_precision_5': 0.1208, 'root_recall_5': 0.1314, 'root_accuracy_5': 0.681, 'sub_precision_10': 0.1063, 'root_f1_5': 0.17519999999999997, 'sub_recall_10': 0.1063, 'sub_accuracy_5': 0.3

KeyboardInterrupt: 

In [116]:
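# Disabled smoke test (kept for reference): runs one skip-gram/geo step and one time step on a freshly initialised session.
# with graph.as_default():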
#     sess = tf.Session(graph=graph, config=config)
#     sess.run(tf.global_variables_initializer())
#     center, context = next(dataloader.dg)
#     sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
#                           {model.center_loc:center.ids, 
#                            model.label_loc:context.ids.reshape(-1,1),
#                            model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
#                            model.coor_center:center.coors, 
#                            model.coor_label:context.coors})

#     loc, time_label = next(dataloader_time.dg)
#     t_loss, _ = sess.run([model.time_loss, model.train_t],
#                          {model.center_loc:loc, model.label_t:time_label})