In [8]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Fix the seed of both NumPy's and Python's global RNGs so data shuffling and
# sampling done through them is repeatable between runs.
# NOTE(review): TensorFlow's own graph-level seed is not set in this cell, so
# TF-side randomness is presumably not covered — confirm whether the model sets it.
seed = 1
import numpy as np
import random
np.random.seed(seed)
random.seed(seed)
import os 
import csv
import pickle
import time
import h5py
from collections import defaultdict
import tensorflow as tf
import json
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy.ma as ma

# Restrict the process to the GPU with index 1. This is set before any
# tf.Session is created in this notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Session configuration reused by every tf.Session below; allow_growth asks
# TensorFlow to grow its GPU memory allocation on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True

In [None]:
from utils import load_model

In [188]:
from dataloader import load_data, DataLoader, DataLoader_time
from parser import get_parser
from utils import norm, normalize, is_normalized_matrix, extract_data, save_args, load_args, \
    save_embeddings, load_embeddings, DataStruct, save_model_tf, save_best_tf, load_model_tf
from train import get_train_data
from logger import Logger
from model import init_params, crossentropy, choose_emb, choose_geo_loss, STSkipgram
from multiprocess_tools import multiprocess_compute_distance

In [13]:
# Build the run configuration by handing get_parser an explicit argv-style
# list of flags instead of reading the real command line.
args = get_parser(['--CITY', 'NYC', '--LOG_DIR', 'log_test', '--normalize_weight', '--WITH_TIME', '--WITH_GPS', '--WITH_TIMESTAMP', 
                   '--geo_reg_type', 'l2'])

In [14]:
# Load the pickled data set for the configured city from <args.ROOT>/data.
origin_data, dicts = load_data(os.path.join(args.ROOT, 'data','{}_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk'.format(args.CITY) ))
# The vocabulary size is taken from the loaded dictionaries and stored on args.
args.vocabulary_size = dicts.vocabulary_size
# extract_data returns the data used below together with `idx`, which is later
# passed to DataLoader_time.
data, idx = extract_data(origin_data, args)

loading data from /home/haibin2/data/checkins/data/NYC_INTV_processed_voc5_len2_setting_WITH_GPS_WITH_TIME_WITH_USERID.pk
args.pattern: hand
indices setting : WITH_TIME normalize_weight WITH_GPS WITH_TIMESTAMP
indices: [0, 2, 3, 4, 5]


In [15]:
# Derive the training set consumed by DataLoader from the extracted data.
train_data = get_train_data(data)

Mode:both, size:(162302, 2, 5) size:(162302, 2, 5) total size:(324604, 2, 5)


In [206]:
class BestCriteria:
    """Remember the best mean score seen so far over a fixed set of metrics.

    ``metrics`` is an iterable of keys that are looked up in every result
    passed to ``should_save``. The running best starts at 0.
    """

    def __init__(self, metrics):
        self.best_score = 0
        self.metrics = metrics

    def should_save(self, result):
        """Return True (and record the score) when ``result`` beats the best.

        The score is the arithmetic mean of ``result[m]`` for every tracked
        metric ``m``; only a strictly greater score counts as an improvement.
        """
        current = np.mean([result[name] for name in self.metrics])
        if not current > self.best_score:
            return False
        self.best_score = current
        return True

In [205]:
def update(losses, sk, geo, t):
    """Append one batch's loss values to the running loss lists.

    ``losses`` must be a plain dict holding lists under the keys 'geo',
    'skipgram' and 'time'. The dict is modified in place and also returned.
    """
    assert type(losses) is dict, 'losses is expected to be dict'
    for key, value in (('geo', geo), ('skipgram', sk), ('time', t)):
        losses[key].append(value)
    return losses

def compute_weight_decay(t1, t2, temp):
    """Return a Gaussian-shaped weight for the gap between two timestamps.

    Computes ``exp(-((t1 - t2) / 60 * temp) ** 2)``: the gap is divided by 60,
    scaled by ``temp``, squared and negated before exponentiation. The result
    is 1 for equal timestamps and decreases as the gap grows. Works
    element-wise on NumPy arrays.
    """
    scaled_gap = (t1 - t2) / 60 * temp
    return np.exp(-(scaled_gap ** 2))

def _evaluate_and_checkpoint(sess, model, saver, evaluator, best_criteria, n_epoch, n_batch):
    """Evaluate the current weights, record the result and write checkpoints.

    Runs ``evaluator.evaluate``, adds the result to the evaluator history and
    persists that history, then saves the regular checkpoint. When
    ``best_criteria`` reports an improvement, the "best" checkpoint is saved
    too, together with the run arguments and the epoch/batch counters.

    Reads the notebook-level ``args`` namespace, like ``train`` does.

    Returns:
        The result object produced by ``evaluator.evaluate``.
    """
    result = evaluator.evaluate(model, sess)
    evaluator.update_history(res_dict=result)
    evaluator.save_history()
    save_model_tf(saver, sess, args)
    if best_criteria.should_save(result):
        best_info = dict(result)
        best_info['epoch'] = n_epoch
        best_info['batch'] = n_batch
        save_best_tf(saver, sess, args, {'args':vars(args), 'result':best_info})
    return result


def train(graph, sess, model, evaluator, logger, dataloader, dataloader_time):
    """Train ``model`` until ``dataloader`` reports ``args.num_epoch`` epochs.

    Configuration comes from the notebook-level ``args`` namespace
    (``resume``, ``num_epoch``, ``normalize_weight``, ``time_temp``).

    Per epoch:
      1. optionally run the model's two normalisation ops,
      2. evaluate and checkpoint the current weights,
      3. pull batches from ``dataloader.dg`` / ``dataloader_time.dg`` and run
         the skip-gram + geo step followed by the time step, until
         ``dataloader.get_epoch()`` moves past the local epoch counter,
      4. log averaged losses every 100 batches.

    After the last epoch the weights are normalised (if enabled), evaluated
    and checkpointed once more. Previously evaluation and saving only ran at
    the *start* of an epoch, so the weights produced by the final epoch were
    never evaluated or written to disk.

    Args:
        graph: ``tf.Graph`` that owns ``model``.
        sess: ``tf.Session`` bound to ``graph``; replaced by the value
            returned from ``load_model_tf`` when ``args.resume`` is set.
        model: object exposing the losses, train ops and placeholders used
            in the ``sess.run`` calls below.
        evaluator: provides ``evaluate``, ``update_history``,
            ``save_history`` and ``load_history``.
        logger: provides ``log`` and ``renew_log_file``.
        dataloader: provides ``get_epoch()`` and a ``dg`` iterator yielding
            ``(center, context)`` pairs.
        dataloader_time: provides a ``dg`` iterator yielding
            ``(loc, time_label)`` pairs.

    Returns:
        The session that holds the trained weights.
    """
    save_args(args)
    losses = {'geo':[], 'skipgram':[], 'time':[]}
    n_batch = 0
    n_epoch = 0
    tick0 = time.time()

    # The "best" checkpoint is chosen by the mean of the six F1 metrics.
    best_criteria = BestCriteria(['{}_f1_{}'.format(mode, k) for mode in ['sub', 'root'] for k in [1,5,10]])
    with graph.as_default():
        saver = tf.train.Saver(model.all_params)
        if args.resume:
            sess = load_model_tf(saver, args, sess)
            evaluator.load_history(args)
        else:
            logger.renew_log_file()
            sess.run(tf.global_variables_initializer())
        logger.log('\nStart training')

        while dataloader.get_epoch() < args.num_epoch:
            if args.normalize_weight:
                sess.run([model.normalize_geo_op, model.normalize_sem_op])

            # The epoch timer deliberately includes the evaluation below.
            epoch_tick = time.time()
            result = _evaluate_and_checkpoint(sess, model, saver, evaluator,
                                              best_criteria, n_epoch, n_batch)

            # The data loader advances its own epoch counter; keep pulling
            # batches until it has moved past the epoch being trained.
            while n_epoch >= dataloader.get_epoch():
                center, context = next(dataloader.dg)
                sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
                          {model.center_loc:center.ids,
                           model.label_loc:context.ids.reshape(-1,1),
                           model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
                           model.coor_center:center.coors,
                           model.coor_label:context.coors})

                loc, time_label = next(dataloader_time.dg)
                t_loss, _ = sess.run([model.time_loss, model.train_t],
                         {model.center_loc:loc, model.label_t:time_label})

                losses = update(losses, sk=sk_loss, geo=geo_loss, t=t_loss)

                if n_batch % 100 == 0:
                    # Report the mean of everything accumulated since the
                    # previous report, then start a fresh accumulation window.
                    mean_losses = {k:np.mean(v) for k, v in losses.items()}
                    evaluator.update_history(losses=mean_losses)
                    logstr = '[{}] LOSS '.format(n_batch) + "".join(['{} : {:.6f} '.format(k, v) for k, v in mean_losses.items()])
                    losses = {'geo':[], 'skipgram':[], 'time':[]}
                    logger.log(logstr)

                n_batch += 1

            n_epoch += 1
            # NOTE: `result` is the evaluation taken *before* this epoch's
            # updates. The 'Ecpoh' spelling is kept so existing logs and any
            # log parsers stay consistent.
            logstr = '#'*50+'\n'
            logstr += 'Ecpoh {}, used time: {}, eval: {}'.format(n_epoch, time.time()-epoch_tick, result)
            logger.log(logstr)

        # Final snapshot: evaluate and save the weights produced by the last
        # epoch. Skipped when no epoch was trained in this call.
        if n_epoch > 0:
            if args.normalize_weight:
                sess.run([model.normalize_geo_op, model.normalize_sem_op])
            result = _evaluate_and_checkpoint(sess, model, saver, evaluator,
                                              best_criteria, n_epoch, n_batch)
            logger.log('Final eval after {} epoch(s): {}'.format(n_epoch, result))
    print('FINISH, USED TIME:{}'.format(time.time()-tick0))
    return sess

In [201]:
# Per-run overrides applied on top of the parsed arguments.
args.time_temp = 0.01          # scale passed to compute_weight_decay for timestamp gaps
args.geo_temp = 10             # presumably a temperature for the geo loss — TODO confirm in model.py
args.main_emb = 'emb'          # presumably selects which embedding is evaluated — TODO confirm
args.regulation_weight = 10    # presumably the weight of the geo regularisation term — TODO confirm
args.num_epoch = 20            # train() stops once the data loader reports this many epochs
args.resume = False            # False: train() re-initialises variables and renews the log file
args.n_processes = 3           # presumably the worker count for multiprocess evaluation — TODO confirm
args.batch_size = 256          # presumably read by DataLoader — TODO confirm

In [202]:
# Release the session left over from a previous run of the training cell.
# On a fresh kernel `sess` does not exist yet (it is only created in the cell
# below), so guard the call instead of failing with NameError when the
# notebook is executed top to bottom.
if 'sess' in globals():
    sess.close()

In [207]:
# Batch sources consumed by train(): `dataloader` feeds the skip-gram/geo step,
# `dataloader_time` feeds the time step.
dataloader = DataLoader(train_data, args)
dataloader_time = DataLoader_time(data, args, idx)
# NOTE(review): `Evaluator` is not imported in any cell visible here; on a
# fresh kernel this line would raise NameError — add the import to the import cell.
evaluator = Evaluator(args, dicts, valid_ids=range(1000))
logger = Logger(os.path.join(args.LOG_DIR, 'log_txt'))

# Build the model inside its own graph and open a session bound to that graph,
# using the shared session `config` defined at the top of the notebook.
graph = tf.Graph()
with graph.as_default():
    model = STSkipgram(args)
    sess = tf.Session(graph=graph, config=config)
# train() returns the session it finished with, so `state` refers to a tf.Session.
state = train(graph, sess, model, evaluator, logger, dataloader, dataloader_time)

Saved args to log_test/args.json

Start training
eval distance
Job Done, used time 2.977628469467163
Job Done, used time 1.6788504123687744
eval translation
saved history to log_test/history.pk
Saved model to log_test/saved/model.ckpt
Saved BEST model to log_test/best/model.ckpt
[0] LOSS geo : 1.367862 time : 1.872306 skipgram : 1.773618 
[100] LOSS geo : 1.450527 time : 1.853407 skipgram : 6.813413 
[200] LOSS geo : 1.552121 time : 1.807023 skipgram : 16.308954 
[300] LOSS geo : 1.431290 time : 1.816345 skipgram : 23.572285 
[400] LOSS geo : 1.329821 time : 1.813184 skipgram : 22.737238 
[500] LOSS geo : 1.326245 time : 1.765638 skipgram : 71.423325 
[600] LOSS geo : 1.353521 time : 1.638035 skipgram : 30.261635 
[700] LOSS geo : 1.402532 time : 1.671341 skipgram : 21.668076 
[800] LOSS geo : 1.508805 time : 1.670709 skipgram : 10.645128 
[900] LOSS geo : 1.523472 time : 1.679934 skipgram : 32.756912 
[1000] LOSS geo : 1.353605 time : 1.673965 skipgram : 29.318632 
[1100] LOSS geo : 1

[6200] LOSS geo : 1.309812 time : 1.649625 skipgram : 0.912825 
[6300] LOSS geo : 1.341093 time : 1.677067 skipgram : 0.906518 
##################################################
Ecpoh 5, used time: 16.260549545288086, eval: {'sub_precision_5': 0.1096, 'root_recall_5': 0.1204, 'root_accuracy_5': 0.631, 'sub_precision_10': 0.0948, 'root_f1_5': 0.16053333333333333, 'sub_recall_10': 0.0948, 'sub_accuracy_5': 0.328, 'sub_recall_1': 0.0122, 'root_recall_1': 0.0262, 'sub_f1_1': 0.022181818181818184, 'root_accuracy_10': 0.804, 'sub_f1_5': 0.07306666666666668, 'root_recall_10': 0.2223, 'sub_f1_10': 0.09480000000000001, 'root_f1_1': 0.047636363636363636, 'root_precision_1': 0.262, 'root_accuracy_1': 0.262, 'sub_recall_5': 0.0548, 'sub_accuracy_10': 0.448, 'sub_accuracy_1': 0.122, 't': 5.662145137786865, 'root_f1_10': 0.22229999999999997, 'root_precision_5': 0.2408, 'sub_precision_1': 0.122, 'root_precision_10': 0.2223}
eval distance
Job Done, used time 3.2195918560028076
Job Done, used time 1.6

saved history to log_test/history.pk
Saved model to log_test/saved/model.ckpt
Saved BEST model to log_test/best/model.ckpt
[11500] LOSS geo : 1.411811 time : 1.650933 skipgram : 0.671819 
[11600] LOSS geo : 1.533785 time : 1.651625 skipgram : 0.796432 
[11700] LOSS geo : 1.421539 time : 1.676899 skipgram : 0.723854 
[11800] LOSS geo : 1.314516 time : 1.651360 skipgram : 0.767700 
[11900] LOSS geo : 1.306157 time : 1.600511 skipgram : 0.700184 
[12000] LOSS geo : 1.343084 time : 1.661652 skipgram : 0.686086 
[12100] LOSS geo : 1.389727 time : 1.669860 skipgram : 0.670302 
[12200] LOSS geo : 1.489134 time : 1.660616 skipgram : 0.695739 
[12300] LOSS geo : 1.502489 time : 1.595343 skipgram : 0.776726 
[12400] LOSS geo : 1.352855 time : 1.647509 skipgram : 0.699443 
[12500] LOSS geo : 1.284786 time : 1.652278 skipgram : 0.705902 
[12600] LOSS geo : 1.363420 time : 1.675840 skipgram : 0.597233 
##################################################
Ecpoh 10, used time: 16.27995991706848, eval: 

Job Done, used time 3.013631582260132
Job Done, used time 1.5664775371551514
eval translation
saved history to log_test/history.pk
Saved model to log_test/saved/model.ckpt
[17800] LOSS geo : 1.381677 time : 1.646526 skipgram : 0.593355 
[17900] LOSS geo : 1.489041 time : 1.649369 skipgram : 0.623378 
[18000] LOSS geo : 1.504754 time : 1.671501 skipgram : 0.664308 
[18100] LOSS geo : 1.356409 time : 1.643053 skipgram : 0.634468 
[18200] LOSS geo : 1.280991 time : 1.598022 skipgram : 0.658915 
[18300] LOSS geo : 1.365155 time : 1.658472 skipgram : 0.555620 
[18400] LOSS geo : 1.293086 time : 1.661452 skipgram : 0.606963 
[18500] LOSS geo : 1.510952 time : 1.656132 skipgram : 0.560896 
[18600] LOSS geo : 1.482268 time : 1.591158 skipgram : 0.760235 
[18700] LOSS geo : 1.414461 time : 1.637630 skipgram : 0.596820 
[18800] LOSS geo : 1.299826 time : 1.648365 skipgram : 0.618955 
[18900] LOSS geo : 1.347960 time : 1.668889 skipgram : 0.627432 
[19000] LOSS geo : 1.326580 time : 1.643631 skip

Job Done, used time 2.877664804458618
Job Done, used time 1.726832628250122
eval translation
saved history to log_test/history.pk
Saved model to log_test/saved/model.ckpt
[24100] LOSS geo : 1.299848 time : 1.632852 skipgram : 0.546627 
[24200] LOSS geo : 1.514766 time : 1.642212 skipgram : 0.509594 
[24300] LOSS geo : 1.475634 time : 1.660798 skipgram : 0.724679 
[24400] LOSS geo : 1.423030 time : 1.634667 skipgram : 0.551465 
[24500] LOSS geo : 1.296031 time : 1.592216 skipgram : 0.584172 
[24600] LOSS geo : 1.352003 time : 1.653983 skipgram : 0.559453 
[24700] LOSS geo : 1.325446 time : 1.648124 skipgram : 0.549121 
[24800] LOSS geo : 1.394221 time : 1.648932 skipgram : 0.530649 
[24900] LOSS geo : 1.553768 time : 1.585607 skipgram : 0.674703 
[25000] LOSS geo : 1.435498 time : 1.629163 skipgram : 0.592521 
[25100] LOSS geo : 1.311400 time : 1.641122 skipgram : 0.601222 
[25200] LOSS geo : 1.311188 time : 1.657962 skipgram : 0.583575 
[25300] LOSS geo : 1.347445 time : 1.634730 skipg

In [116]:
# with graph.as_default():
#     sess = tf.Session(graph=graph, config=config)
#     sess.run(tf.global_variables_initializer())
#     center, context = next(dataloader.dg)
#     sk_loss, _, geo_loss, _ = sess.run([model.weighted_skipgram_loss, model.train_skipgram, model.geo_loss, model.train_geo],
#                           {model.center_loc:center.ids, 
#                            model.label_loc:context.ids.reshape(-1,1),
#                            model.weight_decay: compute_weight_decay(center.timestmp, context.timestmp, args.time_temp),
#                            model.coor_center:center.coors, 
#                            model.coor_label:context.coors})

#     loc, time_label = next(dataloader_time.dg)
#     t_loss, _ = sess.run([model.time_loss, model.train_t],
#                          {model.center_loc:loc, model.label_t:time_label})