# Training Embeddings for Playlist Tracks

This tutorial shows how to learn **item embeddings** from co-occurrence statistics.

We use the following model architecture, along with the conditional cost function used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm.

The learnt embeddings are then extracted from the model and saved as a TSV file.

<img src="cooc2emb.png" width="600" height="400"/>

The following are the steps of this tutorial:


1. Define input data metadata
2. Implement data input function
3. Create feature columns
4. Create a custom estimator
5. Define the train and evaluate experiment
6. Set the experiment parameters
7. Run the experiment
8. Extract the learnt embeddings from the model
9. Export the saved model to serve as an embedding lookup


<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/track2ann/02-Training_Embeddings_for_Playlist_Tracks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
# !pip install -r ../requirements.txt

In [4]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow import data
from datetime import datetime

In [2]:
# Root folder for everything this notebook reads and writes.
WORKSPACE = './workspace'
# Co-occurrence records, vocabulary and info.log live here.
COOC_DIR = WORKSPACE + '/cooc'
# Model checkpoints and exports are written here.
MODELS_DIR = WORKSPACE + '/models'
# Random seed handed to the estimator's RunConfig.
SEED = 19831060

In [3]:
!echo "Files:"
!ls {COOC_DIR}/
!echo ""

!echo "info:"
!head {COOC_DIR}/info.log
!echo ""

!echo "vocab file:"
!head {COOC_DIR}/vocab.txt

Files:
cooc-00000-of-00001.tfrecords info2.log
info.log                      vocab.txt

info:
P: 2249477
N: 486352
max: 13.33242
min: 2.2812

vocab file:
3140490
1174666
4124042
1176202
4190858
1110922
914570
980106
63370
849802


## 1.  Metadata

In [5]:
# Name and dtype of each field parsed from a serialized record; every field
# is declared as a scalar (shape=()).
_FEATURE_DTYPES = (
    ('item1', tf.string),
    ('item2', tf.string),
    ('score', tf.float32),
    ('weight', tf.float32),
    ('type', tf.string),
)

FEATURES_SCHEMA = {
    name: tf.FixedLenFeature(dtype=dtype, shape=())
    for name, dtype in _FEATURE_DTYPES
}

# 'score' is split off as the label by the input function; 'weight' is read
# by the model function to weight the per-example loss.
WEIGHT_FEATURE_NAME = 'weight'
TARGET_FEATURE_NAME = 'score'

## 2.  Data Input Function

In [6]:
def make_input_fn(file_pattern, 
                  batch_size=128, num_epochs=1, mode=tf.estimator.ModeKeys.EVAL):
    """Return a zero-argument input function for an Estimator.

    Args:
        file_pattern: pattern of the TFRecord files to read.
        batch_size: number of records per batch; the final partial batch
            is dropped.
        num_epochs: number of passes over the data, forwarded unchanged to
            the dataset builder.
        mode: a ``tf.estimator.ModeKeys`` value; records are shuffled only
            when it is ``TRAIN``.

    Returns:
        A callable that builds the batched dataset, with the
        ``TARGET_FEATURE_NAME`` field split off as the label.
    """
    shuffle_records = (mode == tf.estimator.ModeKeys.TRAIN)

    def _input_fn():
        return tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=batch_size,
            features=FEATURES_SCHEMA,
            reader=tf.data.TFRecordDataset,
            label_key=TARGET_FEATURE_NAME,
            num_epochs=num_epochs,
            shuffle=shuffle_records,
            shuffle_buffer_size=batch_size * 10,
            reader_num_threads=1,
            parser_num_threads=2,
            sloppy_ordering=True,
            drop_final_batch=True,
        )

    return _input_fn

In [7]:
# tf.enable_eager_execution()

# DATA_FILES = "{}/cooc-*.tfrecords".format(COOC_DIR)

# dataset = make_input_fn(DATA_FILES, batch_size=5)()
# for features, target in dataset.take(1):
#     print()
#     print("Input features:")
#     for key in features:
#         print("-{}:{}".format(key, features[key]))
#     print("Targets:")
#     print(target)

## 3. Feature Columns

In [8]:
def create_feature_columns(embedding_size, vocab1_file, vocab2_file):
    """Create the embedding columns for the two item features.

    Args:
        embedding_size: dimension of each embedding vector.
        vocab1_file: vocabulary file for the 'item1' feature.
        vocab2_file: vocabulary file for the 'item2' feature.

    Returns:
        A two-element list: the 'item1' embedding column followed by the
        'item2' embedding column.
    """

    def _embedding_for(key, vocabulary_file):
        # Map the raw string ids onto vocabulary indices, then embed them.
        categorical = tf.feature_column.categorical_column_with_vocabulary_file(
            key=key,
            vocabulary_file=vocabulary_file
        )
        return tf.feature_column.embedding_column(categorical, embedding_size)

    return [
        _embedding_for('item1', vocab1_file),
        _embedding_for('item2', vocab2_file),
    ]

## 4.  Custom Estimator

In [9]:
def compute_loss(labels, predictions, weights, types):
    """Compute the summed, weighted conditional loss over a batch.

    The per-example error is ``predictions - labels``. Examples whose
    ``types`` entry equals ``'P'`` are charged a weighted half squared
    error; every other example is charged a weighted softplus of the error.

    Args:
        labels: target scores.
        predictions: predicted scores; assumed to have the same shape as
            ``labels`` (the caller squeezes them) - TODO confirm.
        weights: per-example weights applied to both cost terms.
        types: string tensor holding the example type of each entry.

    Returns:
        A scalar tensor: the sum of the selected per-example costs.
    """
    errors = predictions - labels

    positive_cost = 0.5 * weights * tf.math.square(errors)
    # softplus(x) is log(1 + exp(x)) evaluated without overflowing for
    # large x, which would otherwise produce inf/NaN in the unselected
    # branch of tf.where below.
    negative_cost = weights * tf.nn.softplus(errors)

    loss = tf.where(tf.equal(types, 'P'), positive_cost, negative_cost)

    return tf.reduce_sum(loss)

def model_fn(features, labels, mode, params):
    """Model function for the co-occurrence embedding Estimator.

    In PREDICT mode only the 'item1' embedding is looked up and returned.
    In every other mode the 'item1' and 'item2' embeddings are combined by
    a dot product, rescaled to lie between ``params.min_value`` and
    ``params.max_value``, and scored with ``compute_loss``.

    Args:
        features: dict of feature tensors. 'item1' is always read; 'item2',
            'type' and WEIGHT_FEATURE_NAME are read when mode is not PREDICT.
        labels: target scores passed to ``compute_loss``; not used in
            PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: object exposing ``embedding_size``, ``vocab1_file``,
            ``vocab2_file``, ``min_value``, ``max_value`` and
            ``learning_rate``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.
    """

    items1 = features['item1']
    feature_columns = create_feature_columns(
        params.embedding_size, params.vocab1_file, params.vocab2_file)

    # NOTE(review): the embedding-extraction step later in this file reads
    # the item2 weights by the name 'input_layer_1/item2_embedding/...',
    # which presumably depends on the item1 layer being built first.
    # Keep the order of the two input_layer calls.
    # TODO: add bias to the input_layer
    item1_layer = tf.feature_column.input_layer(
        features={'item1': items1}, feature_columns=[feature_columns[0]])

    # The second tower and the score are only needed for training/evaluation.
    if mode != tf.estimator.ModeKeys.PREDICT:
        items2 = features['item2']
        item2_layer = tf.feature_column.input_layer(
            features={'item2': items2}, feature_columns=[feature_columns[1]])

        dot_product = tf.keras.layers.Dot(axes=1)([item1_layer, item2_layer])
        # Despite the name, this is the sigmoid of the dot product rescaled
        # to the range spanned by params.min_value and params.max_value.
        logits = (params.max_value - params.min_value) * tf.sigmoid(dot_product) + params.min_value 

    # Fields of the EstimatorSpec that a given mode does not set stay None.
    predictions = None
    export_outputs = None
    loss = None
    train_op = None

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Serving acts as an embedding lookup: the output is the item1 vector.
        predictions =  item1_layer
        export_outputs = {'predictions': tf.estimator.export.PredictOutput(predictions)}
    else:
        types = features['type']
        weights = features[WEIGHT_FEATURE_NAME]

        # Disabled alternative: a plain weighted mean squared error.
#         loss = tf.losses.mean_squared_error(
#             labels=labels, 
#             predictions=tf.squeeze(logits),
#             weights=weights
#         )

        # tf.squeeze removes size-1 dimensions; presumably this turns the
        # (batch, 1) dot-product output into (batch,) to match the labels
        # - TODO confirm.
        loss = compute_loss(
            labels=labels, 
            predictions=tf.squeeze(logits), 
            weights=weights, 
            types=types
        )

        # This branch covers every non-PREDICT mode, so the train op is
        # also built when evaluating.
        train_op=tf.train.AdamOptimizer(params.learning_rate).minimize(
            loss=loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs=export_outputs,
        loss=loss,
        train_op=train_op
    )


def create_estimator(params, run_config):
    """Wrap ``model_fn`` in a ``tf.estimator.Estimator``.

    Args:
        params: hyper-parameter object handed to ``model_fn``.
        run_config: ``tf.estimator.RunConfig`` for the estimator.

    Returns:
        The configured estimator.
    """
    return tf.estimator.Estimator(
        model_fn=model_fn,
        params=params,
        config=run_config,
    )

## 5. Experiment

In [10]:
def run_experiment(params, run_config):
    """Train and evaluate the model, then return the estimator.

    Any existing contents of ``run_config.model_dir`` are deleted before
    the estimator is created, so each call starts from scratch. Start
    time, end time and elapsed seconds are printed.

    Args:
        params: object exposing ``train_data_files``, ``eval_data_files``,
            ``batch_size``, ``traning_steps``, ``eval_steps`` and
            ``eval_throttle_secs`` (plus whatever ``model_fn`` reads).
        run_config: ``tf.estimator.RunConfig`` carrying the model directory.

    Returns:
        The estimator that was trained and evaluated.
    """
    # Training: shuffled input repeated indefinitely, bounded by max_steps.
    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(
            params.train_data_files,
            batch_size=params.batch_size,
            num_epochs=None,
            mode=tf.estimator.ModeKeys.TRAIN,
        ),
        max_steps=params.traning_steps,
    )

    # Evaluation: default (EVAL) mode input, bounded by eval_steps.
    eval_spec = tf.estimator.EvalSpec(
        name=datetime.utcnow().strftime("%H%M%S"),
        input_fn=make_input_fn(
            params.eval_data_files,
            num_epochs=None,
            batch_size=params.batch_size,
        ),
        steps=params.eval_steps,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs,
    )

    tf.logging.set_verbosity(tf.logging.INFO)

    output_dir = run_config.model_dir
    if tf.gfile.Exists(output_dir):
        print("Removing previous artefacts...")
        tf.gfile.DeleteRecursively(output_dir)

    print("")
    estimator = create_estimator(params, run_config)
    print("")

    separator = "......................................."

    started_at = datetime.utcnow()
    print("Experiment started at {}".format(started_at.strftime("%H:%M:%S")))
    print(separator)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec,
    )

    finished_at = datetime.utcnow()
    print(separator)
    print("Experiment finished at {}".format(finished_at.strftime("%H:%M:%S")))
    print("")
    elapsed_seconds = (finished_at - started_at).total_seconds()
    print("Experiment elapsed time: {} seconds".format(elapsed_seconds))

    return estimator

## 6. Parameters 

In [11]:
MODEL_NAME = 'cooc2emb-01'
model_dir = os.path.join(MODELS_DIR, MODEL_NAME)
info_file = os.path.join(COOC_DIR, 'info.log')

# Fallback score range, used when info.log is missing or cannot be parsed.
# The lower bound must be below the upper bound: the model rescales a
# sigmoid as (max_value - min_value) * sigmoid(x) + min_value.
min_value = -5
max_value = 15

# Numeric entries read from info.log ("key: value" per line).
info_map = {}

if os.path.exists(info_file):
    try:
        with open(info_file) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines instead of discarding the file.
                    continue
                key, value = line.split(":", 1)
                info_map[key.strip()] = float(value)
        # Read both bounds before assigning so a missing key cannot leave
        # the range half-updated.
        parsed_min = math.floor(info_map['min'])
        parsed_max = math.ceil(info_map['max'])
    except (OSError, ValueError, KeyError) as error:
        # Best effort: keep the fallback range, but say so.
        print("Could not read score range from {} ({!r}); using defaults.".format(
            info_file, error))
    else:
        min_value = parsed_min
        max_value = parsed_max
    
class HParams():
    """Plain attribute container for the experiment's hyper-parameters."""


params = HParams()

# Training and evaluation read the same co-occurrence records, and both
# embedding columns use the same vocabulary file.
cooc_files_pattern = "{}/cooc-*.tfrecords".format(COOC_DIR)
shared_vocab_file = os.path.join(COOC_DIR,'vocab.txt')

# Data
params.train_data_files = cooc_files_pattern
params.eval_data_files = cooc_files_pattern
params.vocab1_file = shared_vocab_file
params.vocab2_file = shared_vocab_file

# Model
params.embedding_size = 32
params.min_value = min_value
params.max_value = max_value

# Training and evaluation schedule
params.batch_size = 265
params.traning_steps = 50000
params.learning_rate = 0.001
params.eval_steps = 1
params.eval_throttle_secs = 0

print(vars(params))

# Checkpoint every 1000 steps and keep only the three most recent ones.
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    tf_random_seed=SEED,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=3,
)

{'train_data_files': './workspace/cooc/cooc-*.tfrecords', 'eval_data_files': './workspace/cooc/cooc-*.tfrecords', 'vocab1_file': './workspace/cooc/vocab.txt', 'vocab2_file': './workspace/cooc/vocab.txt', 'embedding_size': 32, 'min_value': 2, 'max_value': 14, 'batch_size': 265, 'traning_steps': 50000, 'learning_rate': 0.001, 'eval_steps': 1, 'eval_throttle_secs': 0}


## 7. Run

In [12]:
# Train and evaluate the model; the returned estimator is reused by the
# export step.
estimator = run_experiment(params, run_config)

Removing previous artefacts...

INFO:tensorflow:Using config: {'_model_dir': './workspace/models/cooc2emb-01', '_tf_random_seed': 19831060, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12d410f98>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Experiment started at 01:25:43
.......................................
INFO:tensorflo

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:25:55
INFO:tensorflow:Saving dict for global step 1000: global_step = 1000, loss = 254.78947
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: ./workspace/models/cooc2emb-01/model.ckpt-1000
INFO:tensorflow:global_step/sec: 40.3583
INFO:tensorflow:loss = 362.19202, step = 1001 (2.479 sec)
INFO:tensorflow:global_step/sec: 85.1185
INFO:tensorflow:loss = 397.5473, step = 1101 (1.174 sec)
INFO:tensorflow:global_step/sec: 103.58
INFO:tensorflow:loss = 380.37415, step = 1201 (0.965 sec)
INFO:tensorflow:global_step/sec: 106.884
INFO:tensorflow:loss = 434.37445, step = 1301 (0.936 sec)
INFO:tensorflow:global_step/sec: 84.3021
INFO:tensorflow:loss = 370.93167, step = 1401 (1.186 sec)
INFO:tensorflow:global_step/sec: 111.5

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:26:47Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:26:48
INFO:tensorflow:Saving dict for global step 5000: global_step = 5000, loss = 286.10925
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: ./workspace/models/cooc2emb-01/model.ckpt-5000
INFO:tensorflow:global_step/sec: 25.5702
INFO:tensorflow:loss = 438.59958, step = 5001 (3.910 sec)
INFO:tensorflow:global_step/sec: 64.2724
INFO:tensorflow:loss = 302.27322, step = 5101 (1.556 sec)
INFO:tensorflow:global_step/sec: 83.1319
INFO:tensorflow:loss = 411.52512, step = 5201 (1.202 sec)
INFO:tensorflow:global_step/sec: 84.2538
INFO:tensorflow:loss = 346.1227, step = 5301 (1.188 sec)


INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:27:36
INFO:tensorflow:Saving dict for global step 9000: global_step = 9000, loss = 336.13763
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 9000: ./workspace/models/cooc2emb-01/model.ckpt-9000
INFO:tensorflow:global_step/sec: 45.5469
INFO:tensorflow:loss = 384.42258, step = 9001 (2.197 sec)
INFO:tensorflow:global_step/sec: 111.203
INFO:tensorflow:loss = 339.41785, step = 9101 (0.897 sec)
INFO:tensorflow:global_step/sec: 123.456
INFO:tensorflow:loss = 373.16754, step = 9201 (0.810 sec)
INFO:tensorflow:global_step/sec: 123.944
INFO:tensorflow:loss = 363.72968, step = 9301 (0.807 sec)
INFO:tensorflow:global_step/sec: 127.906
INFO:tensorflow:loss = 276.84094, step = 9401 (0.782 sec)
INFO:tensorflow:global_step/sec: 127.703
INFO:tensorflow:loss = 310.58994, step = 9501 (0.783 sec)
INFO:tensorflow:global_ste

INFO:tensorflow:Saving dict for global step 13000: global_step = 13000, loss = 152.6368
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 13000: ./workspace/models/cooc2emb-01/model.ckpt-13000
INFO:tensorflow:global_step/sec: 48.2836
INFO:tensorflow:loss = 114.93041, step = 13001 (2.071 sec)
INFO:tensorflow:global_step/sec: 111.567
INFO:tensorflow:loss = 222.70157, step = 13101 (0.896 sec)
INFO:tensorflow:global_step/sec: 114.778
INFO:tensorflow:loss = 185.13202, step = 13201 (0.871 sec)
INFO:tensorflow:global_step/sec: 115.991
INFO:tensorflow:loss = 247.92998, step = 13301 (0.862 sec)
INFO:tensorflow:global_step/sec: 118.955
INFO:tensorflow:loss = 275.43304, step = 13401 (0.841 sec)
INFO:tensorflow:global_step/sec: 115.986
INFO:tensorflow:loss = 216.3549, step = 13501 (0.862 sec)
INFO:tensorflow:global_step/sec: 113.722
INFO:tensorflow:loss = 134.9022, step = 13601 (0.880 sec)
INFO:tensorflow:global_step/sec: 102.968
INFO:tensorflow:loss = 217.2843, step = 13701 (0.971 

INFO:tensorflow:global_step/sec: 47.7158
INFO:tensorflow:loss = 169.77742, step = 17001 (2.096 sec)
INFO:tensorflow:global_step/sec: 109.913
INFO:tensorflow:loss = 218.81027, step = 17101 (0.910 sec)
INFO:tensorflow:global_step/sec: 113.996
INFO:tensorflow:loss = 195.1891, step = 17201 (0.877 sec)
INFO:tensorflow:global_step/sec: 93.0698
INFO:tensorflow:loss = 266.95953, step = 17301 (1.075 sec)
INFO:tensorflow:global_step/sec: 100.48
INFO:tensorflow:loss = 259.98633, step = 17401 (0.996 sec)
INFO:tensorflow:global_step/sec: 110.822
INFO:tensorflow:loss = 149.43785, step = 17501 (0.902 sec)
INFO:tensorflow:global_step/sec: 110.994
INFO:tensorflow:loss = 203.02101, step = 17601 (0.901 sec)
INFO:tensorflow:global_step/sec: 110.127
INFO:tensorflow:loss = 232.47626, step = 17701 (0.908 sec)
INFO:tensorflow:global_step/sec: 115.619
INFO:tensorflow:loss = 187.56784, step = 17801 (0.865 sec)
INFO:tensorflow:global_step/sec: 113.328
INFO:tensorflow:loss = 255.08318, step = 17901 (0.882 sec)
IN

INFO:tensorflow:loss = 186.40327, step = 21101 (1.025 sec)
INFO:tensorflow:global_step/sec: 109.606
INFO:tensorflow:loss = 145.76837, step = 21201 (0.914 sec)
INFO:tensorflow:global_step/sec: 112.045
INFO:tensorflow:loss = 177.62183, step = 21301 (0.891 sec)
INFO:tensorflow:global_step/sec: 115.174
INFO:tensorflow:loss = 179.38206, step = 21401 (0.868 sec)
INFO:tensorflow:global_step/sec: 113.217
INFO:tensorflow:loss = 141.83926, step = 21501 (0.884 sec)
INFO:tensorflow:global_step/sec: 112.217
INFO:tensorflow:loss = 174.77736, step = 21601 (0.890 sec)
INFO:tensorflow:global_step/sec: 114.348
INFO:tensorflow:loss = 196.25336, step = 21701 (0.883 sec)
INFO:tensorflow:global_step/sec: 112.755
INFO:tensorflow:loss = 90.94211, step = 21801 (0.879 sec)
INFO:tensorflow:global_step/sec: 114.001
INFO:tensorflow:loss = 175.38579, step = 21901 (0.877 sec)
INFO:tensorflow:Saving checkpoints for 22000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow

INFO:tensorflow:global_step/sec: 114.547
INFO:tensorflow:loss = 113.71767, step = 25301 (0.873 sec)
INFO:tensorflow:global_step/sec: 113.94
INFO:tensorflow:loss = 108.48268, step = 25401 (0.877 sec)
INFO:tensorflow:global_step/sec: 115.227
INFO:tensorflow:loss = 98.073326, step = 25501 (0.868 sec)
INFO:tensorflow:global_step/sec: 111.93
INFO:tensorflow:loss = 180.5286, step = 25601 (0.893 sec)
INFO:tensorflow:global_step/sec: 111.713
INFO:tensorflow:loss = 157.95364, step = 25701 (0.895 sec)
INFO:tensorflow:global_step/sec: 117.475
INFO:tensorflow:loss = 105.891716, step = 25801 (0.851 sec)
INFO:tensorflow:global_step/sec: 113.463
INFO:tensorflow:loss = 161.44926, step = 25901 (0.882 sec)
INFO:tensorflow:Saving checkpoints for 26000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 3

INFO:tensorflow:global_step/sec: 71.463
INFO:tensorflow:loss = 115.87976, step = 29501 (1.398 sec)
INFO:tensorflow:global_step/sec: 96.1745
INFO:tensorflow:loss = 128.74767, step = 29601 (1.040 sec)
INFO:tensorflow:global_step/sec: 105.913
INFO:tensorflow:loss = 121.35515, step = 29701 (0.945 sec)
INFO:tensorflow:global_step/sec: 91.6232
INFO:tensorflow:loss = 113.05442, step = 29801 (1.090 sec)
INFO:tensorflow:global_step/sec: 108.264
INFO:tensorflow:loss = 94.86696, step = 29901 (0.924 sec)
INFO:tensorflow:Saving checkpoints for 30000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:31:

INFO:tensorflow:global_step/sec: 88.6028
INFO:tensorflow:loss = 67.18909, step = 33701 (1.129 sec)
INFO:tensorflow:global_step/sec: 92.8906
INFO:tensorflow:loss = 54.375847, step = 33801 (1.077 sec)
INFO:tensorflow:global_step/sec: 102.059
INFO:tensorflow:loss = 55.504505, step = 33901 (0.980 sec)
INFO:tensorflow:Saving checkpoints for 34000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:32:04Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-34000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running 

INFO:tensorflow:global_step/sec: 123.091
INFO:tensorflow:loss = 67.35437, step = 37901 (0.814 sec)
INFO:tensorflow:Saving checkpoints for 38000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:32:44Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-38000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:32:44
INFO:tensorflow:Saving dict for global step 38000: global_step = 38000, loss = 47.056843
INFO

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:33:25Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-42000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:33:26
INFO:tensorflow:Saving dict for global step 42000: global_step = 42000, loss = 30.711111
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 42000: ./workspace/models/cooc2emb-01/model.ckpt-42000
INFO:tensorflow:global_step/sec: 44.0776
INFO:tensorflow:loss = 48.936073, s

INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T02:34:05Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-46000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:34:05
INFO:tensorflow:Saving dict for global step 46000: global_step = 46000, loss = 33.368282
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 46000: ./workspace/models/cooc2emb-01/model.ckpt-46000
INFO:tensorflow:global_step/sec: 52.3644
INFO:tensorflow:loss = 32.427776, step = 46001 (1.910 sec)
INFO:tensorflow:global_step/sec: 91.0781
INFO:tensorflow:loss = 33.64977, step = 46101 (1.098 sec)
INFO:tensorflow:global_step/sec: 90.0133
INFO:tenso

INFO:tensorflow:Starting evaluation at 2019-09-27T02:34:48Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-27-02:34:48
INFO:tensorflow:Saving dict for global step 50000: global_step = 50000, loss = 32.80314
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 50000: ./workspace/models/cooc2emb-01/model.ckpt-50000
INFO:tensorflow:Loss for final step: 43.453007.
.......................................
Experiment finished at 01:34:48

Experiment elapsed time: 545.102906 seconds


<img src="loss.png" width="600" height="600"/>


## 8. Extract embeddings

In [13]:
def extract_embeddings():
    """Load the trained embedding weights from the final checkpoint.

    Restores ``model.ckpt-<params.traning_steps>`` from ``model_dir`` and
    evaluates the tensor
    ``input_layer_1/item2_embedding/embedding_weights:0``.

    The meta graph is imported into a private ``tf.Graph`` rather than the
    process-wide default graph. Importing into the default graph a second
    time would add renamed copies of every node, and the name lookup would
    then resolve to the first, unrestored copy.

    NOTE(review): this reads the 'item2' embedding table, while the
    exported serving model returns the 'item1' embedding - confirm this is
    intended.

    Returns:
        A numpy array holding the embedding weights.
    """
    checkpoint_prefix = os.path.join(
        model_dir, 'model.ckpt-{}'.format(params.traning_steps))

    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        saver = tf.train.import_meta_graph(checkpoint_prefix + '.meta')
        saver.restore(sess, checkpoint_prefix)
        weights_tensor = graph.get_tensor_by_name(
            'input_layer_1/item2_embedding/embedding_weights:0')
        weights = np.array(sess.run(weights_tensor))

    return weights

In [14]:
# Pull the trained weights out of the checkpoint, then sanity-check them by
# printing the number of rows and the first embedding vector.
embeddings = extract_embeddings()
print(len(embeddings))
print(embeddings[0])

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-50000
39195
[-0.13656929 -0.08152047  0.22075112 -0.08936733 -0.25698322 -0.00394824
 -0.01086133  0.00339999 -0.05643786  0.31033006 -0.05168191 -0.0300671
  0.01123126  0.22518325  0.08868477 -0.00723081 -0.23095499  0.1212828
  0.00407882 -0.06277223  0.04385442  0.17758256 -0.04478771  0.1413052
 -0.23489681  0.04716527  0.13687761  0.05283307 -0.21793135 -0.04481634
 -0.0370245   0.26175183]


In [15]:
vocab_path = os.path.join(COOC_DIR,'vocab.txt')
output_path = os.path.join(WORKSPACE,'embeddings.tsv')


def write_embeddings_to_tsv():
    """Write one TSV row per vocabulary entry: the item, then its vector.

    Line ``i`` of the vocabulary file is paired with ``embeddings[i]``.
    Reads the module-level ``embeddings``, ``vocab_path`` and
    ``output_path``; the output file is overwritten.
    """
    with open(output_path, 'w') as out_f, open(vocab_path) as vocab_f:
        for row, line in enumerate(vocab_f):
            vector = embeddings[row]
            fields = [line.strip()]
            fields.extend(str(value) for value in vector)
            out_f.write('\t'.join(fields) + '\n')


write_embeddings_to_tsv()

In [16]:
!head {output_path}

3140490	-0.13656929	-0.08152047	0.22075112	-0.08936733	-0.25698322	-0.0039482424	-0.0108613325	0.003399993	-0.056437857	0.31033006	-0.051681906	-0.030067096	0.011231261	0.22518325	0.08868477	-0.00723081	-0.23095499	0.1212828	0.004078818	-0.06277223	0.043854423	0.17758256	-0.04478771	0.1413052	-0.23489681	0.047165267	0.13687761	0.052833073	-0.21793135	-0.044816345	-0.0370245	0.26175183
1174666	0.013843778	-0.27829817	-0.20373544	0.2345063	0.13281359	0.35718116	0.2595925	-0.012942535	-0.43445885	-0.19701801	0.34873945	-0.13490969	-0.40343148	0.073161945	0.40497667	-0.18959127	-0.3336594	0.13334094	0.0339188	-0.026429988	0.041407272	-0.0064272597	0.025063872	0.13757876	0.24225596	0.16204597	-0.06984314	-0.23530872	-0.0049144193	-0.15068308	-0.173307	0.18084238
4124042	0.20764865	0.08293786	-0.28368247	-0.20892073	0.057472624	0.15269594	0.6306407	-0.33692354	0.07106056	0.27661732	0.20864043	0.3706322	0.25792092	-0.14687982	0.20227244	-0.21646985	-0.2637002	0.33540612	-0.26416937	0.090075

## 9. Export saved model as item-embedding lookup

In [17]:
def make_serving_input_receiver_fn():
    """Build the serving input receiver for the embedding lookup.

    Returns:
        A serving input receiver function that accepts raw (unserialized)
        tensors with a single string feature named 'item1'.
    """
    return tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'item1': tf.placeholder(shape=[None], dtype=tf.string)}
    )

export_dir = os.path.join(model_dir, 'export')

# Start from an empty export folder so only this export is present.
if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)

# export_savedmodel is deprecated and was renamed to export_saved_model.
estimator.export_saved_model(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)

Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predictions', 'serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:ten

b'./workspace/models/cooc2emb-01/export/1569548683'

In [18]:
export_dir = os.path.join(model_dir, "export")

# Exports are written to sub-directories with all-digit names. os.listdir
# returns entries in arbitrary order, so pick the numerically largest name
# rather than whichever happens to come first.
export_versions = [f for f in os.listdir(export_dir) if f.isdigit()]
saved_model_dir = os.path.join(export_dir, max(export_versions, key=int))

print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
)

# Look up the embedding of a single item id through the serving signature.
output = predictor_fn({'item1': ['56430']})
print(output)

./workspace/models/cooc2emb-01/export/1569548683
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/export/1569548683/variables/variables
{'output': array([[ 0.3423961 , -0.19566412,  0.18098071,  0.05013943, -0.42272353,
         0.52389747,  0.41435465,  0.15142418, -0.26902387,  0.09728196,
        -0.21906653, -0.21496587, -0.41003945,  0.07338266, -0.0357971