# Training Embeddings for Playlist Tracks

This tutorial shows how to learn **item embeddings** from co-occurrence statistics.

We use the following model architecture, a long with the conditional cost function used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm. 

The learnt embeddings are then extracted from the model and saved as TSV file.

<img src="cooc2emb.png" width="600" height="400"/>

The following are the steps of this tutorial:


1. Define input data metadata
2. Implement data input function
3. Create feature columns
4. Create a custome estimator
5. Define the train and evaluate experiment
6. Set the experiment parameters
7. Run the experiment
8. Extract the learnt embeddings from the model
9. Export the saved model to serve as an embedding lookup


<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/02-Training_Embeddings_for_Playlist_Tracks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
# !pip install -r requirements.txt

In [3]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow import data
from datetime import datetime

In [4]:
WORKSPACE = './workspace'
COOC_DIR = '{}/cooc'.format(WORKSPACE)
MODELS_DIR = '{}/models'.format(WORKSPACE)
SEED = 19831060

In [5]:
!echo "Files:"
!ls {COOC_DIR}/
!echo ""

!echo "info:"
!head {COOC_DIR}/info.log
!echo ""

!echo "vocab file:"
!head {COOC_DIR}/vocab.txt

Files:
cooc-00000-of-00001.tfrecords info2.log
info.log                      vocab.txt

info:
P: 2249477
N: 486352
max: 13.33242
min: 2.2812

vocab file:
3140490
1174666
4124042
1176202
4190858
1110922
914570
980106
63370
849802


## 1.  Metadata

In [6]:
FEATURES_SCHEMA = {
    'item1': tf.FixedLenFeature(dtype=tf.string, shape=()),
    'item2': tf.FixedLenFeature(dtype=tf.string, shape=()),
    'score': tf.FixedLenFeature(dtype=tf.float32, shape=()),
    'weight': tf.FixedLenFeature(dtype=tf.float32, shape=()),
    'type': tf.FixedLenFeature(dtype=tf.string, shape=())
}

WEIGHT_FEATURE_NAME = 'weight'
TARGET_FEATURE_NAME = 'score'

## 2.  Data Input Function

In [7]:
def make_input_fn(file_pattern, 
                  batch_size=128, num_epochs=1, mode=tf.estimator.ModeKeys.EVAL):

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern,
            batch_size,
            features=FEATURES_SCHEMA,
            label_key=TARGET_FEATURE_NAME,
            reader=tf.data.TFRecordDataset,
            shuffle_buffer_size=batch_size * 10,
            reader_num_threads=1,
            parser_num_threads=2,
            num_epochs=num_epochs,
            shuffle=(mode==tf.estimator.ModeKeys.TRAIN),
            sloppy_ordering=True,
            drop_final_batch=True
        )
        return dataset
    
    return _input_fn

In [None]:
# tf.enable_eager_execution()

# DATA_FILES = "{}/cooc-*.tfrecords".format(COOC_DIR)

# dataset = make_input_fn(DATA_FILES, batch_size=5)()
# for features, target in dataset.take(1):
#     print()
#     print("Input features:")
#     for key in features:
#         print("-{}:{}".format(key, features[key]))
#     print("Targets:")
#     print(target)

## 3. Feature Columns

In [8]:
def create_feature_columns(embedding_size, vocab1_file, vocab2_file):
    
    feature_columns = []
    
    feature_columns.append(
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_file(
                key='item1', 
                vocabulary_file=vocab1_file
            ), 
            embedding_size
        )
    )
    
    feature_columns.append(
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_file(
                key='item2', 
                vocabulary_file=vocab2_file
            ), 
            embedding_size
        )
    )
        
    return feature_columns

## 4.  Custom Estimator

In [16]:
def compute_loss(labels, predictions, weights, types):
    
    def _positive_sample_cost(errors, weights):
        return 0.5 * weights * tf.math.square(errors)
    
    def _negative_sample_cost(errors, weights):
        return weights * tf.math.log(1 + tf.exp(errors))
    
    errors = predictions - labels
    
    p_loss = _positive_sample_cost(errors, weights)
    n_loss = _negative_sample_cost(errors, weights)
    loss = tf.where(types == 'P', p_loss, n_loss)
    
    return tf.reduce_sum(loss)

def model_fn(features, labels, mode, params):
    
    items1 = features['item1']
    feature_columns = create_feature_columns(
        params.embedding_size, params.vocab1_file, params.vocab2_file)
    
    # TODO: add bias to the input_layer
    item1_layer = tf.feature_column.input_layer(
        features={'item1': items1}, feature_columns=[feature_columns[0]])
    
    if mode != tf.estimator.ModeKeys.PREDICT:
        items2 = features['item2']
        item2_layer = tf.feature_column.input_layer(
            features={'item2': items2}, feature_columns=[feature_columns[1]])
        
        dot_product = tf.keras.layers.Dot(axes=1)([item1_layer, item2_layer])
        logits = (params.max_value - params.min_value) * tf.sigmoid(dot_product) + params.min_value 

    predictions = None
    export_outputs = None
    loss = None
    train_op = None

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions =  item1_layer
        export_outputs = {'predictions': tf.estimator.export.PredictOutput(predictions)}
    else:
        types = features['type']
        weights = features[WEIGHT_FEATURE_NAME]
        
#         loss = tf.losses.mean_squared_error(
#             labels=labels, 
#             predictions=tf.squeeze(logits),
#             weights=weights
#         )

        loss = compute_loss(
            labels=labels, 
            predictions=tf.squeeze(logits), 
            weights=weights, 
            types=types
        )
        
        train_op=tf.train.AdamOptimizer(params.learning_rate).minimize(
            loss=loss, global_step=tf.train.get_global_step())
    
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs=export_outputs,
        loss=loss,
        train_op=train_op
    )


def create_estimator(params, run_config):
    
    estimator = tf.estimator.Estimator(
        model_fn,
        params=params,
        config=run_config
    )
    
    return estimator

## 5. Experiment

In [17]:
def run_experiment(params, run_config):
    
    train_data_files = params.train_data_files
    eval_data_files = params.eval_data_files
    
    # TrainSpec ####################################
    train_input_fn = make_input_fn(
        train_data_files,
        batch_size=params.batch_size,
        num_epochs=None,
        mode=tf.estimator.ModeKeys.TRAIN
    )
    
    train_spec = tf.estimator.TrainSpec(
        input_fn = train_input_fn,
        max_steps=params.traning_steps
    )
    ###############################################    
    
    # EvalSpec ####################################
    eval_input_fn = make_input_fn(
        eval_data_files,
        num_epochs=None, #1
        batch_size=params.batch_size,
    )

    eval_spec = tf.estimator.EvalSpec(
        name=datetime.utcnow().strftime("%H%M%S"),
        input_fn = eval_input_fn,
        steps=params.eval_steps,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs
    )
    ###############################################

    tf.logging.set_verbosity(tf.logging.INFO)
    
    if tf.gfile.Exists(run_config.model_dir):
        print("Removing previous artefacts...")
        tf.gfile.DeleteRecursively(run_config.model_dir)
            
    print("")
    estimator = create_estimator(params, run_config)
    print("")
    
    time_start = datetime.utcnow() 
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................") 

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec, 
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow() 
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    
    return estimator

## 6. Parameters 

In [18]:
MODEL_NAME = 'cooc2emb-01'
model_dir = os.path.join(MODELS_DIR, MODEL_NAME)
info_file = os.path.join(COOC_DIR, 'info.log')
min_value = 15
max_value = -5

info_map = {}

if os.path.exists(info_file):
    try:
        with open(info_file) as f:
            for line in f.readlines():
                key, value = line.split(":")
                info_map[key] = float(value)
        min_value = math.floor(info_map['min'])
        max_value = math.ceil(info_map['max'])
    except: pass
    
class HParams():
    pass

params  = HParams()
params.train_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.eval_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.vocab1_file = os.path.join(COOC_DIR,'vocab.txt')
params.vocab2_file = os.path.join(COOC_DIR,'vocab.txt')
params.embedding_size = 32
params.min_value = min_value
params.max_value = max_value
params.batch_size = 265
params.traning_steps = 50000
params.learning_rate = 0.0005
params.eval_steps = 1
params.eval_throttle_secs = 0

print(vars(params))

run_config = tf.estimator.RunConfig(
    tf_random_seed=SEED,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=3,
    model_dir=model_dir,
)

{'train_data_files': './workspace/cooc/cooc-*.tfrecords', 'eval_data_files': './workspace/cooc/cooc-*.tfrecords', 'vocab1_file': './workspace/cooc/vocab.txt', 'vocab2_file': './workspace/cooc/vocab.txt', 'embedding_size': 32, 'min_value': 2, 'max_value': 14, 'batch_size': 265, 'traning_steps': 50000, 'learning_rate': 0.0005, 'eval_steps': 1, 'eval_throttle_secs': 0}


## 7. Run

In [19]:
estimator = run_experiment(params, run_config)


INFO:tensorflow:Using config: {'_model_dir': './workspace/models/cooc2emb-01', '_tf_random_seed': 19831060, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dd7d278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Experiment started at 19:02:38
.......................................
INFO:tensorflow:Not using Distribute Coordina

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:03:10
INFO:tensorflow:Saving dict for global step 3000: global_step = 3000, loss = 207.46689
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3000: ./workspace/models/cooc2emb-01/model.ckpt-3000
INFO:tensorflow:global_step/sec: 48.3319
INFO:tensorflow:loss = 231.64137, step = 3001 (2.069 sec)
INFO:tensorflow:global_step/sec: 113.068
INFO:tensorflow:loss = 239.54126, step = 3101 (0.884 sec)
INFO:tensorflow:global_step/sec: 121.623
INFO:tensorflow:loss = 229.84946, step = 3201 (0.822 sec)
INFO:tensorflow:global_step/sec: 118.773
INFO:tensorflow:loss = 194.0849, step = 3301 (0.842 sec)
INFO:tensorflow:global_step/sec: 122.637
INFO:tensorflow:loss = 213.67398, step = 3401 (0.815 sec)
INFO:tensorflow:global_step/sec: 122.263
INFO:tensorflow:loss = 224.94846, step = 3501 (0.818 sec)
INFO:tensorflow:global_step

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 7000: ./workspace/models/cooc2emb-01/model.ckpt-7000
INFO:tensorflow:global_step/sec: 51.2363
INFO:tensorflow:loss = 126.58723, step = 7001 (1.953 sec)
INFO:tensorflow:global_step/sec: 109.428
INFO:tensorflow:loss = 203.24863, step = 7101 (0.913 sec)
INFO:tensorflow:global_step/sec: 121.095
INFO:tensorflow:loss = 224.10477, step = 7201 (0.826 sec)
INFO:tensorflow:global_step/sec: 131.111
INFO:tensorflow:loss = 146.8731, step = 7301 (0.763 sec)
INFO:tensorflow:global_step/sec: 129.951
INFO:tensorflow:loss = 175.83353, step = 7401 (0.770 sec)
INFO:tensorflow:global_step/sec: 131.237
INFO:tensorflow:loss = 162.24089, step = 7501 (0.762 sec)
INFO:tensorflow:global_step/sec: 129
INFO:tensorflow:loss = 171.27118, step = 7601 (0.775 sec)
INFO:tensorflow:global_step/sec: 117.551
INFO:tensorflow:loss = 263.03635, step = 7701 (0.851 sec)
INFO:tensorflow:global_step/sec: 123.199
INFO:tensorflow:loss = 144.85307, step = 7801 (0.812 s

INFO:tensorflow:global_step/sec: 108.515
INFO:tensorflow:loss = 261.6709, step = 11101 (0.920 sec)
INFO:tensorflow:global_step/sec: 114.574
INFO:tensorflow:loss = 240.81813, step = 11201 (0.873 sec)
INFO:tensorflow:global_step/sec: 117.264
INFO:tensorflow:loss = 201.26619, step = 11301 (0.853 sec)
INFO:tensorflow:global_step/sec: 117.613
INFO:tensorflow:loss = 142.66801, step = 11401 (0.850 sec)
INFO:tensorflow:global_step/sec: 116.552
INFO:tensorflow:loss = 257.04593, step = 11501 (0.858 sec)
INFO:tensorflow:global_step/sec: 132.665
INFO:tensorflow:loss = 129.23802, step = 11601 (0.753 sec)
INFO:tensorflow:global_step/sec: 123.619
INFO:tensorflow:loss = 212.71115, step = 11701 (0.809 sec)
INFO:tensorflow:global_step/sec: 116.654
INFO:tensorflow:loss = 147.43509, step = 11801 (0.857 sec)
INFO:tensorflow:global_step/sec: 116.755
INFO:tensorflow:loss = 147.03365, step = 11901 (0.856 sec)
INFO:tensorflow:Saving checkpoints for 12000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:ten

INFO:tensorflow:loss = 102.39223, step = 15201 (0.755 sec)
INFO:tensorflow:global_step/sec: 130.032
INFO:tensorflow:loss = 89.27941, step = 15301 (0.769 sec)
INFO:tensorflow:global_step/sec: 133.029
INFO:tensorflow:loss = 132.35013, step = 15401 (0.752 sec)
INFO:tensorflow:global_step/sec: 130.943
INFO:tensorflow:loss = 93.633965, step = 15501 (0.764 sec)
INFO:tensorflow:global_step/sec: 122.702
INFO:tensorflow:loss = 104.82235, step = 15601 (0.815 sec)
INFO:tensorflow:global_step/sec: 117.221
INFO:tensorflow:loss = 245.27763, step = 15701 (0.853 sec)
INFO:tensorflow:global_step/sec: 117.689
INFO:tensorflow:loss = 152.45248, step = 15801 (0.850 sec)
INFO:tensorflow:global_step/sec: 118.329
INFO:tensorflow:loss = 120.06522, step = 15901 (0.845 sec)
INFO:tensorflow:Saving checkpoints for 16000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./w

INFO:tensorflow:global_step/sec: 118.056
INFO:tensorflow:loss = 71.510124, step = 19401 (0.848 sec)
INFO:tensorflow:global_step/sec: 107.482
INFO:tensorflow:loss = 99.960594, step = 19501 (0.930 sec)
INFO:tensorflow:global_step/sec: 107.08
INFO:tensorflow:loss = 69.829155, step = 19601 (0.934 sec)
INFO:tensorflow:global_step/sec: 108.675
INFO:tensorflow:loss = 93.70269, step = 19701 (0.920 sec)
INFO:tensorflow:global_step/sec: 107.765
INFO:tensorflow:loss = 74.388, step = 19801 (0.928 sec)
INFO:tensorflow:global_step/sec: 108.347
INFO:tensorflow:loss = 99.80997, step = 19901 (0.922 sec)
INFO:tensorflow:Saving checkpoints for 20000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.

INFO:tensorflow:global_step/sec: 121.632
INFO:tensorflow:loss = 29.816235, step = 23601 (0.822 sec)
INFO:tensorflow:global_step/sec: 119.554
INFO:tensorflow:loss = 38.68205, step = 23701 (0.837 sec)
INFO:tensorflow:global_step/sec: 113.08
INFO:tensorflow:loss = 79.60011, step = 23801 (0.884 sec)
INFO:tensorflow:global_step/sec: 102.768
INFO:tensorflow:loss = 38.83442, step = 23901 (0.973 sec)
INFO:tensorflow:Saving checkpoints for 24000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:06:33Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/

INFO:tensorflow:loss = 16.001102, step = 27701 (0.869 sec)
INFO:tensorflow:global_step/sec: 116.827
INFO:tensorflow:loss = 11.717692, step = 27801 (0.856 sec)
INFO:tensorflow:global_step/sec: 116.82
INFO:tensorflow:loss = 14.7532215, step = 27901 (0.856 sec)
INFO:tensorflow:Saving checkpoints for 28000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:07:13Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-28000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluatio

INFO:tensorflow:global_step/sec: 109.347
INFO:tensorflow:loss = 8.667614, step = 31901 (0.914 sec)
INFO:tensorflow:Saving checkpoints for 32000 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:07:54Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-32000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:07:54
INFO:tensorflow:Saving dict for global step 32000: global_step = 32000, loss = 2.9795954
INFO

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:08:34Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-36000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:08:34
INFO:tensorflow:Saving dict for global step 36000: global_step = 36000, loss = 2.2982242
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 36000: ./workspace/models/cooc2emb-01/model.ckpt-36000
INFO:tensorflow:global_step/sec: 50.6101
INFO:tensorflow:loss = 2.669294, st

INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:09:16Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-40000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:09:16
INFO:tensorflow:Saving dict for global step 40000: global_step = 40000, loss = 1.8630394
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 40000: ./workspace/models/cooc2emb-01/model.ckpt-40000
INFO:tensorflow:global_step/sec: 49.4978
INFO:tensorflow:loss = 1.871636, step = 40001 (2.020 sec)
INFO:tensorflow:global_step/sec: 128.258
INFO:tensorflow:loss = 3.171071, step = 40101 (0.780 sec)
INFO:tensorflow:global_step/sec: 114.728
INFO:tensor

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-26T20:09:58Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-44000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:09:58
INFO:tensorflow:Saving dict for global step 44000: global_step = 44000, loss = 1.228081
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 44000: ./workspace/models/cooc2emb-01/model.ckpt-44000
INFO:tensorflow:global_step/sec: 56.0836
INFO:tensorflow:loss = 1.9463851, step = 44001 (1.783 sec)
INFO:tensorflow:global_step/sec: 76.2156
INFO:tensorflow:loss = 2.4886556, step = 44101 (1.312 sec)
INFO:tensorflow:global_step/sec: 93.4003
INFO:tensorflow:loss = 3.5987377, step = 44201 (1.072 sec)
INFO:tensorflow:global_step/sec: 70.2242
INFO:tensorflow:loss = 2.0854924, step = 44301 (1.

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-09-26-20:10:47
INFO:tensorflow:Saving dict for global step 48000: global_step = 48000, loss = 1.1819469
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 48000: ./workspace/models/cooc2emb-01/model.ckpt-48000
INFO:tensorflow:global_step/sec: 38.3537
INFO:tensorflow:loss = 1.9475672, step = 48001 (2.607 sec)
INFO:tensorflow:global_step/sec: 66.8592
INFO:tensorflow:loss = 1.3619143, step = 48101 (1.495 sec)
INFO:tensorflow:global_step/sec: 65.8507
INFO:tensorflow:loss = 2.3992617, step = 48201 (1.519 sec)
INFO:tensorflow:global_step/sec: 76.0328
INFO:tensorflow:loss = 4.3166556, step = 48301 (1.315 sec)
INFO:tensorflow:global_step/sec: 68.5563
INFO:tensorflow:loss = 1.0231736, step = 48401 (1.460 sec)
INFO:tensorflow:global_step/sec: 73.9979
INFO:tensorflow:loss = 1.8855752, step = 48501 (1.354 sec)
INFO:tensorflow:

<img src="loss.png" width="600" height="400"/>


## 8. Extract embeddings

In [20]:
def extract_embeddings():
    
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(os.path.join(model_dir, 'model.ckpt-{}.meta'.format(params.traning_steps)))
        saver.restore(sess, os.path.join(model_dir, 'model.ckpt-{}'.format(params.traning_steps)))
        graph = tf.get_default_graph()
        weights_tensor = graph.get_tensor_by_name('input_layer_1/item2_embedding/embedding_weights:0')
        weights = np.array(sess.run(weights_tensor))

    return weights

In [21]:
embeddings = extract_embeddings()
print(len(embeddings))
print(embeddings[0])

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-50000
39195
[ 0.15476117 -0.22028464  0.5029515   0.08361941  0.01662882  0.20098309
  0.02289918  0.30216253 -0.145497    0.54127693 -0.03063745  0.09072404
 -0.02569989  0.23658207 -0.14141276 -0.18085752 -0.02175103  0.18658789
  0.3142943  -0.1167581  -0.24651718  0.3543694  -0.05479481 -0.04769993
 -0.26089588  0.03063611  0.573654   -0.03463669 -0.44228756  0.09221426
 -0.26482692  0.33758983]


In [22]:
vocab_path = os.path.join(COOC_DIR,'vocab.txt')
output_path = os.path.join(WORKSPACE,'embeddings.tsv')

def write_embeddings_to_tsv():
    with open(output_path, 'w') as out_f:
        with open(vocab_path) as vocab_f:
            for index, item in enumerate(vocab_f):
                embedding = embeddings[index]
                print('\t'.join([item.strip()] + [str(x) for x in embedding]), file=out_f)
                
write_embeddings_to_tsv()

In [23]:
!head {output_path}

3140490	0.15476117	-0.22028464	0.5029515	0.08361941	0.016628817	0.20098309	0.022899184	0.30216253	-0.145497	0.54127693	-0.030637449	0.090724036	-0.02569989	0.23658207	-0.14141276	-0.18085752	-0.02175103	0.18658789	0.3142943	-0.1167581	-0.24651718	0.3543694	-0.05479481	-0.047699932	-0.26089588	0.030636115	0.573654	-0.034636687	-0.44228756	0.092214264	-0.26482692	0.33758983
1174666	-0.17831224	-0.42993304	-0.0839878	0.29915634	0.03356193	0.31322202	-0.042066813	-0.051706303	-0.37178084	0.14129065	0.080491856	-0.06896243	-0.30283847	0.026865078	0.13399	-0.18424447	-0.091643706	0.030812453	-0.025963543	-0.20992623	0.087224096	-0.013801538	-0.27795655	-0.021292964	0.0033754818	-0.022404164	0.21593037	-0.14364143	-0.0361688	0.117180236	-0.09234736	0.14837469
4124042	0.18704276	-0.08447446	0.062283743	0.085570484	0.040571995	0.22841203	0.18322597	0.019756168	0.06682893	0.29113013	-0.11036961	0.51800394	-0.12652962	0.055464357	-0.09905837	-0.38206455	0.2718967	0.23420781	0.072959624	-0.25740

## 9. Export saved model as item-embedding lookup

In [24]:
def make_serving_input_receiver_fn():
    return tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'item1': tf.placeholder(shape=[None], dtype=tf.string)}
    )

export_dir = os.path.join(model_dir, 'export')

if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
        
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)

Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predictions', 'serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:ten

b'./workspace/models/cooc2emb-01/export/1569525078'

In [25]:
export_dir = os.path.join(model_dir, "export")
saved_model_dir = os.path.join(
    export_dir, [f for f in os.listdir(export_dir) if f.isdigit()][0])

print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
)

output = predictor_fn({'item1': ['56430']})
print(output)

./workspace/models/cooc2emb-01/export/1569525078
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/export/1569525078/variables/variables
{'output': array([[-0.4528324 ,  0.30650634, -0.6440126 , -0.3283458 , -0.04250643,
        -0.19180682,  0.04424484, -0.2118475 ,  0.47607163, -0.12407511,
         0.0424717 , -0.11153425, -0.17166878, -0.31536543,  0.1275056