# Training Embeddings for Playlist Tracks

This tutorial shows how to learn **item embeddings** from co-occurrence statistics.

We use the following model architecture, along with the conditional cost function used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm.

The learnt embeddings are then extracted from the model and saved as a TSV file.

<img src="cooc2emb.png" width="600" height="400"/>

The following are the steps of this tutorial:


1. Define input data metadata
2. Implement data input function
3. Create feature columns
4. Create a custom estimator
5. Define the train and evaluate experiment
6. Set the experiment parameters
7. Run the experiment
8. Extract the learnt embeddings from the model
9. Export the saved model to serve as an embedding lookup


<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/track2ann/02-Training_Embeddings_for_Playlist_Tracks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
# !pip install -r ../requirements.txt

In [2]:
import os
import math
import numpy as np
import tensorflow as tf
from datetime import datetime

In [3]:
# Root folder for the artefacts this notebook reads and writes.
WORKSPACE = './workspace'
# Sub-folders of the workspace: co-occurrence inputs and trained models.
COOC_DIR = WORKSPACE + '/cooc'
MODELS_DIR = WORKSPACE + '/models'
# Random seed handed to the estimator run configuration.
SEED = 19831060

In [4]:
# List the contents of the co-occurrence directory.
!echo "Files:"
!ls {COOC_DIR}/
!echo ""

# Show the first lines of the info log (read again later to size the dataset).
!echo "info:"
!head {COOC_DIR}/info.log
!echo ""

# Show the first entries of the vocabulary file.
!echo "vocab file:"
!head {COOC_DIR}/vocab.txt

Files:
cooc-00000-of-00001.tfrecords info2.log
info.log                      vocab.txt

info:
P: 2249477
N: 486352
max: 13.33242
min: 2.2812

vocab file:
3140490
1174666
4124042
1176202
4190858
1110922
914570
980106
63370
849802


## 1.  Metadata

In [5]:
# Name and dtype of every field stored in a co-occurrence record.
_FEATURE_DTYPES = (
    ('item1', tf.string),
    ('item2', tf.string),
    ('score', tf.float32),
    ('weight', tf.float32),
    ('type', tf.string),
)

# Parsing spec for the records: each field is a fixed-length scalar.
FEATURES_SCHEMA = {
    name: tf.FixedLenFeature(dtype=dtype, shape=())
    for name, dtype in _FEATURE_DTYPES
}

# 'weight' scales each example's loss term; 'score' is the regression label.
WEIGHT_FEATURE_NAME = 'weight'
TARGET_FEATURE_NAME = 'score'

## 2.  Data Input Function

In [6]:
def make_input_fn(
    file_pattern, batch_size=128, num_epochs=1, shuffle=False):
    """Create an input function that reads batched records from TFRecord files.

    Args:
        file_pattern: File pattern of the TFRecord files to read.
        batch_size: Number of records per batch; the shuffle buffer holds
            twice this many records.
        num_epochs: Forwarded unchanged to the dataset builder.
        shuffle: Whether the dataset builder should shuffle the records.

    Returns:
        A zero-argument callable that builds the dataset. Records are parsed
        with FEATURES_SCHEMA and the TARGET_FEATURE_NAME field is used as the
        label.
    """

    def _input_fn():
        reader_options = dict(
            features=FEATURES_SCHEMA,
            label_key=TARGET_FEATURE_NAME,
            reader=tf.data.TFRecordDataset,
            num_epochs=num_epochs,
            shuffle=shuffle,
            shuffle_buffer_size=2 * batch_size,
            sloppy_ordering=True,
            # Keep the last, possibly smaller, batch.
            drop_final_batch=False,
        )
        return tf.data.experimental.make_batched_features_dataset(
            file_pattern, batch_size, **reader_options)

    return _input_fn

## 3. Feature Columns

In [7]:
def create_feature_columns(embedding_size, vocab_file):
    """Build the embedding columns for 'item1' and 'item2'.

    Both keys are mapped through the same vocabulary file and wrapped with
    `shared_embedding_columns`, so they share one embedding table.

    Args:
        embedding_size: Dimension of the shared embedding.
        vocab_file: Path of the vocabulary file used for both keys.

    Returns:
        The list returned by `tf.feature_column.shared_embedding_columns`;
        callers index it as [item1 column, item2 column].
    """
    categorical_columns = [
        tf.feature_column.categorical_column_with_vocabulary_file(
            key=item_key,
            vocabulary_file=vocab_file
        )
        for item_key in ('item1', 'item2')
    ]

    return tf.feature_column.shared_embedding_columns(
        categorical_columns, embedding_size)

## 4.  Custom Estimator

In [8]:
def compute_loss(labels, predictions, weights, types):
    """Compute the weighted, type-conditional training loss.

    Examples whose type equals 'P' contribute a weighted half squared error;
    all other examples contribute a weighted softplus of the error.

    Args:
        labels: Target scores.
        predictions: Predicted scores.
        weights: Per-example weights applied to each loss term.
        types: String tensor with the example type ('P' marks a positive).

    Returns:
        The mean of the per-example losses.
    """
    residuals = predictions - labels

    # 1.0 where the example type is 'P', 0.0 everywhere else.
    is_positive = tf.cast(tf.equal(types, 'P'), tf.float32)
    is_negative = 1 - is_positive

    squared_term = 0.5 * weights * tf.math.square(residuals)
    softplus_term = 1.0 * weights * tf.math.softplus(residuals)

    per_example = is_positive * squared_term + is_negative * softplus_term

    return tf.reduce_mean(per_example)
    

def model_fn(features, labels, mode, params):
    """Estimator model function for the co-occurrence embedding model.

    In PREDICT mode the model returns the embedding of 'item1'. In TRAIN and
    EVAL modes the prediction is the dot product of the 'item1' and 'item2'
    embeddings, scored against the labels with `compute_loss`.

    Args:
        features: Dict of input tensors. 'item1' is always required; 'item2',
            'type' and the WEIGHT_FEATURE_NAME entry are required outside
            PREDICT mode.
        labels: Target scores (unused in PREDICT mode).
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Object exposing `embedding_size`, `vocab_file` and
            `learning_rate` attributes.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    feature_columns = create_feature_columns(
        params.embedding_size, params.vocab_file)

    # The item1 layer is built first in every mode so the shared embedding
    # variable is always created under the same 'input_layer' scope.
    item1_layer = tf.feature_column.input_layer(
        features={'item1': features['item1']},
        feature_columns=[feature_columns[0]])

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Serving only needs the embedding lookup for a single item.
        predictions = item1_layer
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'predictions': tf.estimator.export.PredictOutput(predictions)
            }
        )

    item2_layer = tf.feature_column.input_layer(
        features={'item2': features['item2']},
        feature_columns=[feature_columns[1]])

    dot_product = tf.keras.layers.Dot(axes=1)([item1_layer, item2_layer])
    # Squeeze only the trailing unit axis: a bare tf.squeeze would also drop
    # the batch axis whenever a batch holds a single example.
    predictions = tf.squeeze(dot_product, axis=1)

    loss = compute_loss(
        labels=labels,
        predictions=predictions,
        weights=features[WEIGHT_FEATURE_NAME],
        types=features['type']
    )

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # The optimizer is only needed when training; evaluation just
        # reports the loss.
        train_op = tf.train.AdamOptimizer(params.learning_rate).minimize(
            loss=loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op
    )


def create_estimator(params, run_config):
    """Wrap `model_fn` in a `tf.estimator.Estimator`.

    Args:
        params: Hyper-parameter object forwarded to `model_fn`.
        run_config: `tf.estimator.RunConfig` used by the estimator.

    Returns:
        The configured estimator.
    """
    return tf.estimator.Estimator(
        model_fn=model_fn, params=params, config=run_config)

## 5. Experiment

In [9]:
def run_experiment(estimator, params):
    """Train and evaluate the estimator, printing progress and elapsed time.

    Args:
        estimator: The estimator to train and evaluate.
        params: Object exposing `train_data_files`, `eval_data_files`,
            `batch_size`, `train_steps`, `eval_steps` and
            `eval_throttle_secs` attributes.
    """
    train_data_files = params.train_data_files
    eval_data_files = params.eval_data_files

    # TrainSpec ####################################

    # num_epochs=None leaves the stopping condition to max_steps.
    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(
            train_data_files,
            batch_size=params.batch_size,
            num_epochs=None,
            shuffle=True
        ),
        max_steps=params.train_steps
    )

    # EvalSpec ####################################

    # NOTE(review): evaluation reads single-example batches; an earlier
    # version of this code used params.batch_size here -- confirm which one
    # is intended.
    eval_spec = tf.estimator.EvalSpec(
        input_fn=make_input_fn(
            eval_data_files,
            batch_size=1,
        ),
        steps=params.eval_steps,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs
    )
    ###############################################

    time_start = datetime.utcnow()
    print("Experiment started...")
    print(".......................................")

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow()
    print(".......................................")
    print("Experiment finished.")

    # Report the duration that the two timestamps were captured for.
    elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(
        elapsed.total_seconds()))


## 6. Parameters 

In [10]:
MODEL_NAME = 'cooc2emb-01'
model_dir = os.path.join(MODELS_DIR, MODEL_NAME)
info_file = os.path.join(COOC_DIR, 'info.log')

# Dataset statistics parsed from info.log, one "key: value" pair per line.
info_map = {}

if os.path.exists(info_file):
    with open(info_file) as f:
        for line in f:
            key, separator, value = line.partition(":")
            if not separator:
                # Blank line or a line without a "key: value" pair.
                continue
            try:
                info_map[key.strip()] = float(value)
            except ValueError:
                # Report bad lines instead of silently dropping the rest.
                print("Skipping unparsable line in {}: {!r}".format(
                    info_file, line))

# 'P' and 'N' are summed below to obtain the dataset size; fail early with
# an explicit message when either is unavailable.
missing_keys = [key for key in ('P', 'N') if key not in info_map]
if missing_keys:
    raise KeyError(
        "Missing {} in {}; cannot derive the dataset size.".format(
            missing_keys, info_file))

batch_size = 128
learning_rate = 1.0e-04
# The effective batch size and learning rate are both scaled by this factor.
multiplier = 8

dataset_size = int(info_map['P'] + info_map['N'])
steps_per_epoch = dataset_size / (batch_size * multiplier)
num_epochs = 15
num_steps = int(num_epochs * steps_per_epoch)

print("Dataset size: {}".format(dataset_size))
print("Batch size: {}".format(batch_size))
print("Steps per epoch: {}".format(int(steps_per_epoch)))
print("Epochs: {}".format(num_epochs))
print("Training steps: {}".format(num_steps))


class HParams():
    """Plain attribute container for the experiment hyper-parameters."""
    pass


params = HParams()
params.train_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.eval_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.vocab_file = os.path.join(COOC_DIR, 'vocab.txt')
params.embedding_size = 32
params.batch_size = batch_size * multiplier
params.train_steps = num_steps
params.learning_rate = learning_rate * multiplier
params.eval_steps = 1
params.eval_throttle_secs = 0

print(vars(params))

run_config = tf.estimator.RunConfig(
    tf_random_seed=SEED,
    # Checkpoint five times per epoch; the step count must be a positive
    # integer, not the float produced by floor-dividing a float.
    save_checkpoints_steps=max(1, int(steps_per_epoch // 5)),
    keep_checkpoint_max=3,
    model_dir=model_dir,
)

print("Experiment parameters are set.")

Dataset size: 2735829
Batch size: 128
Steps per epoch: 2671
Epochs: 15
Training steps: 40075
{'train_data_files': './workspace/cooc/cooc-*.tfrecords', 'eval_data_files': './workspace/cooc/cooc-*.tfrecords', 'vocab_file': './workspace/cooc/vocab.txt', 'embedding_size': 32, 'batch_size': 1024, 'train_steps': 40075, 'learning_rate': 0.0008, 'eval_steps': 1, 'eval_throttle_secs': 0}
Experiment parameters are set.


## 7. Run

In [11]:
# Emit INFO-level TensorFlow log messages during the run.
tf.logging.set_verbosity(tf.logging.INFO)

# Delete any existing model directory before the estimator is created.
if tf.gfile.Exists(run_config.model_dir):
    print("Removing previous artefacts...")
    tf.gfile.DeleteRecursively(run_config.model_dir)
            
estimator = create_estimator(params, run_config)
# %time is an IPython magic that times the statement it prefixes.
%time run_experiment(estimator, params)

Removing previous artefacts...
INFO:tensorflow:Using config: {'_model_dir': './workspace/models/cooc2emb-01', '_tf_random_seed': 19831060, '_save_summary_steps': 100, '_save_checkpoints_steps': 534.0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x134425748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Experiment started...
.......................................
INFO:tensorflow:Not usin

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:40:56Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-1068
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:40:56
INFO:tensorflow:Saving dict for global step 1068: global_step = 1068, loss = 43.268852
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1068: ./workspace/models/cooc2emb-01/model.ckpt-1068
INFO:tensorflow:global_step/sec: 37.6781
INFO:tensorflow:loss = 30.815956, step = 1101 (2.655 sec)
INFO:tensorflow:global_step/sec: 64.7136
INFO:tensorflow:loss = 29.152, step = 1201 (1.544 sec)
INFO:tensorflow:global_step/sec: 61.256
INFO:tensorflow:loss = 33.64283, step = 1301 (1.632 sec)
INFO:tensorflow:global_step/sec: 59.402
INFO:tensorflow:loss = 33.398712, step = 1401 (1.684 sec)
INFO:

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-3738
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:41:41
INFO:tensorflow:Saving dict for global step 3738: global_step = 3738, loss = 34.376507
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3738: ./workspace/models/cooc2emb-01/model.ckpt-3738
INFO:tensorflow:global_step/sec: 41.7527
INFO:tensorflow:loss = 29.429754, step = 3801 (2.395 sec)
INFO:tensorflow:global_step/sec: 136.95
INFO:tensorflow:loss = 27.577173, step = 3901 (0.730 sec)
INFO:tensorflow:global_step/sec: 87.2332
INFO:tensorflow:loss = 31.826597, step = 4001 (1.146 sec)
INFO:tensorflow:global_step/sec: 70.7432
INFO:tensorflow:loss = 29.59535, step = 4101 (1.414 sec)
INFO:tensorflow:global_step/sec: 65.3468
INFO:tensorflow:loss = 32.19809, step = 4201 (1.530 sec)
IN

INFO:tensorflow:global_step/sec: 44.1578
INFO:tensorflow:loss = 24.516024, step = 6501 (2.265 sec)
INFO:tensorflow:global_step/sec: 146.443
INFO:tensorflow:loss = 18.56444, step = 6601 (0.683 sec)
INFO:tensorflow:global_step/sec: 143.508
INFO:tensorflow:loss = 20.020641, step = 6701 (0.697 sec)
INFO:tensorflow:global_step/sec: 85.02
INFO:tensorflow:loss = 21.058098, step = 6801 (1.176 sec)
INFO:tensorflow:global_step/sec: 82.6029
INFO:tensorflow:loss = 22.376478, step = 6901 (1.210 sec)
INFO:tensorflow:Saving checkpoints for 6942 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:42:25Z
INF

INFO:tensorflow:Saving checkpoints for 9612 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:42:58Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-9612
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:42:59
INFO:tensorflow:Saving dict for global step 9612: global_step = 9612, loss = 2.2749555
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 9612: ./workspace/models/cooc2emb-01/model

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-12282
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:43:34
INFO:tensorflow:Saving dict for global step 12282: global_step = 12282, loss = 2.129991
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 12282: ./workspace/models/cooc2emb-01/model.ckpt-12282
INFO:tensorflow:global_step/sec: 32.0236
INFO:tensorflow:loss = 10.166758, step = 12301 (3.128 sec)
INFO:tensorflow:global_step/sec: 122.337
INFO:tensorflow:loss = 6.714791, step = 12401 (0.812 sec)
INFO:tensorflow:global_step/sec: 128.837
INFO:tensorflow:loss = 8.095094, step = 12501 (0.776 sec)
INFO:tensorflow:global_step/sec: 84.3037
INFO:tensorflow:loss = 10.153255, step = 12601 (1.186 sec)
INFO:tensorflow:global_step/sec: 72.4959
INFO:tensorflow:loss = 7.5700054, step = 12701 (1.379 sec)
INFO:tensorflow:global_step/s

INFO:tensorflow:global_step/sec: 36.5388
INFO:tensorflow:loss = 5.610318, step = 15001 (2.739 sec)
INFO:tensorflow:global_step/sec: 158.455
INFO:tensorflow:loss = 4.250322, step = 15101 (0.629 sec)
INFO:tensorflow:global_step/sec: 177.875
INFO:tensorflow:loss = 5.288439, step = 15201 (0.562 sec)
INFO:tensorflow:global_step/sec: 88.3811
INFO:tensorflow:loss = 5.54334, step = 15301 (1.131 sec)
INFO:tensorflow:global_step/sec: 66.6085
INFO:tensorflow:loss = 4.7023916, step = 15401 (1.501 sec)
INFO:tensorflow:Saving checkpoints for 15486 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:44:18Z

INFO:tensorflow:loss = 2.0869517, step = 18101 (1.380 sec)
INFO:tensorflow:Saving checkpoints for 18156 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:44:54Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-18156
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:44:54
INFO:tensorflow:Saving dict for global step 18156: global_step = 18156, loss = 0.00096558215
INFO:tensorflow:Saving 'checkpoint_path'

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:45:31Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-20826
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:45:32
INFO:tensorflow:Saving dict for global step 20826: global_step = 20826, loss = 0.092296176
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 20826: ./workspace/models/cooc2emb-01/model.ckpt-20826
INFO:tensorflow:global_step/sec: 37.4634
INFO:tensorflow:loss = 1.0241492, step = 20901 (2.668 sec)
INFO:tensorflow:global_step/sec: 147.059
INFO:tensorflow:loss = 1.174664, step = 21001 (0.680 sec)
INFO:tensorflow:global_step/sec: 124.722
INFO:tensorflow:loss = 1.3609316, step = 21101 (0.802 sec)
INFO:tensorflow:global_step/sec: 76.5201
INFO:tensorflow:loss = 0.6456757, step = 21201 (

INFO:tensorflow:global_step/sec: 18.4974
INFO:tensorflow:loss = 0.5885432, step = 23501 (5.407 sec)
INFO:tensorflow:global_step/sec: 181.668
INFO:tensorflow:loss = 0.69254315, step = 23601 (0.551 sec)
INFO:tensorflow:global_step/sec: 164.764
INFO:tensorflow:loss = 0.84812546, step = 23701 (0.606 sec)
INFO:tensorflow:global_step/sec: 143.42
INFO:tensorflow:loss = 0.5966329, step = 23801 (0.698 sec)
INFO:tensorflow:global_step/sec: 68.8679
INFO:tensorflow:loss = 0.82700753, step = 23901 (1.452 sec)
INFO:tensorflow:global_step/sec: 51.0094
INFO:tensorflow:loss = 0.5406697, step = 24001 (1.960 sec)
INFO:tensorflow:Saving checkpoints for 24030 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vo

INFO:tensorflow:global_step/sec: 78.5252
INFO:tensorflow:loss = 0.34102553, step = 26601 (1.273 sec)
INFO:tensorflow:Saving checkpoints for 26700 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:46:53Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-26700
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:46:54
INFO:tensorflow:Saving dict for global step 26700: global_step = 26700, loss = 0.59004796
I

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:47:30Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-29370
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:47:31
INFO:tensorflow:Saving dict for global step 29370: global_step = 29370, loss = 0.62992704
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 29370: ./workspace/models/cooc2emb-01/model.ckpt-29370
INFO:tensorflow:global_step/sec: 31.9208
INFO:tensorflow:loss = 0.19046402, step = 29401 (3.133 sec)
INFO:tensorflow:global_step/sec: 157.877
INFO:tensorflow:loss = 0.17461668, step = 29501 (0.635 sec)
INFO:tensorflow:global_step/sec: 132.125
INFO:tensorflow:loss = 0.17740521, step = 29601 (0.756 sec)
INFO:tensorflow:global_step/sec: 89.1973
INFO:tensorflow:loss = 0.16578059, step = 297

INFO:tensorflow:Saving dict for global step 32040: global_step = 32040, loss = 0.6149935
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 32040: ./workspace/models/cooc2emb-01/model.ckpt-32040
INFO:tensorflow:global_step/sec: 37.3011
INFO:tensorflow:loss = 0.14613548, step = 32101 (2.681 sec)
INFO:tensorflow:global_step/sec: 195.424
INFO:tensorflow:loss = 0.14285666, step = 32201 (0.512 sec)
INFO:tensorflow:global_step/sec: 197.307
INFO:tensorflow:loss = 0.16411924, step = 32301 (0.507 sec)
INFO:tensorflow:global_step/sec: 100.669
INFO:tensorflow:loss = 0.1412789, step = 32401 (0.993 sec)
INFO:tensorflow:global_step/sec: 90.1948
INFO:tensorflow:loss = 0.15149859, step = 32501 (1.109 sec)
INFO:tensorflow:Saving checkpoints for 32574 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:voca

INFO:tensorflow:global_step/sec: 161.981
INFO:tensorflow:loss = 0.1556085, step = 35001 (0.617 sec)
INFO:tensorflow:global_step/sec: 83.6302
INFO:tensorflow:loss = 0.12226046, step = 35101 (1.195 sec)
INFO:tensorflow:global_step/sec: 81.0435
INFO:tensorflow:loss = 0.14286438, step = 35201 (1.234 sec)
INFO:tensorflow:Saving checkpoints for 35244 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:48:48Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-35244
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done runni

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-08T14:49:23Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-37914
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-10-08-14:49:23
INFO:tensorflow:Saving dict for global step 37914: global_step = 37914, loss = 0.6365833
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 37914: ./workspace/models/cooc2emb-01/model.ckpt-37914
INFO:tensorflow:global_step/sec: 40.4673
INFO:tensorflow:loss = 0.11069974, 

<img src="loss.png" width="800" height="800"/>


## 8. Extract embeddings

In [12]:
def extract_embeddings(model_dir, checkpoint):
    """Load the shared embedding matrix from a saved training checkpoint.

    Args:
        model_dir: Directory containing the estimator checkpoints.
        checkpoint: Step number of the checkpoint to read; the files
            'model.ckpt-<checkpoint>' and its '.meta' companion are used.

    Returns:
        A numpy array with the values of the shared item embedding weights.
    """
    checkpoint_path = os.path.join(
        model_dir, 'model.ckpt-{}'.format(checkpoint))

    # Import into a private graph: importing into the process-wide default
    # graph would rename the tensors on a second call, so the name lookup
    # below would no longer address the restored variable.
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:
            saver = tf.train.import_meta_graph(checkpoint_path + '.meta')
            saver.restore(sess, checkpoint_path)
            weights_tensor = graph.get_tensor_by_name(
                'input_layer/item1_item2_shared_embedding/embedding_weights:0')
            weights = np.array(sess.run(weights_tensor))

    return weights

In [13]:
# Read the checkpoint numbered with the configured training step count.
checkpoint = params.train_steps
embeddings = extract_embeddings(model_dir=model_dir, checkpoint=checkpoint)

# Show the number of embedding rows and the first embedding as a sanity check.
print(len(embeddings))
print(embeddings[0])

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-40075
39195
[-0.69445676  0.15538701 -0.5623856  -0.41403106  0.47428086 -0.749056
  0.38415983  0.6144499   0.44530433 -0.7627456  -0.35830513 -0.55406684
  0.5798813   0.55426675 -0.48798132 -0.50253195  0.5263182   0.5650592
  0.6243181  -0.4317996  -0.4138922   0.59743804  0.4270064   0.37514323
  0.49481392 -0.56278145 -0.4238821   0.32114398  0.7360854  -0.6360757
  0.44446772  0.003412  ]


In [14]:
vocab_path = os.path.join(COOC_DIR, 'vocab.txt')
output_path = os.path.join(WORKSPACE, 'embeddings.tsv')

# Drop the output of a previous run before writing a new file.
if tf.io.gfile.exists(output_path):
    print("Removing {} ...".format(output_path))
    tf.io.gfile.remove(output_path)


def write_embeddings_to_tsv():
    """Write one TSV row per vocabulary entry: the item, then its embedding.

    Reads the module-level `vocab_path`, `output_path` and `embeddings`.
    The n-th vocabulary line is paired with the n-th embedding row.
    """
    with open(output_path, 'w') as tsv_file, open(vocab_path) as vocab_file:
        for row, token in enumerate(vocab_file):
            fields = [token.strip()]
            fields.extend(str(value) for value in embeddings[row])
            tsv_file.write('\t'.join(fields) + '\n')


write_embeddings_to_tsv()

In [15]:
# Preview the first rows of the embeddings TSV file.
!head {output_path}

3140490	-0.69445676	0.15538701	-0.5623856	-0.41403106	0.47428086	-0.749056	0.38415983	0.6144499	0.44530433	-0.7627456	-0.35830513	-0.55406684	0.5798813	0.55426675	-0.48798132	-0.50253195	0.5263182	0.5650592	0.6243181	-0.4317996	-0.4138922	0.59743804	0.4270064	0.37514323	0.49481392	-0.56278145	-0.4238821	0.32114398	0.7360854	-0.6360757	0.44446772	0.003411999
1174666	-0.58756226	0.22192341	-0.6271466	-0.8924836	0.27205187	-0.76945573	0.62443304	0.5432575	0.66293955	-0.266195	-0.93643594	-0.56892765	0.66121984	0.8806906	-0.5815187	-0.30352914	0.8978216	0.6571457	0.26976705	-0.7229962	-0.61838704	0.5035906	-0.06480688	0.5187317	0.82479	-0.38439116	-0.5100018	0.80537647	0.9591867	-0.96387327	0.5978212	0.30264413
4124042	-0.6341571	0.8740677	-0.47905606	-0.95108676	1.0255765	-0.54103255	0.4663862	0.6009798	0.45064032	-0.53458613	-0.63207287	-0.5925189	0.5517187	0.46762142	-0.5940642	-0.69960827	0.7755746	0.95842564	0.16419093	-0.7661113	-0.7348178	0.5296598	0.6444128	0.6293398	0.36577365	-

## 9. Export saved model as item-embedding lookup

In [16]:
def make_serving_input_receiver_fn():
    """Build the serving input receiver for the embedding lookup model.

    Returns:
        A raw serving input receiver function that accepts a batch of
        strings under the 'item1' key.
    """
    return tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'item1': tf.placeholder(shape=[None], dtype=tf.string)}
    )

export_dir = os.path.join(model_dir, 'export')

# Remove exports from earlier runs before writing a new one.
if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)

# `export_savedmodel` is deprecated; `export_saved_model` is its replacement.
estimator.export_saved_model(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)

Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 39195 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:vocabulary_size = 39195 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab.txt.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predictions', 'serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:ten

b'./workspace/models/cooc2emb-01/export/1570542745'

In [17]:
export_dir = os.path.join(model_dir, "export")

# Exported models are stored in numerically named sub-directories.
export_versions = [f for f in os.listdir(export_dir) if f.isdigit()]
if not export_versions:
    raise FileNotFoundError(
        "No exported model found under {}".format(export_dir))

# os.listdir returns entries in arbitrary order, so choose the highest
# number explicitly instead of taking the first entry.
saved_model_dir = os.path.join(export_dir, max(export_versions, key=int))

print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
)

# Look up the embedding of a single item id.
output = predictor_fn({'item1': ['56430']})
print(output)

./workspace/models/cooc2emb-01/export/1570542745
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/export/1570542745/variables/variables
{'output': array([[-0.9009376 ,  0.572969  , -0.6879163 , -0.72437125,  0.439465  ,
        -0.4253796 ,  0.73146176,  0.6351501 ,  0.77992374, -0.71362484,
        -1.0118732 , -0.4928768 ,  0.2724824 ,  0.63912374, -0.7288149