# Training Embeddings for Words

This tutorial shows how to learn **word embeddings** from co-occurrence statistics.

We use a simple NN architecture, along with the conditional cost function used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm.

The learnt embeddings are then extracted from the model and saved as a TSV file.

The following are the steps of this tutorial:


1. Define input data metadata
2. Implement data input function
3. Create feature columns
4. Create a custom estimator
5. Define the train and evaluate experiment
6. Set the experiment parameters
7. Run the experiment
8. Extract the learnt **word embeddings** from the model

<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/text2emb/02-Training_Embeddings_for_Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [2]:
import os
import math
import numpy as np
import tensorflow as tf
from datetime import datetime

In [3]:
# Root folder for everything this notebook reads or writes.
WORKSPACE = './workspace'
# Folder holding the co-occurrence TFRecords, info.log and vocab.txt.
COOC_DIR = '/'.join((WORKSPACE, 'cooc'))
# Parent folder for the trained model directories.
MODELS_DIR = '/'.join((WORKSPACE, 'models'))
# Random seed handed to the estimator run configuration.
SEED = 19831060

In [4]:
# List the co-occurrence artefacts found in COOC_DIR.
!echo "Files:"
!ls {COOC_DIR}/
!echo ""

# Show the first lines of info.log (read again later to size the dataset).
!echo "info:"
!head {COOC_DIR}/info.log
!echo ""

# Count the lines of the vocabulary file.
!echo "Vocab count"
!wc -l  {COOC_DIR}/vocab.txt
!echo ""

Files:
cooc-00000-of-00016.tfrecords  cooc-00009-of-00016.tfrecords
cooc-00001-of-00016.tfrecords  cooc-00010-of-00016.tfrecords
cooc-00002-of-00016.tfrecords  cooc-00011-of-00016.tfrecords
cooc-00003-of-00016.tfrecords  cooc-00012-of-00016.tfrecords
cooc-00004-of-00016.tfrecords  cooc-00013-of-00016.tfrecords
cooc-00005-of-00016.tfrecords  cooc-00014-of-00016.tfrecords
cooc-00006-of-00016.tfrecords  cooc-00015-of-00016.tfrecords
cooc-00007-of-00016.tfrecords  info.log
cooc-00008-of-00016.tfrecords  vocab.txt

info:
min: -5.136295533922434
P: 905289
max: 9.184321336159051
N: 3821288

Vocab count
4632 ./workspace/cooc/vocab.txt



## 1.  Metadata

In [5]:
# Name of the feature used as the per-example loss weight.
WEIGHT_FEATURE_NAME = 'weight'
# Name of the feature used as the regression label.
TARGET_FEATURE_NAME = 'score'

# Parsing spec for one serialized example: every feature is a scalar
# (shape=()), either a string or a float32.
FEATURES_SCHEMA = {
    feature_name: tf.FixedLenFeature(dtype=feature_dtype, shape=())
    for feature_name, feature_dtype in [
        ('item1', tf.string),
        ('item2', tf.string),
        (TARGET_FEATURE_NAME, tf.float32),
        (WEIGHT_FEATURE_NAME, tf.float32),
        ('type', tf.string),
    ]
}

## 2.  Data Input Function

In [6]:
def make_input_fn(
    file_pattern, batch_size=128, num_epochs=1, shuffle=False):
    """Create a zero-argument input function over TFRecord files.

    Args:
      file_pattern: File pattern forwarded to
        `make_batched_features_dataset`.
      batch_size: Number of examples per batch; the shuffle buffer is
        sized at twice this value.
      num_epochs: Number of passes over the data, forwarded unchanged.
      shuffle: Whether the dataset should be shuffled.

    Returns:
      A callable taking no arguments that returns the dataset built by
      `tf.data.experimental.make_batched_features_dataset`, parsed with
      FEATURES_SCHEMA and using TARGET_FEATURE_NAME as the label key.
    """

    def _input_fn():
        # Settings that do not identify the files or the batch size.
        dataset_options = dict(
            features=FEATURES_SCHEMA,
            label_key=TARGET_FEATURE_NAME,
            reader=tf.data.TFRecordDataset,
            num_epochs=num_epochs,
            shuffle=shuffle,
            shuffle_buffer_size=2 * batch_size,
            sloppy_ordering=True,
            drop_final_batch=False,
        )
        return tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=batch_size,
            **dataset_options
        )

    return _input_fn

## 3. Feature Columns

In [7]:
def create_feature_columns(embedding_size, vocab_file):
    """Build the shared embedding columns for the `item1`/`item2` features.

    Args:
      embedding_size: Dimension passed to `shared_embedding_columns`.
      vocab_file: Vocabulary file used by both categorical columns.

    Returns:
      The result of `tf.feature_column.shared_embedding_columns` for the
      two vocabulary-file categorical columns, given in the order
      `item1`, `item2`.
    """
    categorical_columns = [
        tf.feature_column.categorical_column_with_vocabulary_file(
            key=item_key,
            vocabulary_file=vocab_file
        )
        for item_key in ('item1', 'item2')
    ]

    return tf.feature_column.shared_embedding_columns(
        categorical_columns, embedding_size)

## 4.  Custom Estimator

In [8]:
def compute_loss(labels, predictions, weights, types):
    """Mean weighted loss that depends on the example type.

    Examples whose type equals 'P' contribute
    `0.5 * weight * (prediction - label)^2`; every other example
    contributes `weight * softplus(prediction - label)`.

    Args:
      labels: Target scores.
      predictions: Predicted scores.
      weights: Per-example weights.
      types: Per-example type strings; 'P' selects the squared-error term.

    Returns:
      The mean of the per-example losses.
    """
    residual = predictions - labels

    # 1.0 where the example type is 'P', 0.0 elsewhere.
    is_positive = tf.cast(tf.equal(types, 'P'), tf.float32)

    squared_term = 0.5 * weights * tf.math.square(residual)
    softplus_term = 1.0 * weights * tf.math.softplus(residual)

    per_example = (
        is_positive * squared_term + (1 - is_positive) * softplus_term)

    return tf.reduce_mean(per_example)
   

def model_fn(features, labels, mode, params):
    """Estimator model function: score a pair as the dot product of embeddings.

    Args:
      features: Dict of feature tensors. `item1` and `item2` are always
        read; the weight and `type` features are read only when a loss is
        computed (TRAIN / EVAL).
      labels: Target scores; not used in PREDICT mode.
      mode: One of `tf.estimator.ModeKeys`.
      params: Object exposing `embedding_size`, `vocab_file` and
        `learning_rate`.

    Returns:
      A `tf.estimator.EstimatorSpec`. In PREDICT mode it carries the
      predicted scores under TARGET_FEATURE_NAME; otherwise it carries the
      loss, plus the Adam training op in TRAIN mode.
    """

    feature_columns = create_feature_columns(
        params.embedding_size, params.vocab_file)

    item1_layer = tf.feature_column.input_layer(
        features={'item1': features['item1']}, feature_columns=[feature_columns[0]])

    item2_layer = tf.feature_column.input_layer(
        features={'item2': features['item2']}, feature_columns=[feature_columns[1]])

    # Squeeze only axis 1 of the dot product. A bare tf.squeeze would also
    # drop the batch axis whenever a batch holds a single example, turning
    # the predictions into a scalar.
    predictions = tf.squeeze(
        tf.keras.layers.Dot(axes=1)([item1_layer, item2_layer]), axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # No labels are available here, so return before building the loss.
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={TARGET_FEATURE_NAME: predictions}
        )

    loss = compute_loss(
        labels=labels,
        predictions=predictions,
        weights=features[WEIGHT_FEATURE_NAME],
        types=features['type']
    )

    # The optimizer is only needed when training.
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer(params.learning_rate).minimize(
            loss=loss,
            global_step=tf.train.get_global_step()
        )

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op
    )
    

def create_estimator(params, run_config):
    """Wrap `model_fn` in a `tf.estimator.Estimator`.

    Args:
      params: Hyper-parameter object handed to `model_fn` as `params`.
      run_config: Run configuration for the estimator.

    Returns:
      The constructed `tf.estimator.Estimator`.
    """
    return tf.estimator.Estimator(
        model_fn=model_fn,
        params=params,
        config=run_config
    )

## 5. Experiment

In [9]:
def run_experiment(estimator, params):
    """Train and evaluate the estimator, printing progress and elapsed time.

    Args:
      estimator: The `tf.estimator.Estimator` to train and evaluate.
      params: Object exposing `train_data_files`, `eval_data_files`,
        `batch_size`, `train_steps`, `eval_steps` and `eval_throttle_secs`.
    """

    train_data_files = params.train_data_files
    eval_data_files = params.eval_data_files

    # TrainSpec ####################################

    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(
            train_data_files,
            batch_size=params.batch_size,
            num_epochs=None,  # repeat indefinitely; max_steps ends training
            shuffle=True
        ),
        max_steps=params.train_steps
    )

    # EvalSpec ####################################

    # NOTE(review): evaluation runs with single-example batches rather than
    # params.batch_size -- confirm this is intended and not a debugging leftover.
    eval_spec = tf.estimator.EvalSpec(
        input_fn=make_input_fn(
            eval_data_files,
            batch_size=1
        ),
        steps=params.eval_steps,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs
    )
    ###############################################

    time_start = datetime.utcnow()
    print("Experiment started...")
    print(".......................................")

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow()
    print(".......................................")
    print("Experiment finished.")

    # Report how long the run took.
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(
        time_elapsed.total_seconds()))


## 6. Parameters 

In [10]:
MODEL_NAME = 'cooc2emb-07'
model_dir = os.path.join(MODELS_DIR, MODEL_NAME)
info_file = os.path.join(COOC_DIR, 'info.log')

# Statistics parsed from info.log as {key: float}; stays empty when the
# file is absent.
info_map = {}

if os.path.exists(info_file):
    with open(info_file) as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            # Split on the first colon only, so a value containing ':' does
            # not break the two-way unpacking.
            key, value = line.split(":", 1)
            info_map[key.strip()] = float(value)
    
class HParams(object):
    """Empty attribute container; hyper-parameters are attached as attributes."""


# Base batch size and learning rate; both are scaled by `multiplier` below.
batch_size = 128
learning_rate = 1.0e-04
multiplier = 8

# Total number of examples: the 'P' and 'N' counts read from info.log.
dataset_size = int(info_map['P'] + info_map['N'])
# Kept fractional so that num_steps covers the partial final step of each epoch.
steps_per_epoch = dataset_size / (batch_size * multiplier)
num_epochs = 10
num_steps = int(num_epochs * steps_per_epoch)

print("Dataset size: {}".format(dataset_size))
print("Batch size: {}".format(batch_size))
print("Steps per epoch: {}".format(int(steps_per_epoch)))
print("Epochs: {}".format(num_epochs))
print("Training steps: {}".format(num_steps))

params  = HParams()
params.train_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.eval_data_files = "{}/cooc-*.tfrecords".format(COOC_DIR)
params.vocab_file = os.path.join(COOC_DIR,'vocab.txt')
params.embedding_size = 64
params.batch_size = batch_size * multiplier
params.train_steps = num_steps
params.learning_rate = learning_rate * multiplier
# Step counts are integers; `//` on a float would otherwise yield e.g. 2307.0.
params.lr_first_cycle = int(steps_per_epoch // 2)
params.eval_steps = 1
params.eval_throttle_secs = 0

print(vars(params))

run_config = tf.estimator.RunConfig(
    tf_random_seed=SEED,
    # Checkpoint five times per epoch; cast so an integer step count is passed.
    save_checkpoints_steps=int(steps_per_epoch // 5),
    keep_checkpoint_max=3,
    model_dir=model_dir,
)

print("Experiment parameters are set.")

Dataset size: 4726577
Batch size: 128
Steps per epoch: 4615
Epochs: 10
Training steps: 46157
{'eval_data_files': './workspace/cooc/cooc-*.tfrecords', 'batch_size': 1024, 'train_data_files': './workspace/cooc/cooc-*.tfrecords', 'vocab_file': './workspace/cooc/vocab.txt', 'learning_rate': 0.0008, 'eval_steps': 1, 'eval_throttle_secs': 0, 'embedding_size': 64, 'lr_first_cycle': 2307.0, 'train_steps': 46157}
Experiment parameters are set.


In [11]:
# Make only the first physical GPU visible to TensorFlow, if any exists.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPUs detected')
else:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)

1 Physical GPUs, 1 Logical GPU


## 7. Run

In [12]:
tf.logging.set_verbosity(tf.logging.INFO)

if tf.gfile.Exists(run_config.model_dir):
    print("Removing previous artefacts...")
    tf.gfile.DeleteRecursively(run_config.model_dir)
            
estimator = create_estimator(params, run_config)
%time run_experiment(estimator, params)

Removing previous artefacts...
INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, '_keep_checkpoint_every_n_hours': 10000, '_global_id_in_cluster': 0, '_device_fn': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_experimental_distribute': None, '_log_step_count_steps': 100, '_keep_checkpoint_max': 3, '_is_chief': True, '_task_id': 0, '_protocol': None, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_save_checkpoints_steps': 923.0, '_tf_random_seed': 19831060, '_train_distribute': None, '_evaluation_master': '', '_model_dir': './workspace/models/cooc2emb-07', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd611fb5518>, '_master': '', '_task_type': 'worker', '_num_worker_replicas': 1, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
}
Experiment started...
.......................................
INFO:tensorflow:Not u

## 8. Extract Word embeddings

In [13]:
def extract_embeddings(model_dir, checkpoint):
    """Read the shared embedding matrix from a saved checkpoint.

    Args:
      model_dir: Directory containing the `model.ckpt-<checkpoint>` files.
      checkpoint: Checkpoint number (global step) to load.

    Returns:
      A numpy array with the values of the tensor
      `input_layer/item1_item2_shared_embedding/embedding_weights:0`.
    """
    checkpoint_prefix = os.path.join(
        model_dir, 'model.ckpt-{}'.format(checkpoint))

    # Import into a private graph. Importing into the global default graph
    # would add a fresh copy of the model's nodes every time this function
    # is called.
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:
            saver = tf.train.import_meta_graph(checkpoint_prefix + '.meta')
            saver.restore(sess, checkpoint_prefix)
            weights_tensor = graph.get_tensor_by_name(
                'input_layer/item1_item2_shared_embedding/embedding_weights:0')
            weights = np.array(sess.run(weights_tensor))

    return weights

In [14]:
# Load the checkpoint numbered with the configured number of training steps.
checkpoint = params.train_steps
embeddings = extract_embeddings(model_dir=model_dir, checkpoint=checkpoint)

# Number of embedding rows, followed by the first embedding vector.
print(embeddings.shape[0])
print(embeddings[0])

INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-07/model.ckpt-46157
4632
[ 1.0213016  -0.10725072  0.7739918   0.0332012   0.42940634  0.1397516
  1.0675998  -0.73517615  0.8547443  -0.43006274 -0.5750132  -0.3696358
  0.5030041  -0.275972   -0.27586427 -0.41647613 -0.56597006 -0.57236856
 -0.67539805 -0.3738892   0.3473007   0.26539868 -0.42769083 -0.39912197
 -0.59455264  0.06485956  0.01481059  0.45336306 -0.23120002  0.25990224
  0.8420276   1.107458    0.21145105 -0.76536393  0.736067   -0.40591478
  0.77940476 -0.47856522  0.15879287 -0.31453386  0.64187175  0.7946321
  0.5779661   0.13116291  0.47162905 -0.7195507   0.53635824  0.5119927
 -0.06278135 -0.49129912 -0.30125576  0.15781142  0.38197124 -0.10838864
  0.41809973  0.35847008 -0.04960598 -0.22166592 -0.40517446 -0.2888908
  0.4890815  -0.13018125 -0.19192408 -0.5896855 ]


In [15]:
# The TSV export goes to the workspace root; the vocabulary is read from the
# co-occurrence folder.
output_path = os.path.join(WORKSPACE, 'embeddings.tsv')
vocab_path = os.path.join(COOC_DIR, 'vocab.txt')

# Delete an export left over from a previous run.
if tf.io.gfile.exists(output_path):
    print("Removing {} ...".format(output_path))
    tf.io.gfile.remove(output_path)

def write_embeddings_to_tsv():
    """Write one tab-separated row per vocabulary entry to `output_path`.

    Reads the module-level `vocab_path`, `output_path` and `embeddings`.
    Each row holds the stripped vocabulary item followed by the values of
    the embedding at the same line index.
    """
    with open(output_path, 'w') as out_f, open(vocab_path) as vocab_f:
        for row_id, vocab_line in enumerate(vocab_f):
            fields = [vocab_line.strip()]
            fields.extend(str(value) for value in embeddings[row_id])
            out_f.write('\t'.join(fields) + '\n')

    print("Embeddings are written to: {}".format(output_path))
                
# Export the embeddings using the module-level paths and `embeddings` array.
write_embeddings_to_tsv()

Removing ./workspace/embeddings.tsv ...
Embeddings are written to: ./workspace/embeddings.tsv


In [16]:
# Preview the first rows of the exported TSV file.
!head {output_path}

comparisons	1.0213016	-0.10725072	0.7739918	0.033201203	0.42940634	0.1397516	1.0675998	-0.73517615	0.8547443	-0.43006274	-0.5750132	-0.3696358	0.5030041	-0.275972	-0.27586427	-0.41647613	-0.56597006	-0.57236856	-0.67539805	-0.3738892	0.3473007	0.26539868	-0.42769083	-0.39912197	-0.59455264	0.06485956	0.014810595	0.45336306	-0.23120002	0.25990224	0.8420276	1.107458	0.21145105	-0.76536393	0.736067	-0.40591478	0.77940476	-0.47856522	0.15879287	-0.31453386	0.64187175	0.7946321	0.5779661	0.13116291	0.47162905	-0.7195507	0.53635824	0.5119927	-0.06278135	-0.49129912	-0.30125576	0.15781142	0.38197124	-0.10838864	0.41809973	0.35847008	-0.049605977	-0.22166592	-0.40517446	-0.2888908	0.4890815	-0.13018125	-0.19192408	-0.5896855
vampires	0.035619188	0.04893078	-0.084809445	-0.06471856	0.52670044	-0.0055025746	-0.037620332	0.016188823	0.07806883	0.08989796	0.06998612	0.08039018	0.33301553	-0.29314372	-0.4904764	-0.23849678	-0.13123038	0.34412712	0.057331108	0.040416718	-0.40215397	0.41187695	-0.311