# Creating Co-occurrence Statistics from Playlist Data in BigQuery

This tutorial shows how to compute **Co-occurrence statistics** for items occurring together in the same context. In this example, the items are music tracks, and the context is playlists. 

The statistic we compute for each co-occurring pair is the [Pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) (PMI), which is used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm for learning embeddings.

<img src="tabular2cooc.png" width="600" height="400"/>

The dataset we use is in [Google BigQuery](https://bigquery.cloud.google.com/table/bigquery-samples:playlists.playlist?pli=1), and we use [Apache Beam](https://beam.apache.org/get-started/beam-overview/) to implement the pmi computation process. Beam can run at scale using [Cloud Dataflow](https://cloud.google.com/dataflow/).

The following are the steps of this tutorial:


1. Extract data from BigQuery to filesystem as CSV.
2. Compute Co-occurrence statistics and store them as TFRecord files.
3. Read statistics in the TFRecords using tf.data APIs

<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/track2ann/01-Creating_Co-occurrence_Stats_from_Playlist_Data_in_BigQuery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
# !pip install -r ../requirements.txt

In [13]:
# If running in Colab, authenticate the current user.
try:
    from google.colab import auth
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime;
    # there is nothing to authenticate here.
    pass
else:
    # Kept outside the try so a genuine authentication failure is reported
    # instead of being silently swallowed.
    auth.authenticate_user()

In [4]:
import os
import math
import apache_beam as beam
import tensorflow as tf
from google.cloud import bigquery
from datetime import datetime

In [14]:
# Cloud project/region and the local working directories used by this notebook.
PROJECT_ID = 'ksalama-cloudml'
REGION = 'europe-west1'
WORKSPACE = './workspace'
# Sub-directories of the workspace: extracted CSV data and co-occurrence output.
DATA_DIR = WORKSPACE + '/data'
COOC_DIR = WORKSPACE + '/cooc'

In [None]:
# Start from a clean slate: delete the workspace tree if it already exists,
# then (re)create the empty workspace directory.
if tf.io.gfile.exists(WORKSPACE):
    print("Removing {} contents...".format(WORKSPACE))
    # NOTE: recursively deletes everything under WORKSPACE.
    tf.io.gfile.rmtree(WORKSPACE)

print("Creating workspace: {}".format(WORKSPACE))
tf.io.gfile.makedirs(WORKSPACE)

### Dataset
* 430K+ playlists
* 850K+ tracks
* 120M+ entries

In [5]:
# Preview query: ten playlist entries that have a positive track id and
# non-NULL title, artist name and album title.
query = '''
    SELECT  
      id AS playlist_id,
      tracks_data_id AS track_id,
      tracks_data_title AS track_title, 
      tracks_data_artist_name AS artist_name, 
      tracks_data_album_title AS album_title 
    FROM 
      `bigquery-samples.playlists.playlist`
    WHERE
      tracks_data_id > 0
    AND 
        tracks_data_title IS NOT NULL
    AND 
        tracks_data_artist_name IS NOT NULL
    AND
       tracks_data_album_title IS NOT NULL 
    LIMIT 10
'''
# Run the query under PROJECT_ID and load the result set into a DataFrame.
bq_client = bigquery.Client(project=PROJECT_ID)
query_job = bq_client.query(query)
results = query_job.result().to_dataframe()
# `display` is not defined in this file; presumably the notebook builtin.
display(results)



Unnamed: 0,playlist_id,track_id,track_title,artist_name,album_title
0,7712011,3636957,Maggot Brain (Think It Ain't Illegal Yet),,"Cosmic Funkers, Vol. 1"
1,3072830,3636958,Freak Of The Week,,"Cosmic Funkers, Vol. 1"
2,1855902,3636982,The Pot Head Pixies,,"Magick Invocations, Vol. 1"
3,4557029,3637094,My Baby Just Cares For Me,,Lady Blue Part 1
4,5345222,3637082,He's Got The Whole World In His Hands,,Lady Blue Part 1
5,6289202,3637082,He's Got The Whole World In His Hands,,Lady Blue Part 1
6,645572,3637082,He's Got The Whole World In His Hands,,Lady Blue Part 1
7,1914713,3637096,Don't Let Me Be Misunderstood,,Lady Blue Part 2
8,9570584,3637112,Ain't Got No; I Got Life,,Live
9,3597625,3637241,Leaving On A Jet Plane,,Turn Your Radio On


## 1. Extract data from BigQuery

### Implement the query generation method
We are going to extract all the entries of every playlist that contains at least one frequent track, i.e. a track that appears in at least N playlist entries.

In [None]:
def generate_query(min_occurrence=50, limit=None):
    """Build the standard-SQL query that extracts (playlist_id, track_id) rows.

    The query keeps every entry of each playlist that contains at least one
    frequent track, where a track is frequent when it has at least
    ``min_occurrence`` entries with a positive track id in the source table.

    Args:
        min_occurrence: Minimum number of entries for a track to count as
            frequent. Converted with ``int()`` before being interpolated.
        limit: Optional maximum number of rows. ``None`` (the default) adds
            no ``LIMIT`` clause; any other value, including ``0``, is
            converted with ``int()`` and emitted as ``LIMIT <n>``.

    Returns:
        The query text.

    Raises:
        TypeError, ValueError: If an argument cannot be converted to an int.
    """
    # Both values are spliced into the SQL text, so force them to integers
    # instead of trusting the string form of whatever was passed in.
    min_occurrence = int(min_occurrence)

    query = '''
    WITH data
    AS
    (
      SELECT
        id AS playlist_id,
        tracks_data_id AS track_id
      FROM
        `bigquery-samples.playlists.playlist`
      WHERE
        tracks_data_id > 0
    ),

    frequeny_tracks
    AS
    (
      SELECT
        track_id
      FROM
        data
      GROUP BY
        track_id
      HAVING
        count(playlist_id) >= {}
    ),

    playlists_with_frequent_tracks
    AS
    (
      SELECT DISTINCT
        p.playlist_id
      FROM
        data p
      JOIN
        frequeny_tracks t
      ON
        p.track_id = t.track_id
    )

    SELECT
      d.playlist_id,
      d.track_id
    FROM
      data d
    JOIN
      playlists_with_frequent_tracks p
    ON
      d.playlist_id = p.playlist_id

    '''.format(min_occurrence)

    # Compare against None so that an explicit limit of 0 is honoured rather
    # than being mistaken for "no limit".
    if limit is not None:
        query += "LIMIT {}".format(int(limit))

    return query

###  Data extraction pipeline

In [None]:
def run_extraction_pipeline(args):
    """Run a Beam pipeline that dumps a BigQuery query result to CSV files.

    Args:
        args: Dict of settings. Must contain 'source_query' (standard SQL
            text), 'sink_data_location' (output directory) and 'runner'.
            The whole dict is also passed on as keyword arguments to
            GoogleCloudOptions.

    Each output line has the form "<playlist_id>,<track_id>" and files are
    written with the prefix "<sink_data_location>/data" and suffix ".csv".
    """

    def _to_csv(bq_row):
        playlist_id = bq_row['playlist_id']
        track_id = bq_row['track_id']
        return "{},{}".format(playlist_id, track_id)

    query_text = args['source_query']
    output_dir = args['sink_data_location']
    runner_name = args['runner']

    options = beam.options.pipeline_options.GoogleCloudOptions(**args)

    # The pipeline is executed when the `with` block exits.
    with beam.Pipeline(runner_name, options=options) as pipeline:
        bq_source = beam.io.BigQuerySource(query=query_text, use_standard_sql=True)
        rows = pipeline | "Read from BigQuery" >> beam.io.Read(bq_source)
        csv_lines = rows | 'Convert to CSV' >> beam.Map(_to_csv)
        _ = csv_lines | "Write to file" >> beam.io.WriteToText(
            file_path_prefix=output_dir + "/data",
            file_name_suffix=".csv")

### Run data extraction pipeline locally

In [None]:
# Local execution with the DirectRunner.
runner = 'DirectRunner'
# The UTC timestamp suffix gives each run its own job name.
job_name = 'playlist-data-extraction-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))

# For the local run the extract is capped at one million rows.
source_query = generate_query(min_occurrence=50, limit=1000000)

# Settings read by run_extraction_pipeline; the whole dict is also handed to
# GoogleCloudOptions(**args) there.
args = {
    'job_name': job_name,
    'runner': runner,
    'source_query': source_query,
    'sink_data_location': DATA_DIR,
    'project': PROJECT_ID,
}
print("Pipeline args are set.")

In [None]:
# Run the extraction pipeline and report the wall-clock time it took.
time_start = datetime.utcnow() 
print("Running data extraction pipeline...")
run_extraction_pipeline(args)
print("Pipeline is done.")
time_end = datetime.utcnow() 
# timedelta between the two UTC timestamps taken around the run.
time_elapsed = time_end - time_start
print("Execution elapsed time: {} seconds".format(time_elapsed.total_seconds()))

In [6]:
!ls {DATA_DIR}/

data-00000-of-01000.csv data-00334-of-01000.csv data-00668-of-01000.csv
data-00001-of-01000.csv data-00335-of-01000.csv data-00669-of-01000.csv
data-00002-of-01000.csv data-00336-of-01000.csv data-00670-of-01000.csv
data-00003-of-01000.csv data-00337-of-01000.csv data-00671-of-01000.csv
data-00004-of-01000.csv data-00338-of-01000.csv data-00672-of-01000.csv
data-00005-of-01000.csv data-00339-of-01000.csv data-00673-of-01000.csv
data-00006-of-01000.csv data-00340-of-01000.csv data-00674-of-01000.csv
data-00007-of-01000.csv data-00341-of-01000.csv data-00675-of-01000.csv
data-00008-of-01000.csv data-00342-of-01000.csv data-00676-of-01000.csv
data-00009-of-01000.csv data-00343-of-01000.csv data-00677-of-01000.csv
data-00010-of-01000.csv data-00344-of-01000.csv data-00678-of-01000.csv
data-00011-of-01000.csv data-00345-of-01000.csv data-00679-of-01000.csv
data-00012-of-01000.csv data-00346-of-01000.csv data-00680-of-01000.csv
data-00013-of-01000.csv data-00347-of-01000.csv dat

### [OPTIONAL] Run data extraction pipeline using Cloud Dataflow

In [None]:
# Cloud Dataflow variant: the workspace and its sub-directories are redefined
# to point at a Cloud Storage location instead of the local filesystem.
PROJECT_ID = 'ksalama-cloudml'
REGION = 'europe-west1'
WORKSPACE = 'gs://ksalama-cloudml/workspace/playlist2ann'
DATA_DIR = '{}/data'.format(WORKSPACE)
COOC_DIR = '{}/cooc'.format(WORKSPACE)
# Staging and temp locations passed to the pipeline options below.
STAGING_DIR ='{}/stg'.format(WORKSPACE)
TEMP_DIR ='{}/tmp'.format(WORKSPACE)

runner = 'DataflowRunner'
# The UTC timestamp suffix gives each run its own job name.
job_name = 'playlist-data-extraction-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))

# No row limit here, unlike the local run.
source_query = generate_query(min_occurrence=50)

# Settings read by run_extraction_pipeline; the whole dict is also handed to
# GoogleCloudOptions(**args) there.
args = {
    'job_name': job_name,
    'runner': runner,
    'source_query': source_query,
    'sink_data_location': DATA_DIR,
    'project': PROJECT_ID,
    'region': REGION,
    'staging_location': STAGING_DIR,
    'temp_location': TEMP_DIR,
    'save_main_session': True,
}
print("Pipeline args are set.")

In [None]:
# Launch the extraction pipeline with the args defined for the Dataflow run.
print("Running data extraction pipeline...")
run_extraction_pipeline(args)
print("Pipeline is done.")

In [None]:
!gsutil ls gs://ksalama-cloudml/workspace/playlist2ann/data/

## 2. Compute co-occurrence statistics 

### Pipeline steps

In [26]:
def read_data(pipeline, source_data_location):
    """Read the text lines of the files matching `source_data_location`.

    Args:
        pipeline: The Beam pipeline to attach the read step to.
        source_data_location: File pattern passed to ReadFromText.

    Returns:
        The PCollection produced by the 'Read from CSV' step.
    """
    csv_reader = beam.io.ReadFromText(file_pattern=source_data_location)
    return pipeline | 'Read from CSV' >> csv_reader
    

def parse_data(raw_data):
    """Parse "context_id,item_id" text lines into (context_id, item_id) tuples.

    Args:
        raw_data: PCollection of text lines, each holding exactly two
            comma-separated values.

    Returns:
        PCollection of (context_id, item_id) tuples; both values are kept as
        the substrings produced by splitting the line.

    Raises:
        ValueError: (at pipeline run time) for a line that cannot be split
            into exactly two values.
    """

    def _parse_csv(line):
        try:
            context_id, item_id = line.split(',')
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # SystemExit/KeyboardInterrupt are not rewritten, and chain the
            # original error so the root cause stays visible.
            raise ValueError(
                "Invalid file format. A comma-separated data with two values is expected."
            ) from err
        return (context_id, item_id)

    parsed_data = (
        raw_data
        | 'Parse to tuple' >> beam.Map(_parse_csv)
    )
    return parsed_data
    

def compute_item_frequency(raw_data, min_freq):
    """Count how many records each item appears in and drop the rare ones.

    Args:
        raw_data: PCollection of (context_id, item_id) pairs.
        min_freq: Items whose count is below this value are filtered out.

    Returns:
        PCollection of (item_id, count) pairs with count >= min_freq.
    """

    def _swap_to_item_key(record):
        context, item = record
        return (item, context)

    def _is_frequent_enough(record):
        _, occurrences = record
        return occurrences >= min_freq

    keyed_by_item = raw_data | 'Make item as key' >> beam.Map(_swap_to_item_key)
    counted = (
        keyed_by_item
        | 'Count item frequency' >> beam.CombinePerKey(beam.combiners.CountCombineFn())
    )
    return counted | 'Filter infrequent items' >> beam.Filter(_is_frequent_enough)

def compute_cooccurrence(parsed_data):
    """Count, for every unordered item pair, the contexts they share.

    Args:
        parsed_data: PCollection of (context_id, item_id) pairs.

    Returns:
        PCollection of (item_1, item_2, cooc) tuples, where the pair is
        ordered with the `<` comparison and cooc is the summed pair count.
    """

    def _has_several_items(record):
        _, members = record
        return len(list(members)) > 1

    def _emit_pairs(record):
        _, members = record
        members = list(members)
        # Every position is paired with each later position.
        # NOTE(review): an item listed twice in one context yields a
        # self-pair and repeated pairs - confirm this is intended.
        return [
            ((left, right) if left < right else (right, left), 1)
            for position, left in enumerate(members)
            for right in members[position + 1:]
        ]

    def _flatten_record(record):
        (item_1, item_2), total = record
        return (item_1, item_2, total)

    grouped = parsed_data | 'Group by context id' >> beam.GroupByKey()
    multi_item_groups = (
        grouped | 'Filter groups with single item' >> beam.Filter(_has_several_items)
    )
    pair_hits = multi_item_groups | 'Create item pairs' >> beam.FlatMap(_emit_pairs)
    pair_totals = pair_hits | 'Compute cooccurrence' >> beam.CombinePerKey(sum)
    return pair_totals | 'Format cooc' >> beam.Map(_flatten_record)

def compute_total_cooc(cooccurrence):
    """Sum the co-occurrence counts of all pairs into one global total.

    Args:
        cooccurrence: PCollection of (item_1, item_2, cooc) tuples.

    Returns:
        PCollection holding the global sum of the cooc values.
    """

    def _cooc_only(record):
        _, _, cooc = record
        return cooc

    cooc_values = cooccurrence | "Get cooc values" >> beam.Map(_cooc_only)
    return cooc_values | "Sum cooc values" >> beam.CombineGlobally(sum)

def join_with_item_frequency(cooccurrence, item_frequency):
    """Attach the frequency of both items to every pair.

    The join is done in two passes: first keyed on item 1, then on item 2.
    In each pass a group is kept only when exactly one frequency entry
    exists for the key, so pairs with an item missing from `item_frequency`
    are dropped.

    Args:
        cooccurrence: PCollection of (item_1, item_2, cooc) tuples.
        item_frequency: PCollection of (item, frequency) pairs.

    Returns:
        PCollection of (item_1, item_2, cooc, freq_1, freq_2) tuples.
    """

    def _key_by_first(record):
        first, second, cooc = record
        return (first, (second, cooc))

    def _key_by_second(record):
        first, second, cooc, first_freq = record
        return (second, (first, cooc, first_freq))

    def _has_single_frequency(record):
        _, grouped = record
        return len(list(grouped['freq'])) == 1

    def _attach_first_frequency(record):
        first, grouped = record
        partners = grouped['cooc']
        freqs = grouped['freq']
        return [(first, second, cooc, freqs[0]) for second, cooc in partners]

    def _attach_second_frequency(record):
        second, grouped = record
        partners = grouped['cooc']
        freqs = grouped['freq']
        return [
            (first, second, cooc, first_freq, freqs[0])
            for first, cooc, first_freq in partners
        ]

    # Pass 1: join on item 1.
    keyed_on_first = cooccurrence | "Make item 1 as key" >> beam.Map(_key_by_first)
    joined_first = (
        {'cooc': keyed_on_first, 'freq': item_frequency}
        | "Join with item 1 frequency" >> beam.CoGroupByKey()
    )
    known_first = joined_first | "Filter frequent items 1" >> beam.Filter(_has_single_frequency)
    with_first_freq = known_first | "Reformat" >> beam.FlatMap(_attach_first_frequency)

    # Pass 2: join on item 2.
    keyed_on_second = with_first_freq | "Make item 2 as key" >> beam.Map(_key_by_second)
    joined_second = (
        {'cooc': keyed_on_second, 'freq': item_frequency}
        | "Join with item 2 frequency" >> beam.CoGroupByKey()
    )
    known_second = joined_second | "Filter frequent items 2" >> beam.Filter(_has_single_frequency)
    return known_second | "Reformat again" >> beam.FlatMap(_attach_second_frequency)


def vocabulary(item_frequency):
    """Project (item, frequency) pairs down to the item alone.

    Args:
        item_frequency: PCollection of (item, frequency) pairs.

    Returns:
        PCollection of items.
    """

    def _item_only(record):
        item, _ = record
        return item

    return item_frequency | "Extract item vocabulary" >> beam.Map(_item_only)

def create_partitions(vocab, num_shards):
    """Split a PCollection into `num_shards` partitions by a stable hash.

    Args:
        vocab: PCollection to split. The hash is taken over each element's
            `repr`, so elements may be plain items or tuples.
        num_shards: Number of partitions to produce.

    Returns:
        The result of beam.Partition: one PCollection per shard.
    """

    def _partition_fn(item, num_shards):
        # Imported locally, like `math` in compute_score's worker function.
        import zlib
        # The built-in hash() is salted per process for strings, which makes
        # the shard assignment differ between runs and workers. A CRC of the
        # element's repr gives the same shard every time.
        return zlib.crc32(repr(item).encode('utf-8')) % num_shards

    partitions = (
        vocab
        | "Parition items" >> beam.Partition(_partition_fn, num_shards)
    )

    return partitions

    
def create_top_pairs(index, item_frequency, top_count):
    """Pair up the `top_count` most frequent items, each pair with cooc 0.

    Args:
        index: Number used to make the step labels unique when this function
            is applied several times in one pipeline.
        item_frequency: PCollection of (item, frequency) pairs.
        top_count: How many of the most frequent items to take.

    Returns:
        PCollection of (item_1, item_2, 0) tuples, one per unordered pair of
        the selected items, with the pair ordered by the `<` comparison.
    """

    def _frequency(item_freq):
        _, freq = item_freq
        return freq

    def _generate_pairs(items):
        # Only the item of each (item, frequency) entry is needed here.
        names = [entry[0] for entry in items]
        return [
            (left, right, 0) if left < right else (right, left, 0)
            for position, left in enumerate(names)
            for right in names[position + 1:]
        ]

    # Rank by frequency through the `key=` keyword instead of a positional
    # less-than callable: the positional slot is the legacy `compare`
    # argument, which newer Beam releases no longer accept there.
    top_items_fn = beam.combiners.TopCombineFn(top_count, key=_frequency)

    pairs = (
        item_frequency
        | "Get top items - shard {}".format(index) >> beam.CombineGlobally(top_items_fn)
        | "Generate item pairs - shard {}".format(index) >> beam.FlatMap(_generate_pairs)
    )

    return pairs

def generate_and_union_pairs(partitions, top_count):
    """Build top-item pairs for each partition and union the results.

    Args:
        partitions: Iterable of PCollections of (item, frequency) pairs.
        top_count: Number of top items to pair up within each partition.

    Returns:
        A single PCollection with the pairs of all partitions.
    """
    # Shards are numbered from 1 in the step labels.
    per_shard_pairs = [
        create_top_pairs(shard_number, partition, top_count)
        for shard_number, partition in enumerate(partitions, start=1)
    ]
    return per_shard_pairs | "Union item pairs" >> beam.Flatten()
    
def merge_pairs(pairs):
    """Union several pair collections, keeping one record per item pair.

    Args:
        pairs: Collection of PCollections of (item_1, item_2, cooc) tuples.

    Returns:
        PCollection of (item_1, item_2, cooc) tuples where cooc is the
        largest value seen for that (item_1, item_2) key.
    """

    def _key_on_pair(record):
        first, second, cooc = record
        return ((first, second), cooc)

    def _keep_largest(record):
        (first, second), coocs = record
        return (first, second, max(coocs))

    flattened = pairs | "Merge item pairs" >> beam.Flatten()
    keyed = flattened | "Use pair as key" >> beam.Map(_key_on_pair)
    grouped = keyed | "Group pairs by key" >> beam.GroupByKey()
    return grouped | "Process pairs" >> beam.Map(_keep_largest)


def compute_score(data, total):
    """Compute the PMI score, weight and sample type of every pair.

    Args:
        data: PCollection of (item_1, item_2, cooc, freq_1, freq_2) tuples.
        total: Single-element PCollection with the global co-occurrence sum,
            supplied to the workers as a singleton side input.

    Returns:
        PCollection of (item_1, item_2, pmi, weight, type) tuples. Pairs
        with cooc > 0 are type 'P' with weight sqrt(cooc); the rest are type
        'N' with weight 1. pmi and weight are rounded to 5 decimals.
    """

    def _compute_pmi(record, total):
        import math
        item1, item2, cooc, freq1, freq2 = record

        co_occurred = cooc > 0
        # Pairs that never co-occurred are scored as if cooc were 1.
        log_cooc = math.log(cooc) if co_occurred else math.log(1)
        pmi = log_cooc - math.log(freq1) - math.log(freq2) + math.log(total)
        weight = math.sqrt(cooc) if co_occurred else 1
        sample_type = 'P' if co_occurred else 'N'
        return (item1, item2, round(pmi, 5), round(weight, 5), sample_type)

    total_side_input = beam.pvalue.AsSingleton(total)
    return (
        data
        | "Compute pairewise mutual infromation" >> beam.Map(_compute_pmi, total_side_input)
    )

def get_info(stats):
    """Summarise the dataset: record count per type plus min and max score.

    Args:
        stats: PCollection of (item_1, item_2, score, weight, type) tuples.

    Returns:
        PCollection of text lines: '<type>: <count>' for each record type,
        'min: <score>' and 'max: <score>'.
    """

    def _type_with_one(record):
        _, _, _, _, record_type = record
        return (record_type, 1)

    def _score_only(record):
        _, _, score, _, _ = record
        return score

    def _format_count(entry):
        return '{}: {}'.format(entry[0], entry[1])

    def _format_min(value):
        return 'min: {}'.format(value)

    def _format_max(value):
        return 'max: {}'.format(value)

    typed_ones = stats | "Group by record type" >> beam.Map(_type_with_one)
    type_totals = typed_ones | "Count records" >> beam.CombinePerKey(sum)
    counts = type_totals | "Fromat counts" >> beam.Map(_format_count)

    scores = stats | "Get scores" >> beam.Map(_score_only)

    lowest = scores | "Get min score" >> beam.CombineGlobally(min).without_defaults()
    mins = lowest | "Format min score" >> beam.Map(_format_min)

    highest = scores | "Get max score" >> beam.CombineGlobally(max).without_defaults()
    maxs = highest | "Format max score" >> beam.Map(_format_max)

    return (counts, mins, maxs) | "Combine info" >> beam.Flatten()
    

def write_debug(data, sink_data_location):
    """Dump `data` as text files prefixed '<sink_data_location>/debug'."""
    debug_sink = beam.io.WriteToText(
        file_path_prefix=sink_data_location + "/debug")
    _ = data | 'Write debug' >> debug_sink
    

def write_log(info, sink_data_location):
    """Write `info` to the single file '<sink_data_location>/info.log'."""
    # One shard with an empty shard-name template.
    log_sink = beam.io.WriteToText(
        file_path_prefix=sink_data_location + "/info",
        file_name_suffix=".log",
        shard_name_template='',
        num_shards=1)
    _ = info | 'Write logs' >> log_sink

def write_vocab(vocab, sink_data_location):
    """Write `vocab` to the single file '<sink_data_location>/vocab.txt'."""
    # One shard with an empty shard-name template.
    vocab_sink = beam.io.WriteToText(
        file_path_prefix=sink_data_location + "/vocab",
        file_name_suffix=".txt",
        shard_name_template='',
        num_shards=1)
    _ = vocab | 'Write vocabulary file' >> vocab_sink
    

def write_to_tfrecords(stats, sink_data_location):
    """Serialise the pair statistics to TFRecord files of tf.train.Example.

    Every input record yields two examples, one per direction of the pair
    (item1 -> item2 and item2 -> item1), with the same score, weight and
    type. Files use the prefix '<sink_data_location>/cooc' and the suffix
    '.tfrecords'.

    Args:
        stats: PCollection of (item_1, item_2, score, weight, type) tuples.
        sink_data_location: Output directory.
    """

    def _bytes_feature(text):
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(text)]))

    def _float_feature(number):
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=[float(number)]))

    def _directed_example(source, target, score, weight, record_type):
        features = {
            'item1': _bytes_feature(source),
            'item2': _bytes_feature(target),
            'score': _float_feature(score),
            'weight': _float_feature(weight),
            'type': _bytes_feature(record_type),
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def _to_tf_example(record):
        item1, item2, score, weight, record_type = record
        return [
            _directed_example(item1, item2, score, weight, record_type),
            _directed_example(item2, item1, score, weight, record_type),
        ]

    def _serialize(example):
        return example.SerializeToString(deterministic=True)

    examples = stats | 'Encode to tf.example' >> beam.FlatMap(_to_tf_example)
    serialized = examples | 'Serialize to string' >> beam.Map(_serialize)
    shuffled = serialized | 'Shuffle data' >> beam.Reshuffle()
    _ = shuffled | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
        file_path_prefix=sink_data_location + "/cooc",
        file_name_suffix='.tfrecords')

### Co-occurrence pipeline

In [32]:
def run_cooc_pipeline(args):
    """Build and run the Beam pipeline that computes co-occurrence statistics.

    Reads the CSV entries, computes item frequencies and pair co-occurrence
    counts, adds zero-count pairs between the most frequent items, scores
    every pair, and writes a vocabulary file, TFRecord files and an info log
    under the sink location.

    Args:
        args: Dict of settings. Must contain 'source_data_location',
            'sink_data_location', 'runner', 'min_freq', 'num_shards' and
            'top_count'. The whole dict is also passed on as keyword
            arguments to GoogleCloudOptions.
    """

    source_data_location = args['source_data_location']
    sink_data_location = args['sink_data_location']
    runner = args['runner']
    min_freq = args['min_freq']
    num_shards = args['num_shards']
    top_count = args['top_count']
    
    pipeline_options = beam.options.pipeline_options.GoogleCloudOptions(**args)
    
    # The pipeline is executed when the `with` block exits.
    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        # Read data from source files
        raw_data = read_data(pipeline, source_data_location)
        
        # Parse data to (context_id, item_id)
        parsed_data = parse_data(raw_data)
        
        # Compute frequency of each item (item_id, frequency)
        item_frequency = compute_item_frequency(parsed_data, min_freq)
        #write_debug(item_frequency, sink_data_location)
        
        # Extract distinct list of items (vocabulary)
        vocab = vocabulary(item_frequency)
        write_vocab(vocab, sink_data_location)
        
        # Generate pairs (item_1, item_2, cooc) for the top frequent items. cooc is set to 0.
        top_pairs = create_top_pairs(0, item_frequency, top_count)
        #write_debug(top_pairs, sink_data_location)
        
        # Split items to partitions. Note that the (item, frequency) pairs are
        # partitioned here, not the bare vocabulary.
        partitions = create_partitions(item_frequency, num_shards)
        #write_debug(partitions[0], sink_data_location)
        
        # For each partition, generate (item_1, item_2, cooc=0) for the top frequent items. Then union.
        top_pairs_per_partition = generate_and_union_pairs(partitions, top_count)
        #write_debug(pairs, sink_data_location)
        
        # For the co-occurring items, compute the cooccurrence (item_1, item_2, cooc)
        cooccurrence = compute_cooccurrence(parsed_data)
        #write_debug(cooccurrence, sink_data_location)
        
        # Merge all pairs: the co-occurring (positive) and not co-occurring (negative)
        all_pairs = merge_pairs((cooccurrence, top_pairs, top_pairs_per_partition))
        #write_debug(all_pairs, sink_data_location)
        
        # Compute |D| = \sum_{ij} x_{ij} 
        total = compute_total_cooc(cooccurrence)
        #write_debug(total, sink_data_location)
        
        # Join Cooc with item frequency => (item_1, item_2, cooc, freq_1, freq_2)
        join = join_with_item_frequency(all_pairs, item_frequency)
        #write_debug(join, sink_data_location)
        
        # Compute statistics (pmi), weights, and record type => (item_1, item_2, pmi, weight, type)
        stats = compute_score(join, total)
        #write_debug(stats, sink_data_location)
        write_to_tfrecords(stats, sink_data_location)
        
        # Log information about the created dataset
        info = get_info(stats)
        write_log(info, sink_data_location)


In [37]:
# Local execution with the DirectRunner.
runner = 'DirectRunner'
# The UTC timestamp suffix gives each run its own job name.
job_name = 'test-cooc-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))

# Settings read by run_cooc_pipeline; the whole dict is also handed to
# GoogleCloudOptions(**args) there.
args = {
    'job_name': job_name,
    'runner': runner,
    # Glob over the CSV shards written by the extraction pipeline.
    'source_data_location': '{}/data-*.csv'.format(DATA_DIR),
    'sink_data_location': COOC_DIR,
    # Items counted fewer than this many times are dropped.
    'min_freq': 5,
    # Number of most frequent items paired up as zero-count pairs.
    'top_count': 100,
    # Number of partitions the items are split into for per-shard pairs.
    'num_shards': 100,
    'project': PROJECT_ID,
}
print("Pipeline args are set.")

Pipeline args are set.


### Run Co-occurrence pipeline

In [None]:
# Run the co-occurrence pipeline and report the wall-clock time it took.
time_start = datetime.utcnow() 
print("Running cooc pipeline...")
run_cooc_pipeline(args)
print("Pipeline is done.")
time_end = datetime.utcnow() 
# timedelta between the two UTC timestamps taken around the run.
time_elapsed = time_end - time_start
print("Execution elapsed time: {} seconds".format(time_elapsed.total_seconds()))

Running cooc pipeline...


In [7]:
!ls {COOC_DIR}

cooc-00000-of-00001.tfrecords info2.log
info.log                      vocab.txt


In [15]:
!head {COOC_DIR}/info.log

P: 2249477
N: 486352
max: 13.33242
min: 2.2812


## 3. Read TFRecords using tf.data APIs

In [9]:
def make_input_fn(file_pattern, batch_size):
    """Create an input function that reads the co-occurrence TFRecords.

    Args:
        file_pattern: File pattern of the TFRecord files to read.
        batch_size: Number of records per batch.

    Returns:
        A zero-argument function that builds a shuffled, single-epoch
        dataset of feature dicts with the keys 'item1', 'item2', 'score',
        'weight' and 'type'. No label is split off.
    """

    # tf.io.FixedLenFeature is used instead of the TF1-only top-level alias
    # tf.FixedLenFeature, so the spec also works on TensorFlow 2.
    features = {
        'item1': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'item2': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'score': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'weight': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'type': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    }

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern,
            batch_size,
            features,
            reader=tf.data.TFRecordDataset,
            label_key=None,
            num_epochs=1,
            shuffle=True
        )
        return dataset

    return _input_fn

In [10]:
# Eager mode is needed to iterate the dataset directly. The top-level
# tf.enable_eager_execution() exists only in TF1, so use the compat alias and
# skip the call when eager execution is already on.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

DATA_FILES = "{}/cooc-*".format(COOC_DIR)

# Print the first five batches, five records each, one line per feature.
dataset = make_input_fn(DATA_FILES, batch_size=5)()
for i, features in enumerate(dataset.take(5)):
    print()
    print("Record {}:".format(i+1))
    for key in features:
        print("-{}:{}".format(key, features[key]))

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.

Record 1:
-item1:[b'259762' b'324089' b'263416' b'3130567' b'2961260']
-item2:[b'991115' b'4299822' b'3156107' b'699438' b'3114309']
-score:[ 9.36591 10.08983  9.66374  7.32508  8.36261]
-type:[b'N' b'P' b'P' b'P' b'N']
-weight:[1. 1. 1. 1. 1.]

Record 2:
-item1:[b'1155014' b'558543' b'3871848' b'1121660' b'1104682']
-item2:[b'3140490' b'594827' b'4190858' b'759854' b'1184321']
-score:[ 6.95162  9.86668 11.32197 11.65844  9.64354]
-type:[b'P' b'P' b'P' b'P' b'P']
-weight:[1. 1. 1. 1. 1.]

Record 3:
-item1:[b'1104624' b'3788311' b'2330675' b'1100002' b'660399']
-item2:[b'2829121' b'881477' b'2714414' b'559941' b'795275']
-score:[10.15437  8.32624  6.66444  5.53245  9.93122]
-type:[b'P' b'P' b'P' b'P' b'P']
-weight:[1. 1. 1. 1. 1.]

Record 4:
-item1:[b'