# This notebook loads JSON Lines (`.jsonl`) data and converts it into the `TFRecord` format

##### The schema of the `.jsonl` file is described in `./training-data-schema.json`

#### What this notebook does

1. Reads the parameters for the existing data
2. Creates a small development dataset
3. Expands to the full data and streams it into formatted TFRecord files in a new bucket, `TF_RECORDS_DIR`

In [1]:
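# !gsutil -m rm gs://tfrs-tf-records/* # Uncomment to clear the target directory, e.g. when adding additional fields
# NOTE(review): this bucket differs from TF_RECORDS_DIR ('gs://tfrs-central-a') - confirm the path before running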

In [22]:
# Set parameters.
# Every path is derived from its bucket so that changing a bucket cannot leave
# a stale, hard-coded path behind.

# Bucket that holds the schema file.
BUCKET = 'gs://mcskinner-sample-data/2tower/last-view'
SCHEMA_JSON = BUCKET + '/training-data-schema.json'

# Working bucket: full training file and the small development subset.
MY_BUCKET = 'gs://tfrs-sample-data'
TRAIN_JSON = MY_BUCKET + '/training-data.jsonl'
SMALL_DATASET = MY_BUCKET + '/training-data_dev.jsonl'

# Destination bucket for the generated TFRecord files.
TF_RECORDS_DIR = 'gs://tfrs-central-a'

In [23]:
### Create a smaller dev dataset first
import json
import os
import subprocess
from tensorflow.python.lib.io import file_io


ROW_LIMIT = 10000  # number of leading records copied into the dev dataset
jsonl_path = os.path.join(MY_BUCKET, 'training-data.jsonl')
SMALL_DATASET = 'gs://tfrs-sample-data/training-data_dev.jsonl'

# Stream the full file from GCS, keep its first ROW_LIMIT lines and upload
# them as the dev dataset.
# check=True raises CalledProcessError when the pipeline fails; a shell
# pipeline reports the exit status of its last command, i.e. the upload.
# stderr is merged into stdout, as subprocess.getoutput did.
_dev_copy = subprocess.run(
    f'gsutil cp {jsonl_path} - | head -n {ROW_LIMIT} | gsutil cp - {SMALL_DATASET}',
    shell=True,
    check=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
)

# NOTE: despite its name, this holds the console output of the gsutil
# pipeline (not column names). The name is kept for backward compatibility.
_dev_copy_output = _dev_copy.stdout
input_file_columns = _dev_copy_output[:-1] if _dev_copy_output.endswith('\n') else _dev_copy_output

### Read in the schema for later parsing

#### FYI: Pre-calculated counts

In [24]:
# Pre-calculated record count. To recount instead, use:
#   sum(1 for _ in file_io.FileIO(SMALL_DATASET, 'rb'))
# CHANGE THIS TO THE LARGE DATASET WHEN READY
num_records = 4_293_302
records_message = "Total number of records: {}".format(num_records)
print(records_message)

Total number of records: 4293302


### Establish the parameters

`num_samples` is the number of data samples in each TFRecord file

`num_tfrecords` is the total number of TFRecord files that we will create.

Generally, aim for TFRecord files of around 100 MB each

In [25]:
# To be tuned later

num_samples = 12228  # records per TFRecord file

# Ceiling division: one extra file holds the remainder when num_records is
# not an exact multiple of num_samples.
num_tfrecords = -(-num_records // num_samples)

expected_message = "Number of Expected TFRecords: {}".format(num_tfrecords)
print(expected_message)

Number of Expected TFRecords: 352


### These helper functions declare different feature types
They are used to parse the `.jsonl` file

Note [this](https://keras.io/examples/keras_recipes/creating_tfrecords/#define-dataset-helper-functions) is a good resource

#### Notes on data transforms:
* Grabbing all fields available for query
* Transforming and flattening of array / ragged data inputs (`['last_viewed', 'ss_prodTypeCombo_ss',` etc..`]`)
* Using a special delimiter (`|` in this case) to later unpack values with a string split for the text vectorizer layer

In [26]:
import tensorflow as tf
import numpy as np


def string_array(value):
    """Returns a bytes_list Feature from a string / bytes value or an iterable of them.

    Args:
        value: a single str or bytes, or an iterable of str / bytes.
            A single str or bytes is stored as one entry; previously a bare
            str was iterated character by character and bytes raised.

    Returns:
        tf.train.Feature whose bytes_list holds one UTF-8 encoded entry per
        input string.
    """
    if isinstance(value, (str, bytes)):
        # A scalar is one value, not a sequence of characters.
        value = [value]
    encoded = [v if isinstance(v, bytes) else v.encode('utf-8') for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))


def float_feature(value):
    """Returns a float_list Feature from a float / double or an iterable of them.

    Args:
        value: a single int / float, or an iterable of values accepted by
            float(). A bare scalar previously raised TypeError because it
            is not iterable.

    Returns:
        tf.train.Feature whose float_list holds float(v) for each input value.
    """
    if isinstance(value, (int, float)):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=[float(v) for v in value]))


def int64_feature(value):
    """Returns an int64_list Feature from a bool / enum / int / uint or an iterable of them.

    Args:
        value: a single int (bool included), or an iterable of values
            accepted by int(). A bare scalar previously raised TypeError
            because it is not iterable.

    Returns:
        tf.train.Feature whose int64_list holds int(v) for each input value.
    """
    if isinstance(value, int):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(v) for v in value]))


def float_feature_list(value):
    """Wraps a sequence of floats in a float_list Feature.

    Unlike float_feature, the values are passed through as given, with no
    per-element float() conversion.
    """
    floats = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=floats)

    
def parse_line(ln):
    """Builds a tf.train.Example from one parsed record of the .jsonl file.

    Args:
        ln: dict with a 'query' and a 'candidate' sub-dict (one decoded
            JSON line).

    Returns:
        tf.train.Example holding the query features ('query', 'last_viewed',
        'month', 'hour') and the candidate features ('IVM_s', 'description',
        'price_td', 'PriceRange_s', 'productTypeCombo_ss', 'visual').

    Raises:
        KeyError: if a field read below is missing from the record.
    """
    def _single_bytes(text):
        # One UTF-8 encoded string stored as a single-entry bytes_list.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(text).encode('utf-8')]))

    q_ln = ln['query']
    c_ln = ln['candidate']

    # Month is the text between the first and second '-', hour is the text
    # after the first space up to the first ':'.
    # NOTE(review): assumes a 'YYYY-MM-DD HH:MM:SS'-like timestamp - confirm
    # against the schema. str() is kept so a wrapped value still splits the
    # same way.
    search_date_time = str(q_ln['search_date_time'])
    month = search_date_time.split("-")[1]
    hour = search_date_time.split(" ")[1].split(":")[0]

    # Flatten the ragged list fields into single delimited strings.
    # `or ()` lets a null field behave like an empty list.
    # " END" guarantees string len > 0 on results with no last-viewed items.
    lv = "".join(" " + item for item in (q_ln['last_viewed'] or ())) + " END"
    pt = "".join("|" + item for item in (c_ln['productTypeCombo_ss'] or ()))

    feature = {
        #query features
        "query": string_array(q_ln['query']), #this is actually a list - consider tokenizing
#         "search_date_time": timestamp,
        "last_viewed": _single_bytes(lv),

        #candidate features
        "IVM_s": string_array(c_ln['IVM_s']), #target??
        "description": string_array(c_ln['description']),
#         "total_ratings_i": float_feature(c_ln['total_ratings_i']),
#         "overall_ratings": float_feature(c_ln['overall_ratings']),
#         "avg_rating_td": float_feature(c_ln['avg_rating_td']),
#         "parent_description": string_array(c_ln['parent_description']),
#         "Brand_s": string_array(c_ln['Brand_s']),
#         "item_type": string_array(c_ln['item_type']), #just fixed this to be a string
#         "prc_rdc_amt": float_feature(c_ln['prc_rdc_amt']),
#         "quantity_sold": float_feature(c_ln['quantity_sold']),
#         "sales_dollar_f": float_feature(c_ln['sales_dollar_f']),
#         "freight_term": string_array(c_ln['freight_term']),
#         "is_energy_star_s": string_array(c_ln['is_energy_star_s']),
        "price_td": float_feature(c_ln['price_td']),
        "PriceRange_s": string_array(c_ln['PriceRange_s']),
#         "prc_rdc_pct": float_feature(c_ln['prc_rdc_pct']),
#         "spellcheck": string_array(c_ln['spellcheck']),
        "productTypeCombo_ss": _single_bytes(pt), #this is actually a list - consider tokenizing
#         "Searchable_t": string_array(c_ln['Searchable_t']), #this is actually a list - consider tokenizing
#         "clean_Brand_s": string_array(c_ln['clean_Brand_s']),
        "visual": float_feature(c_ln['visual']),
        "month": _single_bytes(month),
        "hour": _single_bytes(hour)
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

### Create a target TF Records dataset for later use

In [28]:
import time
import progressbar

# Stream the training file into TFRecord files of at most `num_samples`
# records each.
record_counter = 0  # records read so far
lns = []  # parsed records buffered for the file currently being filled
tfrec_counter = 0  # files written so far; also used in the file name


def write_a_tfrec(lns):
    """Writes one batch of parsed records to a single TFRecord file.

    The file is created under TF_RECORDS_DIR and named
    "file_<file index>-<record count>.tfrec". The file index is read from
    the module-level `tfrec_counter` at call time.

    Args:
        lns: list of parsed JSON records, each one accepted by `parse_line`.
    """
    tfrec_path = TF_RECORDS_DIR + "/file_%.2i-%i.tfrec" % (tfrec_counter, len(lns))
    with tf.io.TFRecordWriter(tfrec_path) as writer:
        for ln in lns:
            example = parse_line(ln)
            writer.write(example.SerializeToString())


with progressbar.ProgressBar(max_value=num_tfrecords) as bar:

    with file_io.FileIO(TRAIN_JSON, 'r') as reader:
        for line in reader:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            record_counter += 1
            # Buffer the record BEFORE deciding to flush, so the record that
            # completes a batch is written with that batch.
            lns.append(json.loads(line))
            if len(lns) == num_samples:
                write_a_tfrec(lns)  # write out a full batch
                lns = []  # reset to a new batch
                tfrec_counter += 1
                # num_tfrecords comes from the pre-calculated num_records;
                # clamp so a larger input file cannot push the bar past its
                # maximum.
                bar.update(min(tfrec_counter, num_tfrecords))

    # Flush the final, partially filled batch. This does not rely on
    # num_records matching the real line count.
    if lns:
        write_a_tfrec(lns)
        lns = []
        tfrec_counter += 1
        bar.update(min(tfrec_counter, num_tfrecords))

if record_counter != num_records:
    print("Warning: read {} records but num_records is {}".format(record_counter, num_records))

100% (352 of 352) |######################| Elapsed Time: 1:37:58 Time:  1:37:58


In [None]:
#verify records
!gsutil ls $TF_RECORDS_DIR