<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/TFRecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

google.colab.drive.mount('/content/gdrive', force_remount=True)

# Imports and helper definitions

In [0]:
import glob
import json
import os

import numpy as np
import tensorflow as tf
import tqdm

In [0]:
def int64_feature(value):
  """Wrap a single integer in a `tf.train.Feature` holding a one-element Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def float_feature(value):
  """Wrap a single number in a `tf.train.Feature` holding a one-element FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def vector_feature(vector):
  """Convert a vector mapping into a float-list `tf.train.Feature`.

  Args:
    vector: mapping with a 'values' entry. When it also carries a non-None
      'size', it is treated as sparse: 'indices' gives the positions of
      'values' inside a zero-filled vector of length 'size'.
      (Presumably the JSON form of a Spark ML vector -- confirm upstream.)

  Returns:
    A `tf.train.Feature` whose float_list holds the dense values.
  """
  size = vector.get('size')
  if size is None:
    # Dense form: the values are already the full vector.
    dense = vector['values']
  else:
    # Sparse form: scatter the stored values into a zero vector.
    buffer = np.zeros(size)
    buffer[vector['indices']] = vector['values']
    dense = buffer.tolist()
  return tf.train.Feature(float_list=tf.train.FloatList(value=dense))

def serialize_example(example):
  """Encode one parsed record as a serialized `tf.train.Example`.

  Args:
    example: mapping that provides 'eventId', 'clickLabel', 'userActiveness'
      and the nine vector entries listed in `vector_keys` below.

  Returns:
    The serialized `tf.train.Example` as returned by `SerializeToString()`.
  """
  # Vector-valued entries, in the order they are added to the feature map.
  vector_keys = (
      'categoryVector',
      'newsClickCountVector',
      'contextVector',
      'userHistoryVector',
      'userProfileVector',
      'userClickCountVector',
      'userHistoryVectorNext',
      'userProfileVectorNext',
      'userClickCountVectorNext',
  )

  # Scalar entries first, then every vector entry under its own key.
  feature = {
      'eventId': int64_feature(example['eventId']),
      'clickLabel': int64_feature(example['clickLabel']),
      'userActiveness': float_feature(example['userActiveness']),
  }
  for key in vector_keys:
    feature[key] = vector_feature(example[key])

  features = tf.train.Features(feature=feature)
  return tf.train.Example(features=features).SerializeToString()

# Convert `JSON` files to `TFRecords`

In [0]:
# Dataset root under the Google Drive mount point; the JSON inputs
# ('all_features/...') and TFRecord outputs ('tfrecords/...') live below it.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Sorted list of every 'part*.json' file in the training feature folder.
train_pattern = os.path.join(DATA_PATH, 'all_features', 'train', 'part*.json')
train_filepaths = sorted(glob.glob(train_pattern))

In [0]:
# Convert every training JSON-lines file into one GZIP-compressed TFRecord
# file of the same base name, counting the written examples in `train_size`.
train_size = 0

# Make sure the output folder exists so a fresh run does not depend on it
# having been created by hand.
train_output_dir = os.path.join(DATA_PATH, 'tfrecords', 'train')
os.makedirs(train_output_dir, exist_ok=True)

for filepath in tqdm.tqdm(train_filepaths):
  # Output name = input file name up to its first dot.
  filename = os.path.basename(filepath).split('.')[0]
  with tf.io.TFRecordWriter(os.path.join(train_output_dir, filename), 'GZIP') as writer, open(filepath, encoding='utf-8') as file:
    for row in file:
      # Skip blank lines instead of letting json.loads raise on them.
      if not row.strip():
        continue
      writer.write(serialize_example(json.loads(row)))
      train_size += 1

100%|██████████| 200/200 [53:07<00:00, 16.37s/it]


In [0]:
train_size

12915691

In [0]:
# Sorted list of every 'part*.json' file in the test feature folder.
test_pattern = os.path.join(DATA_PATH, 'all_features', 'test', 'part*.json')
test_filepaths = sorted(glob.glob(test_pattern))

In [0]:
# Convert every test JSON-lines file into one GZIP-compressed TFRecord
# file of the same base name, counting the written examples in `test_size`.
test_size = 0

# Make sure the output folder exists so a fresh run does not depend on it
# having been created by hand.
test_output_dir = os.path.join(DATA_PATH, 'tfrecords', 'test')
os.makedirs(test_output_dir, exist_ok=True)

for filepath in tqdm.tqdm(test_filepaths):
  # Output name = input file name up to its first dot.
  filename = os.path.basename(filepath).split('.')[0]
  with tf.io.TFRecordWriter(os.path.join(test_output_dir, filename), 'GZIP') as writer, open(filepath, encoding='utf-8') as file:
    for row in file:
      # Skip blank lines instead of letting json.loads raise on them.
      if not row.strip():
        continue
      writer.write(serialize_example(json.loads(row)))
      test_size += 1

100%|██████████| 200/200 [07:17<00:00,  2.20s/it]


In [0]:
test_size

1796331