# From raw catalog in JSON to catalog in JSONL suitable for BigQuery

## Preliminary steps

### Install the libraries used in this tutorial

Library to authenticate with Google Cloud products


In [150]:
pip install --upgrade google-auth

Note: you may need to restart the kernel to use updated packages.


### Import the necessary libraries

Google Cloud client library for Cloud Storage (GCS)

In [151]:
import json

from google.cloud import storage

### Define global project variables and credentials

In [162]:
# Name of the GCS bucket used for both reading and writing. Change it to the name of your own bucket.
bucket_name = "pod-fr-retail-retailsearch-asset"
# Name of the JSON file (blob) to read from GCS
blob_name = "products.json"
# Name of the JSONL file to write locally and upload back to GCS
destination_blob_name = "products_bq.json"

### Define functions

Function to read the content of a blob in GCS and parse it as JSON

In [153]:
def read_blob(bucket_name, blob_name):
    """Read a blob from GCS and return its content parsed as JSON.

    Args:
        bucket_name: Name of the GCS bucket that contains the blob,
            e.g. "your-bucket-name".
        blob_name: Name of the GCS object to read,
            e.g. "storage-object-name".

    Returns:
        The Python object produced by ``json.load`` on the blob's text.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # Pin the text encoding: JSON interchange is UTF-8, and leaving it unset
    # makes decoding depend on the platform/locale default.
    with blob.open("r", encoding="utf-8") as f:
        return json.load(f)

Function to write content into a blob and upload it to GCS

In [154]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a GCS bucket and print a confirmation.

    Args:
        bucket_name: Name of the target GCS bucket,
            e.g. "your-bucket-name".
        source_file_name: Path of the local file to upload,
            e.g. "local/path/to/file".
        destination_blob_name: Name of the GCS object to create,
            e.g. "storage-object-name".
    """
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

## Process

Read blob from GCS

In [155]:
# Fetch the source file from GCS; the parsed JSON is kept in `products_blob`.
products_blob = read_blob(bucket_name=bucket_name, blob_name=blob_name)


Transform categories into Retail API format (see https://cloud.google.com/retail/docs/reference/rest/v2/projects.locations.catalogs.branches.products)

In [156]:
# Collapse each product's list of category objects into one ' > '-separated
# string built from the categories' "name" values.
for product in products_blob:
    categories = product["category"]
    # The update happens in place, so a second run of this cell would see the
    # already-joined string; skip those products to keep the cell re-runnable.
    if isinstance(categories, str):
        continue
    product["category"] = ' > '.join(category["name"] for category in categories)

Transform JSON into JSONL format (see https://stackoverflow.com/questions/51300674/converting-json-into-newline-delimited-json-in-python/51301075#51301075)

In [157]:
# Serialize every product to a single-line JSON document and separate the
# documents with newlines (JSONL).
contents = '\n'.join(map(json.dumps, products_blob))

Write contents into a file

In [158]:
# Write through a context manager so the file is flushed and closed before the
# upload step reads it back from disk; an unclosed handle can leave buffered
# data unwritten.
with open(destination_blob_name, 'w') as output:
    output.write(contents)

35089846

Upload file to GCS

In [160]:
# The local JSONL file and the destination object share the same name.
upload_blob(
    bucket_name=bucket_name,
    source_file_name=destination_blob_name,
    destination_blob_name=destination_blob_name,
)

File products_bq.json uploaded to products_bq.json.
