In [82]:
import json
import os

import pandas as pd

In [83]:
from google.cloud import storage

In [91]:
# Delete the target bucket together with everything in it; the next cell recreates it empty.
!gsutil rm -r gs://airetailparquet

Removing gs://airetailparquet/...


In [92]:
# Create the target bucket that the Parquet output is written to later in this notebook.
!gsutil mb gs://airetailparquet

Creating gs://airetailparquet/...


In [97]:
# Remove whatever currently sits at retail_db in the source bucket before re-uploading.
!gsutil rm -r gs://airetail/retail_db

Removing gs://airetail/retail_db#1664787696526014...
/ [1 objects]                                                                   
Operation completed over 1 objects.                                              


In [98]:
# Upload the schemas file (per-dataset column names and positions) to the source bucket.
!gsutil cp -r ../../data/retail_db/schemas.json gs://airetail/retail_db/schemas.json

Copying file://../../data/retail_db/schemas.json [Content-Type=application/json]...
/ [1 files][  4.2 KiB/  4.2 KiB]                                                
Operation completed over 1 objects/4.2 KiB.                                      


In [99]:
# Confirm the upload by listing the retail_db/ prefix of the source bucket.
!gsutil ls gs://airetail/retail_db/

gs://airetail/retail_db/schemas.json


In [100]:
# Cloud Storage client used for the bucket and blob lookups in the following cells.
gsclient = storage.Client()

In [101]:
# Handle to the source bucket that holds the schemas file and the raw data files.
bucket = gsclient.get_bucket('airetail')

In [102]:
# Blob object for the schemas file uploaded above.
schemas_blob = bucket.get_blob('retail_db/schemas.json')

In [103]:
# Download the blob content and parse it as JSON; the result is indexed by dataset name below.
schemas = json.loads(schemas_blob.download_as_string())

In [104]:
# Inspect the column descriptors recorded for the orders dataset.
# NOTE(review): in the output below, order_date is typed 'string' and order_customer_id
# 'timestamp', which looks swapped in schemas.json. Only column_name and column_position
# are used in this notebook, so it has no effect here — confirm against the source data.
schemas['orders']

[{'column_name': 'order_id', 'data_type': 'integer', 'column_position': 1},
 {'column_name': 'order_date', 'data_type': 'string', 'column_position': 2},
 {'column_name': 'order_customer_id',
  'data_type': 'timestamp',
  'column_position': 3},
 {'column_name': 'order_status', 'data_type': 'string', 'column_position': 4}]

In [105]:
# Copy the orders column descriptors and order them by their declared position.
column_details = list(schemas['orders'])
column_details.sort(key=lambda descriptor: descriptor['column_position'])
column_details

[{'column_name': 'order_id', 'data_type': 'integer', 'column_position': 1},
 {'column_name': 'order_date', 'data_type': 'string', 'column_position': 2},
 {'column_name': 'order_customer_id',
  'data_type': 'timestamp',
  'column_position': 3},
 {'column_name': 'order_status', 'data_type': 'string', 'column_position': 4}]

In [106]:
# Keep only the column names, preserving the positional order established above.
columns = [descriptor['column_name'] for descriptor in column_details]
columns

['order_id', 'order_date', 'order_customer_id', 'order_status']

In [107]:
def get_columns(schemas_blob, ds_name):
    """Return the column names of dataset ``ds_name`` in positional order.

    The blob content is downloaded and parsed as JSON, the entry for
    ``ds_name`` is looked up, and its column descriptors are ordered by
    their ``column_position`` before the ``column_name`` values are extracted.

    Args:
        schemas_blob: Object exposing ``download_as_string()`` that yields the
            schemas JSON document.
        ds_name: Key of the dataset inside the schemas document.

    Returns:
        list: Column names ordered by ``column_position``.

    Raises:
        KeyError: If ``ds_name`` is not present in the schemas document.
    """
    all_schemas = json.loads(schemas_blob.download_as_string())
    descriptors = list(all_schemas[ds_name])
    descriptors.sort(key=lambda descriptor: descriptor['column_position'])
    return [descriptor['column_name'] for descriptor in descriptors]

In [110]:
# Dataset whose columns are looked up through the helper in the next cell.
ds_name = 'orders'

In [111]:
# Same lookup as the step-by-step cells above, now done through the helper.
get_columns(schemas_blob, ds_name)

['order_id', 'order_date', 'order_customer_id', 'order_status']

In [112]:
# Show the column list that read_csv receives via names= further down.
print(columns)

['order_id', 'order_date', 'order_customer_id', 'order_status']


In [None]:
# Name of the bucket that receives the Parquet output; read back through os.environ below.
%env TGT_BUCKET_NAME=airetailparquet

env: TGT_BUCKET_NAME=airetailparquet


In [115]:
# Upload the orders data folder to the source bucket; the event defined below points at its part file.
!gsutil cp -r ../../data/retail_db/orders gs://airetail/retail_db/orders

Copying file://../../data/retail_db/orders/part-00000 [Content-Type=application/octet-stream]...
| [1 files][  2.9 MiB/  2.9 MiB]                                                
Operation completed over 1 objects/2.9 MiB.                                      


In [113]:
# Hand-built event payload naming the uploaded orders part file and its bucket.
event = dict(bucket='airetail', name='retail_db/orders/part-00000')

In [114]:
# Target bucket name as configured in the environment (None when the variable is unset).
tgt_bucket_name = os.getenv('TGT_BUCKET_NAME')

In [116]:
# Pull the source bucket and the object path out of the event payload.
src_bucket_name, blob_name = event['bucket'], event['name']

In [117]:
# Read the CSV object straight from Cloud Storage, labelling its columns with the schema names.
# assumes the part file has no header row (names= is passed without header=) — TODO confirm
df = pd.read_csv(f'gs://{src_bucket_name}/{blob_name}', names=columns)

In [118]:
# Write the frame to the target bucket under the same object path plus a .snappy.parquet suffix.
df.to_parquet(f'gs://{tgt_bucket_name}/{blob_name}.snappy.parquet')

In [119]:
# Verify that the Parquet file landed under retail_db in the target bucket.
!gsutil ls -r gs://airetailparquet/retail_db

gs://airetailparquet/retail_db/:

gs://airetailparquet/retail_db/orders/:
gs://airetailparquet/retail_db/orders/part-00000.snappy.parquet


In [124]:
import json
import os
import pandas as pd

from google.cloud import storage

def get_columns(schemas_blob, ds_name):
    """Return the column names of dataset ``ds_name`` in positional order.

    The blob content is downloaded and parsed as JSON, the entry for
    ``ds_name`` is looked up, and its column descriptors are ordered by
    their ``column_position`` before the ``column_name`` values are extracted.

    Args:
        schemas_blob: Object exposing ``download_as_string()`` that yields the
            schemas JSON document.
        ds_name: Key of the dataset inside the schemas document.

    Returns:
        list: Column names ordered by ``column_position``.

    Raises:
        KeyError: If ``ds_name`` is not present in the schemas document.
    """
    all_schemas = json.loads(schemas_blob.download_as_string())
    descriptors = list(all_schemas[ds_name])
    descriptors.sort(key=lambda descriptor: descriptor['column_position'])
    return [descriptor['column_name'] for descriptor in descriptors]


def main(event, context):
    """Convert a CSV object from the source bucket into a Parquet file.

    Triggered by a change to a Cloud Storage bucket. The dataset name is taken
    from the object's parent folder, its column names are looked up in the
    schemas file, and the data is written to the target bucket under the same
    object path with a ``.snappy.parquet`` suffix.

    Objects that cannot be dataset files are skipped with a log line instead
    of failing: the schemas file itself (it lives in the bucket that triggers
    this function) and objects that have no parent folder.

    Args:
        event (dict): Event payload; ``bucket`` and ``name`` are read from it.
        context (google.cloud.functions.Context): Metadata for the event (unused).

    Raises:
        RuntimeError: If TGT_BUCKET_NAME or SCHEMAS_FILE_PATH is not set.
        FileNotFoundError: If the schemas file is missing from the source bucket.
        KeyError: If the schemas file has no entry for the object's dataset.
    """
    tgt_bucket_name = os.environ.get('TGT_BUCKET_NAME')
    schemas_file_path = os.environ.get('SCHEMAS_FILE_PATH')
    # Fail early with a clear message rather than building 'gs://None/...' paths.
    if not tgt_bucket_name or not schemas_file_path:
        raise RuntimeError(
            'TGT_BUCKET_NAME and SCHEMAS_FILE_PATH environment variables must be set'
        )

    src_bucket_name = event['bucket']
    blob_name = event['name']

    # The schemas file is read from the source bucket, so uploading it fires
    # this function as well; it is metadata, not a dataset file.
    if blob_name == schemas_file_path:
        print(f'Skipping schemas file {blob_name} in bucket {src_bucket_name}')
        return

    # The dataset name is the object's parent folder; without one there is
    # nothing to look up in the schemas file.
    path_parts = blob_name.split('/')
    if len(path_parts) < 2:
        print(f'Skipping file {blob_name} in bucket {src_bucket_name}: no dataset folder')
        return
    ds_name = path_parts[-2]

    print(f'Processing file {blob_name} in bucket {src_bucket_name}')
    gsclient = storage.Client()
    src_bucket = gsclient.get_bucket(src_bucket_name)
    schemas_blob = src_bucket.get_blob(schemas_file_path)
    # get_blob yields None for a missing object; surface that explicitly
    # instead of an AttributeError inside get_columns.
    if schemas_blob is None:
        raise FileNotFoundError(
            f'Schemas file gs://{src_bucket_name}/{schemas_file_path} not found'
        )
    columns = get_columns(schemas_blob, ds_name)
    df = pd.read_csv(f'gs://{src_bucket_name}/{blob_name}', names=columns)
    df.to_parquet(f'gs://{tgt_bucket_name}/{blob_name}.snappy.parquet')

In [125]:
# Target bucket name that main() reads through os.environ.
%env TGT_BUCKET_NAME=airetailparquet

env: TGT_BUCKET_NAME=airetailparquet


In [126]:
# Object path of the schemas file inside the source bucket, read by main() through os.environ.
%env SCHEMAS_FILE_PATH=retail_db/schemas.json

env: SCHEMAS_FILE_PATH=retail_db/schemas.json


In [127]:
# Run the function locally with the hand-built event; main() never uses context, so None is passed.
main(event, None)

Processing file retail_db/orders/part-00000 in bucket airetail


In [81]:
# List everything in the target bucket.
# NOTE(review): this cell's execution count (81) predates the bucket reset at In [91], and the
# listing below shows datasets this notebook never converts (categories, customers, ...), so the
# output appears to come from an earlier run — re-run after Restart & Run All to confirm.
!gsutil ls -r gs://airetailparquet

gs://airetailparquet/retail_db/:

gs://airetailparquet/retail_db/categories/:
gs://airetailparquet/retail_db/categories/part-00000.snappy.parquet

gs://airetailparquet/retail_db/customers/:
gs://airetailparquet/retail_db/customers/part-00000.snappy.parquet

gs://airetailparquet/retail_db/departments/:
gs://airetailparquet/retail_db/departments/part-00000.snappy.parquet

gs://airetailparquet/retail_db/order_items/:
gs://airetailparquet/retail_db/order_items/part-00000.snappy.parquet

gs://airetailparquet/retail_db/orders/:
gs://airetailparquet/retail_db/orders/part-00000.snappy.parquet

gs://airetailparquet/retail_db/products/:
gs://airetailparquet/retail_db/products/part-00000.snappy.parquet
