* Overview of File Format Conversion
* Review Schema Details in JSON Format
* Develop function to return column names
* Read data from csv into dataframe using schema
* Print number of records from each data set
* Write data in JSON format to files
* Compute overall execution time
* Validate data in JSON files using Pandas
* Exercise and Solution

In [None]:
# Overview of File Format Conversion

In [None]:
# Review Schema Details in JSON Format
# data/retail_db/schemas.json

In [None]:
# Develop function to return column names
import json
def get_columns(ds):
    """Return the column names of data set ``ds`` ordered by column position.

    The schemas are loaded from ``data/retail_db/schemas.json`` on every call.
    Each schema entry is expected to carry ``column_name`` and
    ``column_position`` keys.

    Returns the list of column names, or ``None`` (after printing a
    "Schema not found" message) when ``ds`` has no non-empty schema or a
    ``KeyError`` is raised while reading the schema entries.
    """
    with open('data/retail_db/schemas.json') as schema_file:
        all_schemas = json.load(schema_file)
    try:
        fields = all_schemas.get(ds)
        if fields:
            by_position = sorted(
                fields,
                key=lambda field: field['column_position']
            )
            return [field['column_name'] for field in by_position]
        # Missing or empty schema: funnel into the shared "not found" path.
        raise KeyError
    except KeyError:
        print(f'Schema not found for {ds}')
        return



In [None]:
# Look up the column names of the orders data set, sorted by column_position.
get_columns('orders')

In [None]:
# Negative case: presumably schemas.json has no 'dummy' entry, in which case
# this prints "Schema not found for dummy" and evaluates to None.
get_columns('dummy')

In [None]:
# Read data from csv into dataframe using schema
import pandas as pd

In [None]:
# Read the departments part file, naming the columns from the JSON schema.
department_columns = get_columns('departments')
df = pd.read_csv('data/retail_db/departments/part-00000', names=department_columns)

In [None]:
# (number of rows, number of columns) of the data frame read above.
df.shape

In [None]:
# Print number of records from each data set
import glob
import os

In [None]:
# Each sub-folder of data/retail_db is treated as one data set whose part
# files share the schema registered under the folder name.
for path in glob.glob('data/retail_db/*'):
    if not os.path.isdir(path):
        continue
    ds = os.path.split(path)[1]
    # Resolve the schema once per data set: get_columns re-reads schemas.json
    # on every call, so calling it per part file repeats the same work.
    columns = get_columns(ds)
    if columns is None:
        # get_columns has already reported the missing schema. With names=None
        # pandas would use the first record as the header and the count would
        # be off by one, so skip the data set instead.
        continue
    for file in glob.glob(f'{path}/part*'):
        df = pd.read_csv(file, names=columns)
        print(f'Number of records for {os.path.split(file)[1]} in {ds} is {df.shape[0]}')

In [None]:
# Write data in JSON format to files
# Every part file of every data set under data/retail_db is rewritten as a
# line-delimited JSON file under data/retail_demo/<data set>/.

import uuid
for path in glob.glob('data/retail_db/*'):
    if not os.path.isdir(path):
        continue
    ds = os.path.split(path)[1]
    # Resolve the schema once per data set: get_columns re-reads schemas.json
    # on every call, so calling it per part file repeats the same work.
    columns = get_columns(ds)
    if columns is None:
        # get_columns has already reported the missing schema. With names=None
        # pandas would use the first record as the header and the JSON keys
        # would be data values, so skip the data set instead.
        continue
    target_dir = f'data/retail_demo/{ds}'
    for file in glob.glob(f'{path}/part*'):
        df = pd.read_csv(file, names=columns)
        # Created only when there is something to write, so data sets without
        # part files do not leave an empty target folder behind.
        os.makedirs(target_dir, exist_ok=True)
        # The part- prefix is required: the validation step only globs part*.
        df.to_json(
            f'{target_dir}/part-{uuid.uuid1()}.json',
            orient='records',
            lines=True
        )
        print(f'Number of records processed for {os.path.split(file)[1]} in {ds} is {df.shape[0]}')

In [None]:
# Compute overall execution time
# delete target base folder before running this
%time

import uuid
for path in glob.glob('data/retail_db/*'):
    if os.path.isdir(path):
        ds = os.path.split(path)[1]
        for file in glob.glob(f'{path}/part*'):
            df = pd.read_csv(file, names=get_columns(ds))
            os.makedirs(f'data/retail_demo/{ds}', exist_ok=True)
            df.to_json(
                f'data/retail_demo/{ds}/part-{str(uuid.uuid1())}.json',
                orient='records',
                lines=True
            )
            print(f'Number of records processed for {os.path.split(file)[1]} in {ds} is {df.shape[0]}')

In [None]:
# Validate data in JSON files using Pandas
# Re-read every generated JSON part file and report its record count so it
# can be compared with the counts printed for the CSV source files.
for path in glob.glob('data/retail_demo/*'):
    if os.path.isdir(path):
        # Take the data set name from the folder being validated. Without this
        # assignment the message would reuse whatever ds an earlier cell left
        # behind, or fail with NameError in a fresh kernel.
        ds = os.path.split(path)[1]
        for file in glob.glob(f'{path}/part*'):
            df = pd.read_json(file, lines=True)
            print(f'Number of records for {os.path.split(file)[1]} in {ds} is {df.shape[0]}')

* Exercise: Convert NYSE data from CSV in `data/nyse_all/nyse_data` to JSON format using gzip compression.
  * Source folder: `data/nyse_all/nyse_data`
  * Target folder: `data/nyse_all/nyse_json`
  * File Format: `gzip` compressed json format.
  * Column Names: `['ticker', 'trade_date', 'open_price', 'low_price', 'high_price', 'close_price', 'volume']`
  * Make sure file names are generated using the `part-<uuid>.json.gz` format (e.g. `part-some-unique-id.json.gz`).
  * Validate by comparing the `shape` of the data frames read from the source and target locations.
