In [1]:
import glob
import os
import json 
import re
import pandas as pd


In [None]:
#convert any csv data to json_file

In [2]:
#this function is extract column name from schemas in correct order
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    column_details = schemas[ds_name]
    columns = sorted(column_details, key=lambda col: col[sorting_key])
    return [col['column_name'] for col in columns]

In [3]:
#This function reads a file and returns a DataFrame 
# with properly assigned column names.
def read_csv(file, schemas):
    file_path_list = re.split(r'[\\/]', file)
    ds_name = file_path_list[-2]
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    return df

In [4]:
#This function saves a DataFrame as a 
# JSON file in a structured directory
def to_json(df, base_dir,ds_name,file_name):
    json_file_path=f'{base_dir}/{ds_name}/{file_name}'
    os.makedirs(f'{base_dir}/{ds_name}',exist_ok=True)
    df.to_json(json_file_path,orient='records',lines=True)

In [5]:
#This function automates the conversion of multiple files from 
# CSV to JSON using modular functions
def file_convertor(src_base_dir, base_dir, ds_name, schemas):
    src_base_dir='retail_db'
    base_dir='retail_db_json'
    schemas= json.load(open('retail_db/schemas.json'))
    files=glob.glob(f'{src_base_dir}/*/part-*')
    for file in files:
        print(f'processing {file}')
        df= read_csv(file,schemas)
        file_name=re.split(r'[\\]',file)[-1]
        to_json(df,base_dir,ds_name,file_name)

In [6]:
#This function controls the workflow by selecting datasets 
# and triggering their conversion process
def process_files(ds_name=None):
    src_base_dir = 'retail_db'
    base_dir = 'retail_db_json'
    schemas = json.load(open('retail_db/schemas.json'))

    if ds_name is None:
        ds_names = schemas.keys()
    else:
        ds_names = [ds_name]

    for name in ds_names:
        print(f'processing {name}')
        file_convertor(src_base_dir, base_dir, ds_name, schemas)

In [7]:
schemas=json.load(open('retail_db/schemas.json'))
schemas.keys()

dict_keys(['departments', 'categories', 'orders', 'products', 'customers', 'order_items'])

In [8]:

process_files('orders')

processing orders
processing retail_db\categories\part-categories.txt
processing retail_db\customers\part-customers.txt
processing retail_db\departments\part-departments.txt
processing retail_db\orders\part-orders.txt
processing retail_db\order_items\part-order_items.txt
processing retail_db\products\part-products.txt
