# Project 1 - File Format Converter using Python

* Get File Names to be processed using glob
* Get Column Names using Schemas File
* Get Data Set Names from File Names or Paths using regular expressions
* Read CSV Data into Pandas Dataframe with Schema Dynamically
* Generate File Paths for Target JSON Files Dynamically
* Recap of Writing Pandas Dataframe to JSON File

* Get File Names to be processed using glob

In [None]:
import glob

In [None]:
help(glob)

In [None]:
help(glob.glob)

In [None]:
glob.glob('data/retail_db/**', recursive=True)

In [None]:
glob.glob('data/retail_db/*/*')

In [None]:
src_file_names = glob.glob('data/retail_db/*/part-*')

In [None]:
src_file_names

In [None]:
import pandas as pd

In [None]:
# Read every source file as a headerless CSV and print its shape.
for file_name in src_file_names:
    # header=None: the first line is treated as data, not as column names.
    df = pd.read_csv(file_name, header=None)
    print(f'Shape of {file_name} is {df.shape}')

* Get Column Names using Schemas File

In [None]:
import json

In [None]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    """Return the column names of one data set in schema order.

    Args:
        schemas: Mapping from data set name to an iterable of column
            descriptors; each descriptor is indexed by 'column_name' and
            by ``sorting_key``.
        ds_name: Key of the data set to look up in ``schemas``.
        sorting_key: Descriptor field used to order the columns.

    Returns:
        A list with the 'column_name' of every descriptor, ordered
        ascending by ``sorting_key``.
    """
    # Copy first so the caller's schema entries are never reordered.
    ordered = list(schemas[ds_name])
    ordered.sort(key=lambda entry: entry[sorting_key])
    return [entry['column_name'] for entry in ordered]

In [None]:
# Load the schema definitions; the with-block closes the file handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [None]:
orders_columns = get_column_names(schemas, 'orders')

In [None]:
orders_columns

In [None]:
import pandas as pd

In [None]:
orders = pd.read_csv('data/retail_db/orders/part-00000', names=orders_columns)

In [None]:
orders

* Get Data Set Names from File Names or Paths using regular expressions

In [None]:
import glob

In [None]:
help(glob)

In [None]:
help(glob.glob)

In [None]:
glob.glob('data/retail_db/**', recursive=True)

In [None]:
glob.glob('data/retail_db/*/*')

In [None]:
src_file_names = glob.glob('data/retail_db/*/part-*')

In [None]:
src_file_names

In [None]:
# re is used to split paths on both '/' and '\' so the code also works on Windows
import re

In [None]:
# Show the path components of every source file.
for file in src_file_names:
    # Raw string avoids the invalid '\]' escape; the class [/\\] matches
    # either '/' or '\', so paths returned on Windows split correctly too.
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)

In [None]:
tgt_base_dir = 'data/retail_db_json'

In [None]:
file = src_file_names[0]

In [None]:
# Raw string avoids the invalid '\]' escape; [/\\] matches '/' or '\'.
file_path_list = re.split(r'[/\\]', file)

In [None]:
ds_name = file_path_list[-2]

In [None]:
file_name = file_path_list[-1]

In [None]:
f'{tgt_base_dir}/{ds_name}/{file_name}'

* Read CSV Data into Pandas Dataframe with Schema Dynamically

In [None]:
import pandas as pd

In [None]:
# Read every source file as a headerless CSV and print its shape.
# NOTE(review): the section heading mentions reading "with Schema", but this
# cell does not pass column names to read_csv — confirm whether names from the
# schemas file were meant to be applied here.
for file_name in src_file_names:
    # header=None: the first line is treated as data, not as column names.
    df = pd.read_csv(file_name, header=None)
    print(f'Shape of {file_name} is {df.shape}')

* Generate File Paths for Target JSON Files Dynamically

In [None]:
import glob

In [None]:
src_file_names = glob.glob('data/retail_db/*/part*')

In [None]:
# Build the target path for every source file: the parent folder is used as
# the data set name and the last path component as the file name.
for file in src_file_names:
    # Raw string avoids the invalid '\]' escape; the class [/\\] matches
    # either '/' or '\', so paths returned on Windows split correctly too.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{tgt_base_dir}/{ds_name}/{file_name}'
    print(json_file_path)

* Recap of Writing Pandas Dataframe to JSON File

In [None]:
import glob

In [None]:
src_file_names = glob.glob('data/retail_db/*/part*')

In [None]:
import re

In [None]:
# Show the path components of every source file.
for file in src_file_names:
    # Raw string avoids the invalid '\]' escape; the class [/\\] matches
    # either '/' or '\', so paths returned on Windows split correctly too.
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)

In [None]:
tgt_base_dir = 'data/retail_db_json'

In [None]:
file = src_file_names[0]

In [None]:
# Raw string avoids the invalid '\]' escape; [/\\] matches '/' or '\'.
file_path_list = re.split(r'[/\\]', file)

In [None]:
ds_name = file_path_list[-2]

In [None]:
file_name = file_path_list[-1]

In [None]:
f'{tgt_base_dir}/{ds_name}/{file_name}'

In [None]:
# Build the target path for every source file: the parent folder is used as
# the data set name and the last path component as the file name.
for file in src_file_names:
    # Raw string avoids the invalid '\]' escape; the class [/\\] matches
    # either '/' or '\', so paths returned on Windows split correctly too.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{tgt_base_dir}/{ds_name}/{file_name}'
    print(json_file_path)