In [1]:
import pandas as pd
from helpers import get_table_columns, slugify, get_table_id
import json

In [2]:
# Input: JSON Lines file describing the tables to include in the information_schema.
# NOTE(review): this is an absolute path on one user's machine; other users must
# edit it before running the notebook.
path_to_dev_tables = "/Users/bleopold/OneDrive/data-analysis/vendors/WikiSQL/data/dev.tables.jsonl"

# Output: directory (relative to the notebook) that receives information_schema.csv.
base_target_path = "../data/01_raw/wiki-sql/schema_infos"

In [3]:
"""
Build the information_schema

* original column_name vs. clean column_name that can be used in a ddl
* column_order

# Technical process:
1. Iterate over all tables that should be included into the information_schema
2. For each table
    * set up a dataframe (information_schema_one_table) with specific columns (information_schema_column_names)
    * populate this dataframe
    * append this dataframe into the master df information_schema_all_tables
3. save information_schema_all_tables as csv
"""
# these columns I need for the information_schema
information_schema_column_names = ['table_id', 'column_name_original', 'column_name_clean', 'column_order']

# this df will hold all the information in the information_schema for all tables
information_schema_all_tables = pd.DataFrame(
    columns = information_schema_column_names
)


# get the file that contains all the tables that should be included in the information_schema
# Read the file that lists every table to include in the information_schema.
# It is consumed line by line: each kept line is later parsed as one JSON document.
# * The encoding is pinned so the result does not depend on the platform's default
#   locale (assumes the file is UTF-8, the standard encoding for JSON - TODO confirm).
# * Blank / whitespace-only lines (e.g. a trailing empty line) are dropped, because
#   json.loads would raise on them.
with open(path_to_dev_tables, 'r', encoding='utf-8') as json_file:
    tables_info_raw = [line for line in json_file if line.strip()]

# iterate over each table
# Build the information_schema rows for all tables.
#
# Every (table, column) pair becomes one plain dict; the DataFrame is created once
# after the loop. This replaces the former per-table `DataFrame.append` call, which
# no longer exists in pandas >= 2.0, and avoids re-copying the whole master frame
# for every table (quadratic work that made this cell very slow on large inputs).
information_schema_records = []

for table in tables_info_raw:
    # each entry of tables_info_raw is one JSON document describing a single table
    result = json.loads(table)

    original_column_names = get_table_columns(result)
    table_id = get_table_id(result)

    # one record per column; column_order is the column's 0-based position in the table
    for column_order, column_name_original in enumerate(original_column_names):
        information_schema_records.append(
            {
                "table_id": table_id,
                "column_name_original": column_name_original,
                # cleaned name produced by the project's slugify helper
                "column_name_clean": slugify(column_name_original),
                "column_order": column_order,
            }
        )

# Passing `columns=` fixes the column order and keeps the expected (empty) layout
# even when no table contributed any row. The index is a fresh 0..n-1 range.
information_schema_all_tables = pd.DataFrame(
    information_schema_records,
    columns=information_schema_column_names,
)
    
# save to disk
# Persist the combined information_schema below base_target_path.
# The file is tab-separated (despite the .csv extension) and written without the index.
information_schema_all_tables.to_csv(
    f"{base_target_path}/information_schema.csv",
    sep="\t",
    index=False,
)

# simple completion marker for the notebook output
print("done")

KeyboardInterrupt: 