# Here's what I did
<pre>
- Overall EDA of the data:
    1. Shapes:                         data_shape.txt
    2. Missing count:                  column_missing_count.csv
    3. Types:                          column_type.csv
    4. Shared columns across data:     columns_shared_across_csvs.json
    5. Terminology for columns:        column_description.csv

- Result: ./eda/overall/artifact/*
</pre>

In [1]:
import pandas as pd
import glob
import json
import os

In [2]:
# Every artifact produced by this notebook is written under ./artifact,
# so create the folder up front (no error if it already exists).
os.makedirs(name="./artifact", exist_ok=True)

In [3]:
# Collect every entry in the Olist download folder, in sorted order so that
# all downstream artifacts list the files in a stable sequence.
src_path = "../../downloads/olist/*"
csv_paths = sorted(glob.glob(src_path))

## Shape of data

In [4]:
# Load each discovered file once; the later cells reuse these frames,
# keyed by the path they were read from.
df_by_path = {path: pd.read_csv(path) for path in csv_paths}

# One "<file name> -> (rows, columns)" line per file, with the file name
# left-aligned in a 40-character column.
shape_lines = [
    f"{os.path.basename(path):<40} -> {frame.shape}\n"
    for path, frame in df_by_path.items()
]
with open('./artifact/data_shape.txt', 'w') as f:
    f.writelines(shape_lines)

## Missing-value count per column

In [5]:
# Count the missing (NaN) values of every column in every loaded file and
# save them as one table with columns: file_name, column_name, missing_count.
missing_columns = ['file_name', 'column_name', 'missing_count']
missing_frames = []
for csv_path, df in df_by_path.items():
    # Series (index = column name, value = NaN count) -> two-column frame.
    tmp_df = df.isna().sum().rename_axis('column_name').reset_index(name='missing_count')
    tmp_df['file_name'] = os.path.split(csv_path)[-1]
    # Select explicitly so the output column order does not depend on concat.
    missing_frames.append(tmp_df[missing_columns])

# Concatenate once after the loop: growing a frame with pd.concat on every
# iteration re-copies all earlier rows, and concatenating onto an empty frame
# is deprecated behaviour in recent pandas versions.
if missing_frames:
    missing_df = pd.concat(missing_frames, ignore_index=True)
else:
    # No input files: still produce a header-only table.
    missing_df = pd.DataFrame(columns=missing_columns)

# Write the artifact a single time (previously it was rewritten inside the
# loop on every iteration and never written when there were no input files).
missing_df.to_csv("./artifact/column_missing_count.csv", index=False)

## Types of all columns

In [6]:
# Record the pandas dtype of every column in every loaded file and save them
# as one table with columns: file_name, column_name, type.
type_columns = ['file_name', 'column_name', 'type']
type_frames = []
for csv_path, df in df_by_path.items():
    # Series (index = column name, value = dtype) -> two-column frame.
    tmp_df = df.dtypes.rename_axis('column_name').reset_index(name='type')
    tmp_df['file_name'] = os.path.split(csv_path)[-1]
    # Select explicitly so the output column order does not depend on concat.
    type_frames.append(tmp_df[type_columns])

# Concatenate once after the loop: growing a frame with pd.concat on every
# iteration re-copies all earlier rows, and concatenating onto an empty frame
# is deprecated behaviour in recent pandas versions.
if type_frames:
    type_df = pd.concat(type_frames, ignore_index=True)
else:
    # No input files: still produce a header-only table.
    type_df = pd.DataFrame(columns=type_columns)

# Write the artifact a single time (previously it was rewritten inside the
# loop on every iteration and never written when there were no input files).
type_df.to_csv("./artifact/column_type.csv", index=False)

## Columns shared across data

In [7]:
# Map each source path to the alphabetically sorted list of its column names.
columns_by_csv = {
    path: sorted(frame.columns.to_list())
    for path, frame in df_by_path.items()
}

# To inspect the mapping: print(json.dumps(columns_by_csv, indent=4))

In [8]:
# Every distinct column name that appears in any file, sorted alphabetically.
all_unique_columns = sorted(
    {col for cols in columns_by_csv.values() for col in cols}
)

In [9]:
# Invert the path -> columns mapping: for each column name (in sorted order),
# list the file names that contain it, in the order the files were loaded.
column_by_csvs = {
    unique_col: [
        os.path.basename(path)
        for path, cols in columns_by_csv.items()
        if unique_col in cols
    ]
    for unique_col in all_unique_columns
}

# Keep only the columns that occur in more than one file.
columns_shared_across_csvs = {
    col: files for col, files in column_by_csvs.items() if len(files) > 1
}

with open("./artifact/columns_shared_across_csvs.json", 'w', encoding='utf-8') as f:
    json.dump(columns_shared_across_csvs, f, indent=4, ensure_ascii=False)

## Column description

In [10]:
# IPython shell escape: runs the `column_description` module as a script.
# The recorded cell output reports './artifact/column_description.csv' as done.
# NOTE(review): presumably the module lives next to this notebook; confirm.
!python -m column_description

Done: './artifact/column_description.csv'
