In [35]:
import pandas as pd
import os

In [37]:
from ydata_profiling import ProfileReport

In [39]:
# Paths (relative to the notebook's working directory)
data_path = os.path.join("..", "data", "raw")
output_path = os.path.join("..", "reports", "profiling")

# Create profiling folder if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# List all CSV files in the raw-data folder
csv_files = [f for f in os.listdir(data_path) if f.endswith(".csv")]

for file in csv_files:
    # The category translation table is excluded from the full profiling
    # done in this loop, so move on to the next file.
    if file == "product_category_name_translation.csv":
        print(f"Skipping {file} (lookup table, no profiling needed).")
        continue

    file_path = os.path.join(data_path, file)
    print(f"Processing {file}...")

    # Load dataset
    df = pd.read_csv(file_path)

    # Create profiling report
    profile = ProfileReport(df, title=f"{file} Profiling Report")

    # Derive the report name from the file stem. os.path.splitext strips only
    # the trailing extension, whereas str.replace(".csv", ...) would rewrite
    # every ".csv" occurrence inside the name (e.g. "a.csv_backup.csv").
    stem = os.path.splitext(file)[0]
    out_file = os.path.join(output_path, f"{stem}_profiling_report.html")
    profile.to_file(out_file)

    print(f"Saved report to {out_file}\n")

Processing olist_customers_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/5 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.97it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_customers_dataset_profiling_report.html

Processing olist_geolocation_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/5 [00:00<?, ?it/s]
[A%|████████████████▊                                                                   | 1/5 [00:03<00:15,  3.78s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.22it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_geolocation_dataset_profiling_report.html

Processing olist_orders_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/8 [00:00<?, ?it/s]
[A%|██████████▌                                                                         | 1/8 [00:04<00:34,  4.94s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.57it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_orders_dataset_profiling_report.html

Processing olist_order_items_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/7 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  3.31it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_order_items_dataset_profiling_report.html

Processing olist_order_payments_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/5 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.55it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_order_payments_dataset_profiling_report.html

Processing olist_order_reviews_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/7 [00:00<?, ?it/s]
[A%|████████████                                                                        | 1/7 [00:04<00:27,  4.64s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.46it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_order_reviews_dataset_profiling_report.html

Processing olist_products_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/9 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 16.99it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_products_dataset_profiling_report.html

Processing olist_sellers_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 43.32it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to ..\reports\profiling\olist_sellers_dataset_profiling_report.html

Skipping product_category_name_translation.csv (lookup table, no profiling needed).


In [45]:
# The category translation table is profiled on its own here with a reduced
# ("mini") report instead of the full report used for the other raw datasets.
special_file = "product_category_name_translation.csv"
file_path = os.path.join(data_path, special_file)

# Read the lookup table, decoding it explicitly as UTF-8.
df = pd.read_csv(file_path, encoding="utf-8")

# Keyword arguments forwarded to ProfileReport.
# NOTE(review): minimal=True and explorative=True are both requested; confirm
# in the ydata-profiling docs how these two option groups combine.
report_options = {
    "title": "Product Category Translation Profiling Report",
    "minimal": True,      # lighter profiling
    "explorative": True,  # explorative option group
}
profile = ProfileReport(df, **report_options)

# Write the HTML report next to the other profiling reports.
out_file = os.path.join(
    output_path,
    "product_category_translation_profiling_report.html",
)
profile.to_file(out_file)

print(f"Saved mini report to {out_file}\n")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 273.34it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved mini report to ..\reports\profiling\product_category_translation_profiling_report.html

