In [43]:
import pandas as pd
import os

In [46]:
from ydata_profiling import ProfileReport

In [45]:
# Paths
data_path = "datasets"      # folder holding the raw CSV files
output_path = "profiling"   # folder that receives the HTML reports

# Create profiling folder if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# List all CSV files in the datasets folder.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing order (and the printed log) reproducible across machines.
# - isfile(): skip any directory that merely happens to end in ".csv".
csv_files = sorted(
    f for f in os.listdir(data_path)
    if f.endswith(".csv") and os.path.isfile(os.path.join(data_path, f))
)

for file in csv_files:
    # The translation lookup table is too small for the full report, so it is
    # skipped here; a later cell profiles it separately with minimal settings.
    if file == "product_category_name_translation.csv":
        print(f"Skipping {file} (lookup table, no profiling needed).")
        continue

    file_path = os.path.join(data_path, file)
    print(f"Processing {file}...")

    # Load dataset
    df = pd.read_csv(file_path)

    # Create profiling report
    profile = ProfileReport(df, title=f"{file} Profiling Report")

    # Build the report name from the file stem. splitext strips only the
    # trailing extension, whereas str.replace would rewrite every ".csv"
    # occurrence inside the name.
    stem, _ = os.path.splitext(file)
    out_file = os.path.join(output_path, f"{stem}_profiling_report.html")
    profile.to_file(out_file)

    print(f"Saved report to {out_file}\n")

Processing olist_customers_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/5 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.32it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_customers_dataset_profiling_report.html

Processing olist_geolocation_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/5 [00:00<?, ?it/s][A
 20%|████████████████████████                                                                                                | 1/5 [00:05<00:22,  5.73s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.20s/it][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_geolocation_dataset_profiling_report.html

Processing olist_orders_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/8 [00:00<?, ?it/s][A
 12%|███████████████                                                                                                         | 1/8 [00:05<00:37,  5.40s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.42it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_orders_dataset_profiling_report.html

Processing olist_order_items_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/7 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.78it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_order_items_dataset_profiling_report.html

Processing olist_order_payments_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/5 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_order_payments_dataset_profiling_report.html

Processing olist_order_reviews_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/7 [00:00<?, ?it/s][A
 14%|█████████████████▏                                                                                                      | 1/7 [00:03<00:23,  3.85s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.53it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_order_reviews_dataset_profiling_report.html

Processing olist_products_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                | 0/9 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 15.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_products_dataset_profiling_report.html

Processing olist_sellers_dataset.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 47.58it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\olist_sellers_dataset_profiling_report.html

Processing product_category_name_translation.csv...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 37.35it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saved report to profiling\product_category_name_translation_profiling_report.html



In [55]:
# Profile the small category-translation lookup table on its own, with a
# lighter configuration than the full reports.
translation_csv = "datasets/product_category_name_translation.csv"
translation_report = "profiling/product_category_translation_profiling_report.html"

df = pd.read_csv(translation_csv, encoding="utf-8")

profile = ProfileReport(
    df,
    title="Product Category Translation Profiling Report",
    minimal=True,   # skip heavy stats that fail on tiny data
    # NOTE(review): explorative=True is combined with minimal=True; the two
    # presets appear to pull in opposite directions — confirm this is intended.
    explorative=True
)

# Make sure the report folder exists so this cell can run without relying on
# another cell having created it first.
os.makedirs(os.path.dirname(translation_report), exist_ok=True)
profile.to_file(translation_report)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 220.56it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
# Load orders dataset directly from its CSV so the cell works on a fresh
# kernel; `datasets` is not defined anywhere in the visible notebook.
orders = pd.read_csv("datasets/olist_orders_dataset.csv")

# --- Step 1: Overview of missing values ---
# Build the null mask once and reuse it for both the counts and the shares.
null_mask = orders.isnull()

print("Missing values per column:")
print(null_mask.sum())
print("\nPercentage missing per column:")
# mean() of a boolean mask is the fraction of nulls; scale to percent.
print((null_mask.mean() * 100).round(2))

Missing values per column:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

Percentage missing per column:
order_id                         0.00
customer_id                      0.00
order_status                     0.00
order_purchase_timestamp         0.00
order_approved_at                0.16
order_delivered_carrier_date     1.79
order_delivered_customer_date    2.98
order_estimated_delivery_date    0.00
dtype: float64


In [26]:
# Orders with missing delivery date.
# `orders` is the frame bound by the missing-values overview cell; it is used
# here so the cell relies only on a name the notebook actually defines.
missing_delivery = orders[orders['order_delivered_customer_date'].isnull()]
print("\nOrders with missing delivered_customer_date:", len(missing_delivery))
# Show a small sample only; the full subset is too long to render.
display(missing_delivery.head())


Orders with missing delivered_customer_date: 2965


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
44,ee64d42b8cf066f35eac1cf57de1aa85,caded193e8e47b8362864762a83db3c5,shipped,2018-06-04 16:44:48,2018-06-05 04:31:18,2018-06-05 14:32:00,,2018-06-28 00:00:00
103,0760a852e4e9d89eb77bf631eaaf1c84,d2a79636084590b7465af8ab374a8cf5,invoiced,2018-08-03 17:44:42,2018-08-07 06:15:14,,,2018-08-21 00:00:00
128,15bed8e2fec7fdbadb186b57c46c92f2,f3f0e613e0bdb9c7cee75504f0f90679,processing,2017-09-03 14:22:03,2017-09-03 14:30:09,,,2017-10-03 00:00:00
154,6942b8da583c2f9957e990d028607019,52006a9383bf149a4fb24226b173106f,shipped,2018-01-10 11:33:07,2018-01-11 02:32:30,2018-01-11 19:39:23,,2018-02-07 00:00:00
