In [1]:
import argparse
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F
import time
from tqdm import tqdm
from pyspark.sql.functions import datediff, col
from pyspark.sql.functions import datediff

from pyspark.sql.types import StringType, IntegerType, FloatType, DateType, DoubleType, TimestampType, LongType
from pyspark.sql.functions import col, lower, trim, when,row_number, count,date_add, when,to_date, lit
from pyspark.sql import Window
from pyspark.sql.window import Window

import utils.data_processing_bronze_table as bronze_processing
import utils.data_processing_silver_table as silver_processing
import utils.data_processing_gold_feature as gold_processing_feature
import utils.data_processing_gold_label_table as gold_label_processing

## set up pyspark session

In [2]:
print('\n\n---starting job---\n\n')

# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("data_processing") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")



---starting job---




Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/22 16:57:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Build Bronze Table

In [3]:
# Create bronze root directory
bronze_root = "datamart/bronze"
os.makedirs(bronze_root, exist_ok=True)
print(f"Bronze root directory: {bronze_root}")

Bronze root directory: datamart/bronze


In [4]:
# Process all Olist datasets
print("Processing Olist datasets...\n")
bronze_processing.process_olist_customers_bronze(bronze_root, spark)
print('-------------------------------------------------')
bronze_processing.process_olist_geolocation_bronze(bronze_root, spark)
print('-------------------------------------------------')
bronze_processing.process_olist_order_items_bronze(bronze_root, spark)
# bronze_processing.process_olist_order_payments_bronze(bronze_root, spark)
# bronze_processing.process_olist_order_reviews_bronze(bronze_root, spark)
print('-------------------------------------------------')
bronze_processing.process_olist_products_bronze(bronze_root, spark)
print('-------------------------------------------------')
bronze_processing.process_olist_sellers_bronze(bronze_root, spark)
print('-------------------------------------------------')
bronze_processing.process_product_cat_translation_bronze(bronze_root, spark)
print('-------------------------------------------------')

Processing Olist datasets...

loaded data/olist_customers_dataset.csv  →  99,441 rows


                                                                                

----> saved bronze: datamart/bronze/customers/bronze_olist_customers.parquet
-------------------------------------------------
loaded data/olist_geolocation_dataset.csv  →  1,000,325 rows


                                                                                

----> saved bronze: datamart/bronze/geolocation/bronze_olist_geolocation.parquet
-------------------------------------------------
loaded data/olist_order_items_dataset.csv  →  112,650 rows
----> saved bronze: datamart/bronze/order_items/bronze_olist_order_items.parquet
-------------------------------------------------
loaded data/olist_products_dataset.csv  →  32,951 rows
----> saved bronze: datamart/bronze/products/bronze_olist_products.parquet
-------------------------------------------------
loaded data/olist_sellers_dataset.csv  →  3,095 rows
----> saved bronze: datamart/bronze/sellers/bronze_olist_sellers.parquet
-------------------------------------------------
loaded data/product_category_name_translation.csv  →  72 rows
----> saved bronze: datamart/bronze/category_translation/bronze_product_category_translation.parquet
-------------------------------------------------


In [5]:
# Process orders with monthly partitioning
bronze_processing.process_olist_orders_bronze(bronze_root, spark)

                                                                                

Day 2018_04_10: 201 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_10.csv


                                                                                

Day 2017_10_12: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_12.csv


                                                                                

Day 2017_07_20: 142 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_20.csv


                                                                                

Day 2017_06_18: 90 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_18.csv


                                                                                

Day 2017_09_04: 150 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_04.csv


                                                                                

Day 2017_01_19: 29 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_19.csv


                                                                                

Day 2017_05_17: 141 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_17.csv


                                                                                

Day 2017_04_15: 52 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_15.csv


                                                                                

Day 2017_06_30: 95 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_30.csv


                                                                                

Day 2017_05_28: 98 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_28.csv


                                                                                

Day 2017_09_29: 121 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_29.csv


                                                                                

Day 2017_08_17: 158 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_17.csv


                                                                                

Day 2018_01_21: 199 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_21.csv


                                                                                

Day 2017_08_06: 109 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_06.csv


                                                                                

Day 2018_04_21: 156 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_21.csv


                                                                                

Day 2017_06_16: 97 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_16.csv


                                                                                

Day 2017_07_19: 153 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_19.csv


                                                                                

Day 2017_07_15: 98 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_15.csv


                                                                                

Day 2017_06_26: 72 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_26.csv
Day 2018_05_12: 207 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_12.csv


                                                                                

Day 2017_06_06: 134 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_06.csv


                                                                                

Day 2018_08_09: 289 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_09.csv


                                                                                

Day 2017_08_24: 128 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_24.csv
Day 2017_01_17: 32 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_17.csv


                                                                                

Day 2018_07_27: 189 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_27.csv


                                                                                

Day 2017_02_23: 59 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_23.csv


                                                                                

Day 2017_09_12: 206 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_12.csv
Day 2017_03_29: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_29.csv


                                                                                

Day 2017_03_12: 64 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_12.csv


                                                                                

Day 2017_08_07: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_07.csv


                                                                                

Day 2018_06_01: 184 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_01.csv


                                                                                

Day 2018_01_18: 241 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_18.csv


                                                                                

Day 2017_12_17: 133 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_17.csv


                                                                                

Day 2018_08_03: 314 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_03.csv


                                                                                

Day 2017_11_03: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_03.csv


                                                                                

Day 2017_09_03: 125 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_03.csv


                                                                                

Day 2017_02_12: 64 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_12.csv


                                                                                

Day 2017_10_05: 140 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_05.csv


                                                                                

Day 2017_10_13: 154 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_13.csv


                                                                                

Day 2017_09_01: 158 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_01.csv


                                                                                

Day 2017_08_09: 144 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_09.csv


                                                                                

Day 2018_07_15: 152 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_15.csv


                                                                                

Day 2017_09_19: 154 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_19.csv


                                                                                

Day 2018_07_12: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_12.csv


                                                                                

Day 2017_02_09: 77 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_09.csv


                                                                                

Day 2017_06_24: 76 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_24.csv


                                                                                

Day 2017_11_07: 160 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_07.csv


                                                                                

Day 2018_08_20: 256 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_20.csv


                                                                                

Day 2017_08_20: 104 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_20.csv


                                                                                

Day 2017_04_01: 68 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_01.csv


                                                                                

Day 2018_01_11: 266 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_11.csv


                                                                                

Day 2017_07_14: 149 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_14.csv


                                                                                

Day 2018_09_17: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_17.csv
Day 2018_02_02: 212 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_02.csv


                                                                                

Day 2018_03_08: 234 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_08.csv


                                                                                

Day 2018_01_17: 282 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_17.csv


                                                                                

Day 2018_06_25: 241 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_25.csv
Day 2017_03_09: 93 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_09.csv


                                                                                

Day 2018_07_08: 122 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_08.csv


                                                                                

Day 2018_06_16: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_16.csv


                                                                                

Day 2017_10_09: 193 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_09.csv


                                                                                

Day 2018_01_15: 307 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_15.csv


                                                                                

Day 2018_07_30: 288 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_30.csv


                                                                                

Day 2017_11_16: 226 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_16.csv


                                                                                

Day 2017_11_04: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_04.csv


                                                                                

Day 2018_08_22: 187 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_22.csv


                                                                                

Day 2018_01_13: 219 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_13.csv


                                                                                

Day 2017_03_21: 97 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_21.csv


                                                                                

Day 2017_08_23: 112 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_23.csv


                                                                                

Day 2017_01_22: 31 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_22.csv


                                                                                

Day 2017_08_30: 181 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_30.csv


                                                                                

Day 2017_09_23: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_23.csv


                                                                                

Day 2018_07_07: 91 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_07.csv


                                                                                

Day 2017_05_04: 109 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_04.csv


                                                                                

Day 2018_05_13: 207 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_13.csv


                                                                                

Day 2017_08_21: 177 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_21.csv


                                                                                

Day 2017_03_16: 103 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_16.csv


                                                                                

Day 2017_03_20: 119 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_20.csv


                                                                                

Day 2017_01_18: 33 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_18.csv


                                                                                

Day 2018_01_24: 244 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_24.csv


                                                                                

Day 2017_03_22: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_22.csv


                                                                                

Day 2017_09_30: 109 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_30.csv


                                                                                

Day 2017_04_06: 96 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_06.csv


                                                                                

Day 2018_04_04: 257 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_04.csv


                                                                                

Day 2018_02_10: 179 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_10.csv


                                                                                

Day 2018_05_05: 197 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_05.csv


                                                                                

Day 2017_07_26: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_26.csv


                                                                                

Day 2018_08_25: 69 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_25.csv
Day 2017_06_12: 126 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_12.csv


                                                                                

Day 2017_03_02: 72 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_02.csv


                                                                                

Day 2017_01_21: 24 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_21.csv


                                                                                

Day 2018_08_02: 302 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_02.csv


                                                                                

Day 2018_04_12: 255 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_12.csv


                                                                                

Day 2017_04_28: 98 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_28.csv


                                                                                

Day 2018_05_03: 305 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_03.csv


                                                                                

Day 2018_01_28: 156 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_28.csv


                                                                                

Day 2018_08_17: 257 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_17.csv
Day 2017_12_18: 209 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_18.csv
Day 2017_04_02: 65 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_02.csv


                                                                                

Day 2018_02_21: 266 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_21.csv


                                                                                

Day 2018_07_26: 247 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_26.csv


                                                                                

Day 2018_03_24: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_24.csv


                                                                                

Day 2017_02_04: 67 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_04.csv
Day 2018_06_17: 152 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_17.csv


                                                                                

Day 2018_09_13: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_13.csv


                                                                                

Day 2018_05_19: 139 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_19.csv


                                                                                

Day 2018_08_05: 276 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_05.csv


                                                                                

Day 2018_06_09: 172 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_09.csv


                                                                                

Day 2017_06_14: 131 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_14.csv


                                                                                

Day 2018_07_16: 245 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_16.csv


                                                                                

Day 2017_01_31: 70 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_31.csv


                                                                                

Day 2016_09_05: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_09_05.csv


                                                                                

Day 2017_04_20: 98 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_20.csv


                                                                                

Day 2018_07_31: 322 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_31.csv


                                                                                

Day 2018_03_16: 252 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_16.csv


                                                                                

Day 2018_03_25: 190 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_25.csv


                                                                                

Day 2017_09_22: 138 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_22.csv


                                                                                

Day 2018_03_19: 303 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_19.csv


                                                                                

Day 2017_08_11: 141 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_11.csv


                                                                                

Day 2017_12_31: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_31.csv


                                                                                

Day 2017_12_12: 260 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_12.csv


                                                                                

Day 2017_02_02: 69 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_02.csv
Day 2018_01_22: 314 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_22.csv


                                                                                

Day 2017_10_01: 128 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_01.csv


                                                                                

Day 2017_10_20: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_20.csv


                                                                                

Day 2018_08_04: 245 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_04.csv


                                                                                

Day 2018_08_19: 204 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_19.csv


                                                                                

Day 2017_01_26: 86 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_26.csv


                                                                                

Day 2017_12_27: 167 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_27.csv


                                                                                

Day 2017_11_14: 192 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_14.csv


                                                                                

Day 2017_10_02: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_02.csv


                                                                                

Day 2017_10_31: 160 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_31.csv


                                                                                

Day 2018_06_29: 174 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_29.csv


                                                                                

Day 2018_03_06: 271 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_06.csv


                                                                                

Day 2017_08_18: 147 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_18.csv


                                                                                

Day 2018_08_08: 316 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_08.csv


                                                                                

Day 2018_01_27: 153 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_27.csv


                                                                                

Day 2018_01_10: 277 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_10.csv


                                                                                

Day 2017_11_27: 403 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_27.csv


                                                                                

Day 2018_08_24: 99 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_24.csv


                                                                                

Day 2017_08_29: 137 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_29.csv


                                                                                

Day 2018_07_09: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_09.csv


                                                                                

Day 2017_08_26: 93 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_26.csv


                                                                                

Day 2018_04_25: 284 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_25.csv


                                                                                

Day 2017_08_01: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_01.csv


                                                                                

Day 2017_05_03: 103 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_03.csv


                                                                                

Day 2018_03_10: 193 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_10.csv


                                                                                

Day 2017_08_02: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_02.csv


                                                                                

Day 2017_09_18: 183 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_18.csv
Day 2018_06_18: 246 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_18.csv


                                                                                

Day 2018_01_09: 252 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_09.csv


                                                                                

Day 2018_01_14: 235 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_14.csv


                                                                                

Day 2017_04_03: 72 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_03.csv


                                                                                

Day 2017_05_13: 73 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_13.csv


                                                                                

Day 2018_01_07: 196 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_07.csv


                                                                                

Day 2017_03_14: 99 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_14.csv


                                                                                

Day 2017_01_14: 18 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_14.csv


                                                                                

Day 2018_02_04: 201 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_04.csv


                                                                                

Day 2017_10_10: 185 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_10.csv


                                                                                

Day 2018_03_01: 277 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_01.csv


                                                                                

Day 2017_06_19: 156 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_19.csv


                                                                                

Day 2017_07_07: 121 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_07.csv


                                                                                

Day 2018_07_22: 215 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_22.csv


                                                                                

Day 2017_03_28: 103 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_28.csv
Day 2018_06_28: 243 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_28.csv


                                                                                

Day 2017_04_14: 54 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_14.csv


                                                                                

Day 2018_03_21: 286 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_21.csv


                                                                                

Day 2018_06_10: 193 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_10.csv


                                                                                

Day 2017_03_31: 70 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_31.csv


                                                                                

Day 2018_02_26: 299 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_26.csv


                                                                                

Day 2017_11_01: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_01.csv


                                                                                

Day 2017_02_21: 41 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_21.csv


                                                                                

Day 2017_10_24: 164 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_24.csv


                                                                                

Day 2018_01_30: 258 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_30.csv


                                                                                

Day 2018_06_19: 231 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_19.csv
Day 2018_06_13: 259 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_13.csv


                                                                                

Day 2018_05_24: 114 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_24.csv


                                                                                

Day 2017_03_08: 79 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_08.csv


                                                                                

Day 2018_03_18: 215 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_18.csv


                                                                                

Day 2017_09_10: 136 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_10.csv


                                                                                

Day 2018_06_20: 217 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_20.csv


                                                                                

Day 2018_08_28: 44 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_28.csv


                                                                                

Day 2018_01_20: 184 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_20.csv


                                                                                

Day 2018_07_29: 188 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_29.csv


                                                                                

Day 2018_02_17: 204 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_17.csv
Day 2018_01_26: 229 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_26.csv


                                                                                

Day 2017_07_06: 154 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_06.csv


                                                                                

Day 2018_03_26: 272 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_26.csv


                                                                                

Day 2017_12_11: 267 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_11.csv


                                                                                

Day 2017_03_17: 65 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_17.csv


                                                                                

Day 2017_06_17: 75 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_17.csv


                                                                                

Day 2017_07_21: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_21.csv


                                                                                

Day 2017_01_23: 39 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_23.csv


                                                                                

Day 2018_05_17: 228 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_17.csv


                                                                                

Day 2018_04_29: 171 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_29.csv


                                                                                

Day 2017_11_30: 267 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_30.csv


                                                                                

Day 2018_05_28: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_28.csv


                                                                                

Day 2018_03_22: 254 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_22.csv


                                                                                

Day 2017_11_26: 391 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_26.csv


                                                                                

Day 2018_06_07: 223 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_07.csv


                                                                                

Day 2017_06_29: 114 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_29.csv


                                                                                

Day 2017_02_18: 54 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_18.csv


                                                                                

Day 2017_09_15: 150 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_15.csv


                                                                                

Day 2018_05_07: 372 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_07.csv


                                                                                

Day 2017_11_19: 158 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_19.csv


                                                                                

Day 2018_08_18: 198 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_18.csv


                                                                                

Day 2018_06_21: 234 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_21.csv


                                                                                

Day 2017_06_22: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_22.csv


                                                                                

Day 2018_03_29: 194 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_29.csv
Day 2017_07_23: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_23.csv


                                                                                

Day 2018_04_07: 164 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_07.csv


                                                                                

Day 2017_09_09: 106 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_09.csv


                                                                                

Day 2017_02_25: 43 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_25.csv


                                                                                

Day 2018_04_08: 185 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_08.csv


                                                                                

Day 2017_12_25: 90 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_25.csv


                                                                                

Day 2017_12_13: 218 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_13.csv


                                                                                

Day 2018_02_25: 237 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_25.csv


                                                                                

Day 2018_03_30: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_30.csv


                                                                                

Day 2017_09_14: 166 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_14.csv
Day 2017_07_27: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_27.csv


                                                                                

Day 2018_03_11: 218 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_11.csv


                                                                                

Day 2017_08_19: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_19.csv


                                                                                

Day 2017_01_11: 12 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_11.csv


                                                                                

Day 2017_11_23: 283 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_23.csv


                                                                                

Day 2018_09_11: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_11.csv


                                                                                

Day 2017_07_09: 94 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_09.csv


                                                                                

Day 2017_10_27: 133 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_27.csv


                                                                                

Day 2017_10_07: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_07.csv


                                                                                

Day 2018_04_27: 242 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_27.csv


                                                                                

Day 2017_02_17: 46 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_17.csv
Day 2017_11_20: 230 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_20.csv


                                                                                

Day 2017_07_12: 153 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_12.csv
Day 2017_02_28: 55 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_28.csv
Day 2017_03_15: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_15.csv
Day 2018_05_25: 104 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_25.csv


                                                                                

Day 2017_06_25: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_25.csv


                                                                                

Day 2017_12_01: 275 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_01.csv


                                                                                

Day 2017_11_25: 499 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_25.csv


                                                                                

Day 2017_04_24: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_24.csv


                                                                                

Day 2018_08_13: 292 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_13.csv


                                                                                

Day 2017_12_20: 171 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_20.csv


                                                                                

Day 2017_07_16: 114 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_16.csv


                                                                                

Day 2017_09_21: 150 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_21.csv


                                                                                

Day 2018_02_08: 230 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_08.csv


                                                                                

Day 2018_01_01: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_01.csv


                                                                                

Day 2018_05_23: 132 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_23.csv


                                                                                

Day 2018_05_18: 236 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_18.csv


                                                                                

Day 2017_03_03: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_03.csv


                                                                                

Day 2018_05_22: 199 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_22.csv


                                                                                

Day 2017_01_30: 53 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_30.csv


                                                                                

Day 2017_02_15: 62 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_15.csv


                                                                                

Day 2017_11_18: 149 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_18.csv


                                                                                

Day 2018_07_06: 121 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_06.csv


                                                                                

Day 2018_07_18: 307 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_18.csv


                                                                                

Day 2018_08_10: 256 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_10.csv


                                                                                

Day 2018_04_18: 280 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_18.csv


                                                                                

Day 2018_08_11: 188 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_11.csv


                                                                                

Day 2018_05_29: 149 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_29.csv
Day 2018_05_08: 331 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_08.csv


                                                                                

Day 2017_07_05: 145 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_05.csv


                                                                                

Day 2018_06_12: 246 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_12.csv


                                                                                

Day 2017_05_07: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_07.csv


                                                                                

Day 2017_03_07: 82 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_07.csv


                                                                                

Day 2017_10_17: 202 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_17.csv
Day 2018_07_20: 250 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_20.csv


                                                                                

Day 2017_03_19: 56 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_19.csv


                                                                                

Day 2017_09_17: 126 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_17.csv
Day 2017_10_29: 132 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_29.csv


                                                                                

Day 2018_08_23: 144 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_23.csv


                                                                                

Day 2017_09_27: 147 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_27.csv


                                                                                

Day 2018_08_07: 370 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_07.csv


                                                                                

Day 2016_10_10: 39 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_10.csv


                                                                                

Day 2017_12_26: 168 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_26.csv


                                                                                

Day 2017_05_02: 128 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_02.csv


                                                                                

Day 2018_04_06: 189 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_06.csv


                                                                                

Day 2017_04_18: 71 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_18.csv


                                                                                

Day 2017_09_20: 163 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_20.csv


                                                                                

Day 2017_10_11: 158 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_11.csv


                                                                                

Day 2018_02_13: 224 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_13.csv


                                                                                

Day 2018_04_16: 280 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_16.csv


                                                                                

Day 2018_06_26: 243 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_26.csv


                                                                                

Day 2017_12_02: 216 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_02.csv


                                                                                

Day 2017_07_02: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_02.csv
Day 2018_03_31: 168 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_31.csv


                                                                                

Day 2017_02_06: 85 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_06.csv


                                                                                

Day 2017_03_25: 76 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_25.csv
Day 2016_10_04: 63 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_04.csv


                                                                                

Day 2017_06_01: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_01.csv


                                                                                

Day 2018_05_04: 265 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_04.csv


                                                                                

Day 2018_08_14: 316 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_14.csv


                                                                                

Day 2017_04_11: 78 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_11.csv


                                                                                

Day 2018_02_12: 218 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_12.csv


                                                                                

Day 2017_05_14: 112 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_14.csv


                                                                                

Day 2018_05_30: 140 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_30.csv


                                                                                

Day 2017_11_06: 193 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_06.csv


                                                                                

Day 2018_08_12: 197 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_12.csv


                                                                                

Day 2018_07_10: 134 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_10.csv


                                                                                

Day 2017_11_21: 228 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_21.csv


                                                                                

Day 2017_06_05: 147 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_05.csv


                                                                                

Day 2018_01_31: 256 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_31.csv
Day 2018_03_27: 245 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_27.csv


                                                                                

Day 2017_12_07: 229 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_07.csv


                                                                                

Day 2017_04_26: 125 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_26.csv


                                                                                

Day 2018_04_14: 146 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_14.csv


                                                                                

Day 2017_07_25: 153 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_25.csv


                                                                                

Day 2018_08_15: 288 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_15.csv


                                                                                

Day 2017_04_05: 96 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_05.csv


                                                                                

Day 2017_04_19: 97 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_19.csv


                                                                                

Day 2017_05_29: 160 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_29.csv


                                                                                

Day 2017_09_28: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_28.csv


                                                                                

Day 2017_10_06: 130 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_06.csv
Day 2017_05_21: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_21.csv


                                                                                

Day 2017_12_14: 200 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_14.csv


                                                                                

Day 2017_08_14: 159 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_14.csv


                                                                                

Day 2018_05_11: 247 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_11.csv


                                                                                

Day 2018_03_07: 259 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_07.csv


                                                                                

Day 2017_07_10: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_10.csv


                                                                                

Day 2018_01_16: 302 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_16.csv


                                                                                

Day 2017_08_28: 155 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_28.csv


                                                                                

Day 2018_05_20: 161 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_20.csv


                                                                                

Day 2017_07_08: 84 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_08.csv


                                                                                

Day 2018_04_02: 282 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_02.csv


                                                                                

Day 2018_05_06: 211 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_06.csv


                                                                                

Day 2018_07_14: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_14.csv


                                                                                

Day 2017_02_14: 76 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_14.csv


                                                                                

Day 2017_03_18: 73 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_18.csv


                                                                                

Day 2018_01_02: 204 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_02.csv


                                                                                

Day 2018_02_07: 248 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_07.csv


                                                                                

Day 2017_03_11: 71 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_11.csv


                                                                                

Day 2017_05_09: 128 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_09.csv


                                                                                

Day 2018_06_02: 142 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_02.csv


                                                                                

Day 2018_09_20: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_20.csv


                                                                                

Day 2017_10_28: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_28.csv


                                                                                

Day 2017_10_14: 116 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_14.csv


                                                                                

Day 2017_02_16: 55 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_16.csv


                                                                                

Day 2018_07_13: 168 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_13.csv


                                                                                

Day 2018_04_01: 207 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_01.csv
Day 2018_03_14: 199 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_14.csv


                                                                                

Day 2017_04_30: 68 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_30.csv


                                                                                

Day 2018_01_05: 210 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_05.csv


                                                                                

Day 2017_07_04: 125 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_04.csv


                                                                                

Day 2018_06_04: 225 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_04.csv


                                                                                

Day 2018_08_27: 67 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_27.csv


                                                                                

Day 2018_05_21: 206 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_21.csv


                                                                                

Day 2018_06_14: 212 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_14.csv


                                                                                

Day 2017_05_05: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_05.csv


                                                                                

Day 2017_11_29: 323 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_29.csv


                                                                                

Day 2017_04_08: 68 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_08.csv


                                                                                

Day 2017_02_05: 75 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_05.csv


                                                                                

Day 2017_10_08: 126 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_08.csv


                                                                                

Day 2017_01_20: 29 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_20.csv


                                                                                

Day 2017_07_22: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_22.csv


                                                                                

Day 2018_06_24: 199 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_24.csv


                                                                                

Day 2018_03_05: 264 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_05.csv


                                                                                

Day 2017_01_16: 19 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_16.csv


                                                                                

Day 2017_08_04: 142 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_04.csv


                                                                                

Day 2017_12_06: 280 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_06.csv


                                                                                

Day 2017_02_13: 78 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_13.csv


                                                                                

Day 2017_03_13: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_13.csv


                                                                                

Day 2018_01_25: 233 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_25.csv


                                                                                

Day 2017_05_20: 72 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_20.csv


                                                                                

Day 2018_02_09: 216 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_09.csv


                                                                                

Day 2017_11_17: 197 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_17.csv


                                                                                

Day 2017_07_29: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_29.csv


                                                                                

Day 2017_11_09: 191 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_09.csv


                                                                                

Day 2018_02_24: 191 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_24.csv


                                                                                

Day 2018_03_20: 297 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_20.csv


                                                                                

Day 2016_09_04: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_09_04.csv


                                                                                

Day 2018_02_28: 313 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_28.csv
Day 2018_06_30: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_30.csv


                                                                                

Day 2018_01_03: 225 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_03.csv


                                                                                

Day 2017_08_15: 194 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_15.csv


                                                                                

Day 2016_10_09: 26 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_09.csv


                                                                                

Day 2018_01_19: 235 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_19.csv


                                                                                

Day 2016_10_07: 46 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_07.csv


                                                                                

Day 2017_01_12: 13 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_12.csv


                                                                                

Day 2017_09_11: 180 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_11.csv


                                                                                

Day 2017_05_24: 142 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_24.csv


                                                                                

Day 2017_08_27: 119 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_27.csv


                                                                                

Day 2017_05_18: 125 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_18.csv


                                                                                

Day 2018_07_28: 176 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_28.csv
Day 2017_03_06: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_06.csv


                                                                                

Day 2017_04_29: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_29.csv


                                                                                

Day 2018_04_09: 253 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_09.csv


                                                                                

Day 2018_02_22: 279 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_22.csv
Day 2017_11_12: 174 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_12.csv


                                                                                

Day 2017_01_07: 4 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_07.csv


                                                                                

Day 2017_07_13: 137 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_13.csv


                                                                                

Day 2018_02_01: 232 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_01.csv


                                                                                

Day 2017_01_15: 14 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_15.csv


                                                                                

Day 2018_02_05: 271 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_05.csv


                                                                                

Day 2018_07_04: 256 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_04.csv


                                                                                

Day 2017_02_07: 112 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_07.csv


                                                                                

Day 2017_07_30: 117 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_30.csv


                                                                                

Day 2018_06_08: 198 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_08.csv


                                                                                

Day 2018_07_03: 216 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_03.csv


                                                                                

Day 2017_05_10: 116 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_10.csv


                                                                                

Day 2018_08_30: 4 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_30.csv


                                                                                

Day 2017_05_19: 144 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_19.csv


                                                                                

Day 2017_08_31: 149 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_31.csv


                                                                                

Day 2017_02_20: 63 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_20.csv


                                                                                

Day 2017_11_11: 159 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_11.csv


                                                                                

Day 2017_10_25: 156 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_25.csv


                                                                                

Day 2018_07_19: 253 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_19.csv


                                                                                

Day 2018_01_08: 293 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_08.csv


                                                                                

Day 2017_02_08: 95 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_08.csv
Day 2017_04_13: 60 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_13.csv


                                                                                

Day 2017_01_28: 29 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_28.csv


                                                                                

Day 2018_08_01: 311 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_01.csv
Day 2018_03_28: 220 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_28.csv


                                                                                

Day 2017_07_31: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_31.csv


                                                                                

Day 2018_04_03: 247 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_03.csv


                                                                                

Day 2017_03_04: 80 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_04.csv


                                                                                

Day 2017_05_27: 83 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_27.csv


                                                                                

Day 2017_04_07: 85 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_07.csv


                                                                                

Day 2018_07_23: 307 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_23.csv


                                                                                

Day 2017_06_04: 102 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_04.csv


                                                                                

Day 2018_07_01: 161 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_01.csv


                                                                                

Day 2017_02_24: 64 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_24.csv


                                                                                

Day 2017_08_12: 98 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_12.csv


                                                                                

Day 2017_10_16: 195 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_16.csv


                                                                                

Day 2017_12_08: 219 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_08.csv


                                                                                

Day 2017_06_08: 137 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_08.csv


                                                                                

Day 2016_10_03: 8 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_03.csv


                                                                                

Day 2018_03_12: 232 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_12.csv
Day 2018_05_02: 296 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_02.csv


                                                                                

Day 2018_07_02: 195 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_02.csv


                                                                                

Day 2018_06_23: 151 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_23.csv


                                                                                

Day 2017_12_28: 146 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_28.csv


                                                                                

Day 2017_03_01: 99 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_01.csv


                                                                                

Day 2017_01_29: 35 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_29.csv


                                                                                

Day 2018_04_30: 240 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_30.csv


                                                                                

Day 2017_05_06: 109 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_06.csv


                                                                                

Day 2017_09_25: 136 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_25.csv


                                                                                

Day 2018_04_17: 266 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_17.csv


                                                                                

Day 2017_05_08: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_08.csv


                                                                                

Day 2017_11_05: 144 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_05.csv


                                                                                

Day 2017_08_10: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_10.csv


                                                                                

Day 2017_06_28: 120 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_28.csv


                                                                                

Day 2017_12_29: 135 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_29.csv


                                                                                

Day 2018_08_29: 14 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_29.csv


                                                                                

Day 2017_05_23: 125 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_23.csv


                                                                                

Day 2017_05_12: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_12.csv


                                                                                

Day 2018_02_03: 193 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_03.csv


                                                                                

Day 2017_11_13: 205 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_13.csv


                                                                                

Day 2017_12_30: 97 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_30.csv


                                                                                

Day 2018_02_27: 298 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_27.csv


                                                                                

Day 2018_07_11: 127 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_11.csv


                                                                                

Day 2018_06_11: 294 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_11.csv
Day 2018_08_21: 243 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_21.csv


                                                                                

Day 2017_10_18: 179 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_18.csv
Day 2017_04_25: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_25.csv


                                                                                

Day 2017_11_08: 175 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_08.csv


                                                                                

Day 2017_08_25: 123 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_25.csv


                                                                                

Day 2017_06_21: 35 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_21.csv


                                                                                

Day 2017_01_25: 63 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_25.csv


                                                                                

Day 2017_03_10: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_10.csv


                                                                                

Day 2017_08_03: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_03.csv


                                                                                

Day 2017_09_06: 140 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_06.csv


                                                                                

Day 2017_11_24: 1176 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_24.csv


                                                                                

Day 2018_01_06: 216 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_06.csv


                                                                                

Day 2017_05_30: 105 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_30.csv


                                                                                

Day 2017_10_04: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_04.csv


                                                                                

Day 2018_04_19: 293 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_19.csv


                                                                                

Day 2017_06_15: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_15.csv


                                                                                

Day 2018_02_19: 259 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_19.csv


                                                                                

Day 2018_02_15: 283 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_15.csv


                                                                                

Day 2017_04_22: 84 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_22.csv


                                                                                

Day 2018_05_26: 99 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_26.csv


                                                                                

Day 2017_07_18: 192 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_18.csv


                                                                                

Day 2018_04_26: 254 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_26.csv


                                                                                

Day 2017_06_02: 127 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_02.csv


                                                                                

Day 2017_12_03: 234 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_03.csv
Day 2018_02_18: 202 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_18.csv


                                                                                

Day 2017_04_09: 73 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_09.csv


                                                                                

Day 2017_06_09: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_09.csv


                                                                                

Day 2017_05_15: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_15.csv


                                                                                

Day 2017_10_26: 144 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_26.csv


                                                                                

Day 2017_12_04: 337 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_04.csv


                                                                                

Day 2018_06_03: 192 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_03.csv


                                                                                

Day 2017_05_25: 108 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_25.csv


                                                                                

Day 2018_05_14: 364 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_14.csv


                                                                                

Day 2017_12_05: 282 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_05.csv


                                                                                

Day 2017_03_24: 79 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_24.csv


                                                                                

Day 2017_04_17: 65 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_17.csv


                                                                                

Day 2016_10_08: 42 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_08.csv


                                                                                

Day 2018_02_11: 172 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_11.csv


                                                                                

Day 2018_08_16: 320 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_16.csv


                                                                                

Day 2017_12_16: 131 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_16.csv


                                                                                

Day 2017_02_26: 46 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_26.csv


                                                                                

Day 2018_05_09: 344 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_09.csv


                                                                                

Day 2018_01_04: 258 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_04.csv


                                                                                

Day 2017_09_07: 99 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_07.csv


                                                                                

Day 2017_09_02: 104 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_02.csv


                                                                                

Day 2018_06_05: 200 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_05.csv


                                                                                

Day 2018_01_23: 262 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_23.csv


                                                                                

Day 2018_05_27: 100 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_27.csv


                                                                                

Day 2017_07_28: 132 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_28.csv


                                                                                

Day 2017_09_05: 157 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_05.csv


                                                                                

Day 2017_09_08: 116 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_08.csv


                                                                                

Day 2017_02_10: 66 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_10.csv


                                                                                

Day 2017_02_19: 40 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_19.csv


                                                                                

Day 2017_04_27: 80 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_27.csv


                                                                                

Day 2017_10_19: 176 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_19.csv


                                                                                

Day 2016_10_05: 47 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_05.csv


                                                                                

Day 2017_04_12: 63 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_12.csv


                                                                                

Day 2018_04_13: 202 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_13.csv


                                                                                

Day 2018_02_06: 268 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_06.csv


                                                                                

Day 2017_06_10: 77 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_10.csv


                                                                                

Day 2017_07_24: 162 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_24.csv


                                                                                

Day 2018_03_03: 214 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_03.csv


                                                                                

Day 2016_09_13: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_09_13.csv


                                                                                

Day 2017_07_01: 80 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_01.csv


                                                                                

Day 2018_05_15: 352 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_15.csv


                                                                                

Day 2017_11_28: 380 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_28.csv


                                                                                

Day 2017_01_05: 32 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_05.csv


                                                                                

Day 2017_03_27: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_27.csv


                                                                                

Day 2018_03_04: 236 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_04.csv


                                                                                

Day 2018_06_15: 186 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_15.csv


                                                                                

Day 2018_03_13: 227 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_13.csv


                                                                                

Day 2017_11_15: 186 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_15.csv


                                                                                

Day 2018_02_14: 295 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_14.csv


                                                                                

Day 2017_07_17: 168 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_17.csv


                                                                                

Day 2018_01_12: 243 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_12.csv


                                                                                

Day 2017_07_03: 118 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_03.csv


                                                                                

Day 2017_06_03: 89 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_03.csv


                                                                                

Day 2017_12_24: 59 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_24.csv


                                                                                

Day 2018_06_06: 227 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_06.csv


                                                                                

Day 2017_07_11: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_07_11.csv


                                                                                

Day 2018_08_06: 372 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_06.csv


                                                                                

Day 2018_02_16: 214 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_16.csv


                                                                                

Day 2017_11_10: 165 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_10.csv


                                                                                

Day 2018_03_15: 290 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_15.csv


                                                                                

Day 2018_02_20: 289 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_20.csv


                                                                                

Day 2017_02_11: 49 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_11.csv


                                                                                

Day 2017_06_11: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_11.csv


                                                                                

Day 2018_04_15: 223 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_15.csv


                                                                                

Day 2018_03_17: 180 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_17.csv


                                                                                

Day 2018_05_16: 357 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_16.csv


                                                                                

Day 2018_07_21: 187 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_21.csv


                                                                                

Day 2017_02_22: 63 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_22.csv


                                                                                

Day 2018_03_09: 204 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_09.csv


                                                                                

Day 2018_01_29: 246 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_01_29.csv


                                                                                

Day 2017_05_31: 129 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_31.csv
Day 2017_03_23: 114 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_23.csv


                                                                                

Day 2018_04_28: 169 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_28.csv


                                                                                

Day 2017_05_16: 153 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_16.csv


                                                                                

Day 2018_03_23: 221 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_23.csv


                                                                                

Day 2018_09_06: 3 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_06.csv


                                                                                

Day 2018_02_23: 235 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_02_23.csv


                                                                                

Day 2018_07_25: 268 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_25.csv


                                                                                

Day 2018_08_26: 73 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_26.csv


                                                                                

Day 2018_07_05: 195 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_05.csv


                                                                                

Day 2017_06_20: 94 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_20.csv


                                                                                

Day 2017_03_05: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_05.csv


                                                                                

Day 2017_09_24: 118 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_24.csv


                                                                                

Day 2017_01_24: 40 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_24.csv


                                                                                

Day 2017_09_16: 129 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_16.csv


                                                                                

Day 2018_07_17: 221 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_17.csv


                                                                                

Day 2017_04_04: 96 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_04.csv


                                                                                

Day 2017_08_13: 96 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_13.csv


                                                                                

Day 2017_06_27: 141 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_27.csv


                                                                                

Day 2017_12_10: 190 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_10.csv


                                                                                

Day 2017_08_22: 138 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_22.csv


                                                                                

Day 2017_05_01: 117 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_01.csv
Day 2017_09_13: 207 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_13.csv


                                                                                

Day 2018_05_01: 256 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_01.csv


                                                                                

Day 2017_01_27: 62 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_27.csv
Day 2017_01_13: 12 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_13.csv


                                                                                

Day 2017_11_22: 201 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_22.csv


                                                                                

Day 2018_05_10: 279 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_10.csv
Day 2017_10_15: 121 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_15.csv


                                                                                

Day 2017_08_05: 115 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_05.csv


                                                                                

Day 2017_05_22: 148 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_22.csv


                                                                                

Day 2017_04_23: 88 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_23.csv


                                                                                

Day 2017_06_13: 126 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_13.csv


                                                                                

Day 2017_03_30: 74 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_30.csv


                                                                                

Day 2017_04_21: 68 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_21.csv
Day 2017_01_08: 6 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_08.csv


                                                                                

Day 2017_06_23: 107 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_23.csv
Day 2017_06_07: 129 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_06_07.csv


                                                                                

Day 2017_10_03: 198 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_03.csv


                                                                                

Day 2016_10_06: 51 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_06.csv


                                                                                

Day 2018_05_31: 133 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_05_31.csv


                                                                                

Day 2018_06_22: 179 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_22.csv


                                                                                

Day 2018_04_05: 266 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_05.csv


                                                                                

Day 2017_12_21: 143 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_21.csv


                                                                                

Day 2017_02_27: 43 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_27.csv


                                                                                

Day 2017_04_16: 49 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_16.csv


                                                                                

Day 2018_07_24: 271 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_07_24.csv


                                                                                

Day 2017_12_23: 109 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_23.csv


                                                                                

Day 2017_11_02: 124 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_11_02.csv


                                                                                

Day 2017_08_16: 174 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_16.csv


                                                                                

Day 2017_03_26: 64 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_03_26.csv


                                                                                

Day 2017_02_01: 73 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_01.csv


                                                                                

Day 2017_05_11: 126 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_11.csv


                                                                                

Day 2017_02_03: 60 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_02_03.csv


                                                                                

Day 2017_12_22: 111 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_22.csv


                                                                                

Day 2017_05_26: 80 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_05_26.csv


                                                                                

Day 2017_08_08: 146 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_08_08.csv


                                                                                

Day 2017_04_10: 87 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_04_10.csv


                                                                                

Day 2017_09_26: 180 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_09_26.csv


                                                                                

Day 2018_03_02: 266 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_03_02.csv


                                                                                

Day 2018_09_25: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_25.csv


                                                                                

Day 2017_01_10: 6 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_10.csv


                                                                                

Day 2017_01_09: 5 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_09.csv


                                                                                

Day 2017_01_06: 4 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_01_06.csv


                                                                                

Day 2016_09_15: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_09_15.csv
Day 2018_09_03: 4 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_03.csv


                                                                                

Day 2018_09_26: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_26.csv


                                                                                

Day 2018_09_10: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_10.csv


                                                                                

Day 2016_12_23: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_12_23.csv


                                                                                

Day 2018_10_03: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_10_03.csv


                                                                                

Day 2018_10_17: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_10_17.csv
Day 2016_10_22: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_22.csv


                                                                                

Day 2018_10_16: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_10_16.csv


                                                                                

Day 2018_09_29: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_29.csv


                                                                                

Day 2018_10_01: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_10_01.csv


                                                                                

Day 2018_09_12: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_09_12.csv


                                                                                

Day 2016_10_02: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2016_10_02.csv


                                                                                

Day 2018_08_31: 1 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_08_31.csv


                                                                                

Day 2017_10_23: 161 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_23.csv


                                                                                

Day 2018_04_23: 285 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_23.csv


                                                                                

Day 2017_12_09: 155 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_09.csv


                                                                                

Day 2018_04_20: 197 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_20.csv


                                                                                

Day 2018_04_24: 271 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_24.csv


                                                                                

Day 2018_04_22: 200 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_22.csv


                                                                                

Day 2017_12_19: 176 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_19.csv


                                                                                

Day 2017_10_30: 149 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_30.csv


                                                                                

Day 2017_12_15: 192 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_12_15.csv


                                                                                

Day 2017_10_21: 113 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_21.csv


                                                                                

Day 2018_04_11: 274 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_04_11.csv


                                                                                

Day 2017_10_22: 158 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2017_10_22.csv
Day 2018_06_27: 215 rows
----------> Saved to: datamart/bronze/orders/bronze_olist_orders_2018_06_27.csv


DataFrame[order_id: string, customer_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, snapshot_date: string]

In [6]:
bronze_processing.process_olist_order_reviews_bronze(bronze_root, spark)

loaded data/olist_order_reviews_dataset.csv  →  104,162 rows
----> saved bronze: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet


DataFrame[review_id: string, order_id: string, review_score: string, review_comment_title: string, review_comment_message: string, review_creation_date: string, review_answer_timestamp: string]

In [7]:
# Inspect some outputs

print("\n================== Sample customers table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/customers/bronze_olist_customers.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample sellers table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/sellers/bronze_olist_sellers.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample geolocation table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/geolocation/bronze_olist_geolocation.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample products table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/products/bronze_olist_products.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample category translation table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/category_translation/bronze_product_category_translation.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample orders items table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/order_items/bronze_olist_order_items.parquet")
df_bronze.show(5)

print('\n')
print("\n================== Sample orders table ==================\n")
df_bronze = spark.read.csv("datamart/bronze/orders/bronze_olist_orders_2018_01_01.csv", header=True, inferSchema=True)
df_bronze.show(5)

print("\n================== Sample order reviews table ==================\n")
df_bronze = spark.read.parquet("datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet")
df_bronze.show(5)





+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|503840d4f2a1a7609...|ffc4233210eac4ec1...|                   14811|          araraquara|            SP|
|52e73a5d0a1d4c56b...|b43530186123fb6d9...|                   62625|               missi|            CE|
|16cb62869f9719571...|c3cc321141423ab8a...|                   55560|           barreiros|            PE|
|4979ba0e6037e4b28...|80768413a59684f1e...|                   29307|cachoeiro de itap...|            ES|
|11ec4bc0610184925...|bd836cf4fce7f808b...|                   22420|      rio de janeiro|            RJ|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows




+--------------------+---

## Build Silver Table

In [8]:
# Create silver root directory
silver_root = "datamart/silver"
os.makedirs(silver_root, exist_ok=True)
print(f"Silver root directory: {silver_root}")

Silver root directory: datamart/silver


In [9]:
# Create all required output directories

# Create silver directory to save customer data
silver_cust_directory = "datamart/silver/customers/"
if not os.path.exists(silver_cust_directory):
    os.makedirs(silver_cust_directory)

# Create silver directory to save seller data
silver_sell_directory = "datamart/silver/sellers/"
if not os.path.exists(silver_sell_directory):
    os.makedirs(silver_sell_directory)

# Create silver directory to save geolocation data
silver_geo_directory = "datamart/silver/geolocation/"
if not os.path.exists(silver_geo_directory):
    os.makedirs(silver_geo_directory)

# Create silver directory to save products data
silver_prod_directory = "datamart/silver/products/"
if not os.path.exists(silver_prod_directory):
    os.makedirs(silver_prod_directory)

# Create silver directory to save product_categories_translation data
silver_prod_cat_trans_directory = "datamart/silver/category_translation/"
if not os.path.exists(silver_prod_cat_trans_directory):
    os.makedirs(silver_prod_cat_trans_directory)

# Create silver directory to save orders data
silver_orders_directory = "datamart/silver/orders/"
if not os.path.exists(silver_orders_directory):
    os.makedirs(silver_orders_directory)

# Create silver directory to save order_items data
silver_order_items_directory = "datamart/silver/order_items/"
if not os.path.exists(silver_order_items_directory):
    os.makedirs(silver_order_items_directory)

### DERIVED SILVER TABLES ###

# Create silver directory to save shipping_infos data
silver_shipping_infos_directory = "datamart/silver/shipping_infos/"
if not os.path.exists(silver_shipping_infos_directory):
    os.makedirs(silver_shipping_infos_directory)

In [None]:
# Process all bronze tables into silver
print("\nProcessing bronze tables...")
silver_processing.process_silver_olist_customers("datamart/bronze/customers/",silver_cust_directory, spark)
silver_processing.process_silver_olist_sellers("datamart/bronze/sellers/",silver_sell_directory, spark)
silver_processing.process_silver_olist_geolocation("datamart/bronze/geolocation/",silver_geo_directory, spark)
# silver_processing.process_silver_olist_products("datamart/bronze/products/",silver_prod_directory, spark)
# silver_processing.process_silver_olist_order_items("datamart/bronze/order_items/",silver_order_items_directory, spark)
# silver_processing.process_silver_olist_product_categories_translation("datamart/bronze/category_translation",silver_prod_cat_trans_directory, spark)
# silver_processing.process_silver_olist_orders("datamart/bronze/orders/",silver_orders_directory, spark)

### Build Customer Table

In [10]:
# Create silver directory to save customer data
silver_cust_directory = "datamart/silver/customers/"
if not os.path.exists(silver_cust_directory):
    os.makedirs(silver_cust_directory)

In [11]:
def process_silver_olist_customers(bronze_directory, silver_directory, spark):
    
    # connect to bronze table
    partition_name = "bronze_olist_customers.parquet"
    filepath = bronze_directory + partition_name
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "customer_id": StringType(),
        "customer_unique_id": StringType(),
        "customer_zip_code_prefix": StringType(),
        "customer_city": StringType(),
        "customer_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Check customer_id duplicates (total rows - distinct ids)
    total_rows = df.count()
    distinct_rows = df.select("customer_id").distinct().count()
    duplicates_customer_id = total_rows - distinct_rows
    print(f"Number of duplicated 'customer_id': {duplicates_customer_id}")

    # Add missing leading zero
    df = df.withColumn(
        "customer_zip_code_prefix",
        F.lpad(col("customer_zip_code_prefix"), 5, "0")
    )
    
    # save silver table - IRL connect to database to write
    partition_name = "silver_olist_customers.parquet"
    filepath = silver_directory + partition_name
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df

In [12]:
# Run function manually to test
df = process_silver_olist_customers("datamart/bronze/customers/",silver_cust_directory, spark)

loaded from: datamart/bronze/customers/bronze_olist_customers.parquet row count: 99441
Number of duplicated 'customer_id': 0
saved to: datamart/silver/customers/silver_olist_customers.parquet


In [13]:
# Check schema enforced
df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [14]:
# Check missing leading zero padded
df.groupBy(F.length("customer_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5|99441|
+------+-----+



### Build Seller Table

In [15]:
# Create silver directory to save seller data
silver_sell_directory = "datamart/silver/sellers/"
if not os.path.exists(silver_sell_directory):
    os.makedirs(silver_sell_directory)

In [16]:
def process_silver_olist_sellers(bronze_directory, silver_directory, spark):
    
    # connect to bronze table
    partition_name = "bronze_olist_sellers.parquet"
    filepath = bronze_directory + partition_name
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "seller_id": StringType(),
        "seller_zip_code_prefix": StringType(),
        "seller_city": StringType(),
        "seller_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Check seller_id duplicates (total rows - distinct ids)
    total_rows = df.count()
    distinct_rows = df.select("seller_id").distinct().count()
    duplicates_seller_id = total_rows - distinct_rows
    print(f"Number of duplicated 'seller_id': {duplicates_seller_id}")

    # Add missing leading zero
    df = df.withColumn(
        "seller_zip_code_prefix",
        F.lpad(col("seller_zip_code_prefix"), 5, "0")
    )
    
    # save silver table - IRL connect to database to write
    partition_name = "silver_olist_sellers.parquet"
    filepath = silver_directory + partition_name
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df

In [17]:
# Run function manually to test
df = process_silver_olist_sellers("datamart/bronze/sellers/",silver_sell_directory, spark)

loaded from: datamart/bronze/sellers/bronze_olist_sellers.parquet row count: 3095
Number of duplicated 'seller_id': 0
saved to: datamart/silver/sellers/silver_olist_sellers.parquet


In [18]:
# Check schema enforced
df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



In [19]:
# Check missing leading zero padded
df.groupBy(F.length("seller_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5| 3095|
+------+-----+



### Build Geolocation Table

In [20]:
# Create silver directory to save geolocation data
silver_geo_directory = "datamart/silver/geolocation/"
if not os.path.exists(silver_geo_directory):
    os.makedirs(silver_geo_directory)

In [21]:
def process_silver_olist_geolocation(bronze_directory, silver_directory, spark):
    
    # connect to bronze table
    partition_name = "bronze_olist_geolocation.parquet"
    filepath = bronze_directory + partition_name
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "geolocation_zip_code_prefix": StringType(),
        "geolocation_lat": FloatType(),
        "geolocation_lng": FloatType(),
        "geolocation_city": StringType(),
        "geolocation_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Add missing leading zero
    df = df.withColumn(
        "geolocation_zip_code_prefix",
        F.lpad(col("geolocation_zip_code_prefix"), 5, "0")
    )

    # Deduplicate zipcodes by just taking the centroid (mean of lat,lng)
    df_dedupe = df.groupBy("geolocation_zip_code_prefix").agg(
        F.avg("geolocation_lat").alias("geolocation_lat"),
        F.avg("geolocation_lng").alias("geolocation_lng")
    )
    
    # save silver table - IRL connect to database to write
    partition_name = "silver_olist_geolocation.parquet"
    filepath = silver_directory + partition_name
    df_dedupe.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df_dedupe

In [22]:
# Run function manually to test
df = process_silver_olist_geolocation("datamart/bronze/geolocation/",silver_geo_directory, spark)

loaded from: datamart/bronze/geolocation/bronze_olist_geolocation.parquet row count: 1000325


                                                                                

saved to: datamart/silver/geolocation/silver_olist_geolocation.parquet


In [23]:
# Check schema enforced
df.printSchema()

root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)



In [24]:
# Check missing leading zero padded
df.groupBy(F.length("geolocation_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5|19177|
+------+-----+



In [25]:
# Check every geolocation_zip_code_prefix only has 1 count. Group by prefix and count occurrences
df.groupBy("geolocation_zip_code_prefix") \
    .agg(F.count("*").alias("count")) \
    .filter("count > 1") \
    .show()

+---------------------------+-----+
|geolocation_zip_code_prefix|count|
+---------------------------+-----+
+---------------------------+-----+



### Build Products Table

In [26]:
# Create silver directory to save products data
silver_prod_directory = "datamart/silver/products/"
if not os.path.exists(silver_prod_directory):
    os.makedirs(silver_prod_directory)

In [27]:
def process_silver_olist_products(bronze_directory, silver_directory, spark):
    
    # connect to bronze table
    partition_name = "bronze_olist_products.parquet"
    filepath = bronze_directory + partition_name
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # Rename columns due to spelling mistakes 
    df = df.withColumnRenamed("product_name_lenght", "product_name_length") \
           .withColumnRenamed("product_description_lenght", "product_description_length")

    
    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "product_id": StringType(),
        "product_category_name": StringType(),
        "product_name_length": DoubleType(),
        "product_description_length": DoubleType(),
        "product_photos_qty": DoubleType(),
        "product_weight_g": DoubleType(),
        "product_length_cm": DoubleType(),
        "product_height_cm": DoubleType(),
        "product_width_cm": DoubleType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Inputting missing values as NaN
    df = df.fillna({"product_category_name": "NaN"})
    df = df.fillna({"product_name_length": float('nan')}) 
    df = df.fillna({"product_description_length": float('nan')}) 
    df = df.fillna({"product_photos_qty": float('nan')}) 
    
    # Check product_id duplicates (total rows - distinct ids)
    total_rows = df.count()
    distinct_rows = df.select("product_id").distinct().count()
    duplicates_product_id = total_rows - distinct_rows
    print(f"Number of duplicated 'product_id': {duplicates_product_id}")
    
    # Merge Product Category translation Table with Products table
    # Load the bronze table  
    df_cat_trans = spark.read.parquet("datamart/bronze/category_translation/bronze_product_category_translation.parquet")
    
    df = df.join(df_cat_trans, on='product_category_name', how='left')

    # Rename original cat name column to cat name portuguese
    df = df.withColumnRenamed("product_category_name", "product_category_name_portuguese")

    # Reorder columns for easy visualization
    
    desired_order = [
    "product_id",
    "product_category_name_portuguese",
    "product_category_name_english",
    "main_category",
    "sub_category",
    "product_name_length",
    "product_description_length",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm"
    ]

    df = df.select(desired_order)


    # save silver table - IRL connect to database to write
    partition_name = "silver_olist_products.parquet"
    filepath = silver_directory + partition_name
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df

In [28]:
# Run function manually to test
# I inputted the bronze_directory manually (amend after our path discrepancies are resolved)
df = process_silver_olist_products("datamart/bronze/products/",silver_prod_directory, spark)

loaded from: datamart/bronze/products/bronze_olist_products.parquet row count: 32951
Number of duplicated 'product_id': 0


                                                                                

saved to: datamart/silver/products/silver_olist_products.parquet


In [29]:
# Check schema enforced
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name_portuguese: string (nullable = false)
 |-- product_category_name_english: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- sub_category: string (nullable = true)
 |-- product_name_length: double (nullable = false)
 |-- product_description_length: double (nullable = false)
 |-- product_photos_qty: double (nullable = false)
 |-- product_weight_g: double (nullable = true)
 |-- product_length_cm: double (nullable = true)
 |-- product_height_cm: double (nullable = true)
 |-- product_width_cm: double (nullable = true)



In [30]:
# Inspect some output
df = spark.read.parquet("datamart/silver/products/silver_olist_products.parquet")
df.show(5)


+--------------------+--------------------------------+-----------------------------+--------------+------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name_portuguese|product_category_name_english| main_category|sub_category|product_name_length|product_description_length|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+--------------------------------+-----------------------------+--------------+------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff454...|                      perfumaria|                    perfumery|       perfume|          NA|               40.0|                     287.0|               1.0|           225.0|             16.0|             10.0|            14.0|


### Build Order_Items Table

In [31]:
# Create silver directory to save order_items data
silver_order_items_directory = "datamart/silver/order_items/"
if not os.path.exists(silver_order_items_directory):
    os.makedirs(silver_order_items_directory)

In [32]:
def process_silver_olist_order_items(bronze_directory, silver_directory, spark):
    
    # connect to bronze table
    partition_name = "bronze_olist_order_items.parquet"
    filepath = bronze_directory + partition_name
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    
    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "order_id": StringType(),
        "order_item_id": LongType(),
        "product_id": StringType(),
        "seller_id": StringType(),
        "shipping_limit_date": TimestampType(),
        "price": DoubleType(),
        "freight_value": DoubleType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    
    # Checking for invalid seller IDs
    # Load df_sellers from SILVER <<<<<<----------------------------------<<<<<<<<<<<<<
    df_sellers = spark.read.parquet("datamart/silver/sellers/silver_olist_sellers.parquet")  

    # Get distinct valid seller IDs
    valid_seller_ids_df = df_sellers.select("seller_id").distinct()
    
    # Perform a left anti join to find sellers with invalid seller_id
    invalid_orders = df.join(valid_seller_ids_df, on="seller_id", how="left_anti")
    
    # Count how many invalid seller IDs there are
    invalid_seller_count = invalid_orders.count()

    # Conditionally drop invalid orders
    if invalid_seller_count > 0:
        initial_count = df.count()
        print("Dropping orders with invalid seller_id...")
        df = df.join(valid_seller_ids_df, on="seller_id", how="inner")
        final_count = df.count()
        dropped_count = initial_count - final_count
        print(f"Dropped {dropped_count} rows")
        
    else:
        print("All seller ids are valid — no need to drop!!")

    
    # save silver table - IRL connect to database to write
    partition_name = "silver_olist_order_items.parquet"
    filepath = silver_directory + partition_name
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df

In [33]:
# Run function manually to test
df = process_silver_olist_order_items("datamart/bronze/order_items/",silver_order_items_directory, spark)

loaded from: datamart/bronze/order_items/bronze_olist_order_items.parquet row count: 112650
All seller ids are valid — no need to drop!!


                                                                                

saved to: datamart/silver/order_items/silver_olist_order_items.parquet


In [34]:
# Check schema enforced
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: long (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [35]:
# Inspect some output
df = spark.read.parquet("datamart/silver/order_items/silver_olist_order_items.parquet")
df.show(5)

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|89bed55dd88d035e3...|            1|ae6b739ab6e9d7991...|53e4c6e0f4312d4d2...|2018-01-11 17:12:22|  42.0|         17.6|
|89c037e2b749a2ed5...|            1|e906fa76a27488f80...|1835b56ce799e6a4d...|2017-11-30 04:15:33| 53.99|        12.72|
|89c037e2b749a2ed5...|            2|e906fa76a27488f80...|1835b56ce799e6a4d...|2017-11-30 04:15:33| 53.99|        12.72|
|89c04d22504649482...|            1|3d73c88390adac7dd...|c8b0e2b0a7095e5d8...|2018-03-14 00:20:26|199.99|        25.05|
|89c0bf5292a493fb2...|            1|2d40d83fc97b8d4d4...|cca3071e3e9bb7d12...|2017-08-24 03:26:15|  29.9|        11.85|
+--------------------+-------------+----

### Build Orders Table

In [36]:
# Create silver directory to save orders data
silver_orders_directory = "datamart/silver/orders/"
if not os.path.exists(silver_orders_directory):
    os.makedirs(silver_orders_directory)

In [37]:
def process_silver_olist_orders(bronze_directory, silver_directory, spark, partition_name):
    filepath = os.path.join(bronze_directory, partition_name)
    df = spark.read.option("header", True).option("inferSchema", True).csv(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # Clean data: enforce schema / data type
    
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "order_id": StringType(),
        "customer_id": StringType(),
        "order_status": StringType(),
        "order_purchase_timestamp": TimestampType(),
        "order_approved_at": TimestampType(),
        "order_delivered_carrier_date": TimestampType(),
        "order_delivered_customer_date": TimestampType(),
        "order_estimated_delivery_date": TimestampType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Removing Invalid order ids
    # Load the SILVER table  
    df_order_items = spark.read.parquet("datamart/silver/order_items/silver_olist_order_items.parquet") 
    
    # Get distinct order IDs that exist in order items
    valid_order_ids_df = df_order_items.select("order_id").distinct()
    
    
    # Keep only orders that exist in df_order_items
    df_orders_clean = df.join(valid_order_ids_df, on="order_id", how="inner")
    
    # Count how many were dropped
    dropped_orders = df.count() - df_orders_clean.count()
    print(f"Dropped {dropped_orders} orders with no items.")

    df = df_orders_clean


    # Checking for invalid customer IDs
    # Load df_customers from SILVER   
    df_customers = spark.read.parquet("datamart/silver/customers/silver_olist_customers.parquet")  

    # Get distinct valid customer IDs
    valid_customer_ids_df = df_customers.select("customer_id").distinct()
    
    # Perform a left anti join to find orders with invalid customer_id
    invalid_orders = df.join(valid_customer_ids_df, on="customer_id", how="left_anti")
    
    # Count how many invalid customer IDs there are
    invalid_customer_count = invalid_orders.count()

    # Conditionally drop invalid orders
    if invalid_customer_count > 0:
        initial_count = df.count()
        print("Dropping orders with invalid customer_id...")
        df = df.join(valid_customer_ids_df, on="customer_id", how="inner")
        final_count = df.count()
        dropped_count = initial_count - final_count
        print(f"Dropped {dropped_count} rows")
        
    else:
        print("All customer ids are valid — no need to drop!!")


    # Enforcing enum for order statuses
    # Define valid statuses 
    valid_statuses = {
        "created",
        "approved",
        "processing",
        "invoiced",
        "shipped",
        "delivered",
        "canceled",
        "unavailable"
    }
    
    # Clean and standardize the `order_status` column
    df = df.withColumn("order_status", trim(lower(col("order_status"))))
    
    # dentify invalid statuses (those NOT in the valid_statuses set)
    invalid_statuses_df = df.filter(~col("order_status").isin(list(valid_statuses)))
    
    # Print the unique invalid statuses
    invalid_statuses_list = invalid_statuses_df.select("order_status").distinct().rdd.flatMap(lambda x: x).collect()

    if invalid_statuses_list:
        print(f"Invalid statuses found: {invalid_statuses_list}")
    else:
        print("No invalid status found!!")


    # Adding snapshot date column
    snapshot_str = partition_name.replace("bronze_olist_orders_", "").replace(".csv", "")
    # df = df.withColumn("snapshot_date", to_date(lit(snapshot_str), "dd_MM_yyyy"))
    df = df.withColumn("snapshot_date", to_date(lit(snapshot_str), "yyyy_MM_dd"))

    

    # save 
    parquet_name = partition_name.replace("bronze", "silver").replace(".csv", ".parquet")
    output_path = os.path.join(silver_directory, parquet_name)
    df.write.mode("overwrite").parquet(output_path)
    print("-----> saved to:", output_path)

    return df

In [38]:
# Run function manually to test

# Set base directory
bronze_orders_directory = "datamart/bronze/orders/"
silver_orders_directory = "datamart/silver/orders/"

# List all CSV files in the bronze orders folder
csv_files = [f for f in os.listdir(bronze_orders_directory) if f.endswith(".csv")]

# Sort the files according to date
csv_files.sort()

# Loop through each file 
for partition_name in csv_files:
    print(f"\n======== Processing {partition_name} ......... \n")
    df = process_silver_olist_orders(bronze_orders_directory, silver_orders_directory, spark, partition_name)
    
    # Check schema enforced
    df.printSchema()
    # Inspect some output
    df.show(5)




loaded from: datamart/bronze/orders/bronze_olist_orders_2016_09_04.csv row count: 1
Dropped 0 orders with no items.
All customer ids are valid — no need to drop!!


                                                                                

No invalid status found!!
-----> saved to: datamart/silver/orders/silver_olist_orders_2016_09_04.parquet
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- snapshot_date: date (nullable = false)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|snapshot_date|
+-----

## Build Derived Silver Tables

### Order logistics

In [3]:

def build_order_features(spark, order_file_path):
    # Read inputs
    df_order_items = spark.read.parquet("datamart/silver/order_items/silver_olist_order_items.parquet")
    df_products = spark.read.parquet("datamart/silver/products/silver_olist_products.parquet")
    # df_categories = spark.read.parquet("datamart/bronze/category_translation/bronze_product_category_translation.parquet")
    df_orders = spark.read.parquet(order_file_path)

    
    order_metrics = df_order_items.groupBy("order_id").agg(
        F.max("order_item_id").alias("total_qty"),
        F.sum("price").alias("total_price"),
        F.sum("freight_value").alias("total_freight_value")
    )

    
    df_items_with_products = df_order_items.select("order_id", "product_id") \
        .join(
            df_products.select(
                "product_id", "product_weight_g",
                "product_length_cm", "product_height_cm", "product_width_cm"
            ),
            on="product_id", how="left"
        )

    
    df_items_with_products = df_items_with_products.withColumn(
        "product_volume_cm3",
        col("product_length_cm") * col("product_height_cm") * col("product_width_cm")
    )

    
    product_metrics = df_items_with_products.groupBy("order_id").agg(
        F.sum("product_weight_g").alias("total_weight_g"),
        F.sum("product_volume_cm3").alias("total_volume_cm3")
    )

    
    final_df = df_orders.select("order_id", "order_purchase_timestamp") \
        .join(order_metrics, on="order_id", how="inner") \
        .join(product_metrics, on="order_id", how="left") \
        .withColumn(
            "total_density",
            when(col("total_volume_cm3") != 0,
                 col("total_weight_g") / col("total_volume_cm3")
            ).otherwise(None)
        )
    
    
    df_items_with_cats = df_order_items.select("order_id", "product_id") \
        .join(df_products.select("product_id", "product_category_name_english", "main_category", "sub_category"), on="product_id", how="left")
    
    # print('checkpoint 1')
    main_cat_counts = df_items_with_cats.groupBy("order_id", "main_category") \
        .agg(count("*").alias("main_cat_count"))
    # print('checkpoint 2')
    main_cat_window = Window.partitionBy("order_id").orderBy(col("main_cat_count").desc())
    
    most_common_main = main_cat_counts.withColumn(
        "rank", row_number().over(main_cat_window)
    ).filter(col("rank") == 1).drop("rank", "main_cat_count")
    
    sub_cat_counts = df_items_with_cats.groupBy("order_id", "sub_category") \
        .agg(count("*").alias("sub_cat_count"))
    sub_cat_window = Window.partitionBy("order_id").orderBy(col("sub_cat_count").desc())
    most_common_sub = sub_cat_counts.withColumn(
        "rank", row_number().over(sub_cat_window)
    ).filter(col("rank") == 1).drop("rank", "sub_cat_count")

    order_categories = most_common_main.join(most_common_sub, on="order_id", how="outer")
    final_df_with_cats = final_df.join(order_categories, on="order_id", how="left")

    # Adding snapshot date column
    
    partition_name = str(order_file_path)
    
    snapshot_str = partition_name.replace("datamart/silver/orders/silver_olist_orders_", "").replace(".parquet", "")
    
    # final_df_with_cats = final_df_with_cats.withColumn("snapshot_date", to_date(lit(snapshot_str), "dd_MM_yyyy"))
    final_df_with_cats = final_df_with_cats.withColumn("snapshot_date", to_date(lit(snapshot_str), "yyyy_MM_dd"))

    

    return final_df_with_cats


In [4]:
# Keep track of failures
failed_files = []
processed_files = []

order_files = sorted(glob.glob("datamart/silver/orders/silver_olist_orders_*.parquet"))

# Create output directory if it doesn't exist
os.makedirs("datamart/silver/order_logistics", exist_ok=True)

# Loop over files 
for idx, file_path in enumerate(order_files, 1):
    
    basename = os.path.basename(file_path)  
    year_month = basename.replace("silver_olist_orders_", "").replace(".parquet", "")

    output_path = f"datamart/silver/order_logistics/silver_olist_order_logistics_{year_month}.parquet"
    
    print(f"\n[{idx}/{len(order_files)}]  🔄 Processing {year_month} ({basename})...")

    # Skip if already exists
    if os.path.exists(output_path):
        print(f" ⏭️ Skipping {year_month} (already exists)")
        continue
    
    try:
        start_time = time.time()

        # Run feature engineering
        
        final_df = build_order_features(spark, file_path)
        
        # Save to parquet
        final_df.write.mode("overwrite").parquet(output_path)

        # Verify row count
        row_count = final_df.count()
        duration = round(time.time() - start_time, 2)

        
        

        print(f"---> ✅ Saved: {output_path} → {row_count} rows in {duration}s")

        # Check schema enforced
        # final_df.printSchema()

        # View sample
        # final_df.show(5)
        
        processed_files.append((year_month, row_count, duration))

    except Exception as e:
        print(f" Failed on {year_month}: {e}")
        failed_files.append((year_month, str(e)))

# Summary
print("\n===== Processing Summary =====")
print(f" Successfully processed: {len(processed_files)} files")
for ym, r_count, duration in processed_files:
    print(f"  - {ym}: {r_count} rows in {duration}s")

if failed_files:
    print(f"\n Failed files: {len(failed_files)}")
    for ym, err in failed_files:
        print(f"  - {ym}: {err}")
else:
    print("\n All files processed successfully")



[1/634]  🔄 Processing 2016_09_04 (silver_olist_orders_2016_09_04.parquet)...
 ⏭️ Skipping 2016_09_04 (already exists)

[2/634]  🔄 Processing 2016_09_05 (silver_olist_orders_2016_09_05.parquet)...
 ⏭️ Skipping 2016_09_05 (already exists)

[3/634]  🔄 Processing 2016_09_13 (silver_olist_orders_2016_09_13.parquet)...
 ⏭️ Skipping 2016_09_13 (already exists)

[4/634]  🔄 Processing 2016_09_15 (silver_olist_orders_2016_09_15.parquet)...
 ⏭️ Skipping 2016_09_15 (already exists)

[5/634]  🔄 Processing 2016_10_02 (silver_olist_orders_2016_10_02.parquet)...
 ⏭️ Skipping 2016_10_02 (already exists)

[6/634]  🔄 Processing 2016_10_03 (silver_olist_orders_2016_10_03.parquet)...
 ⏭️ Skipping 2016_10_03 (already exists)

[7/634]  🔄 Processing 2016_10_04 (silver_olist_orders_2016_10_04.parquet)...
 ⏭️ Skipping 2016_10_04 (already exists)

[8/634]  🔄 Processing 2016_10_05 (silver_olist_orders_2016_10_05.parquet)...
 ⏭️ Skipping 2016_10_05 (already exists)

[9/634]  🔄 Processing 2016_10_06 (silver_olist_

In [6]:
# Inspect some output
df_orders_logistics = spark.read.parquet("datamart/silver/order_logistics/silver_olist_order_logistics_2018_01_01.parquet")
df_orders_logistics.show(5)

+--------------------+------------------------+---------+-----------+-------------------+--------------+----------------+-------------------+-------------+------------+-------------+
|            order_id|order_purchase_timestamp|total_qty|total_price|total_freight_value|total_weight_g|total_volume_cm3|      total_density|main_category|sub_category|snapshot_date|
+--------------------+------------------------+---------+-----------+-------------------+--------------+----------------+-------------------+-------------+------------+-------------+
|ac9389991e1965425...|     2018-01-01 21:11:00|        1|      69.99|              15.24|         200.0|           880.0|0.22727272727272727|         NULL|        NULL|   2018-01-01|
|af2ded0024fec0cc8...|     2018-01-01 18:39:20|        1|       79.9|              15.31|         500.0|          1638.0| 0.3052503052503053|    telephony|          NA|   2018-01-01|
|45e4d1d16d0174094...|     2018-01-01 11:55:49|        1|      145.0|               8

### Shipping_infos

In [7]:
# Create silver directory to save shipping_infos data
silver_shipping_infos_directory = "datamart/silver/shipping_infos/"
if not os.path.exists(silver_shipping_infos_directory):
    os.makedirs(silver_shipping_infos_directory)

In [8]:
def process_silver_shipping_infos(spark, order_file_path):
    # Read all required data
    df_orders = spark.read.parquet(order_file_path)
    df_sellers = spark.read.parquet("datamart/silver/sellers/silver_olist_sellers.parquet")
    df_customers = spark.read.parquet("datamart/silver/customers/silver_olist_customers.parquet")
    df_order_items = spark.read.parquet("datamart/silver/order_items/silver_olist_order_items.parquet")
    df_geo = spark.read.parquet("datamart/silver/geolocation/silver_olist_geolocation.parquet")
    
    # Get relevant fields in df_orders
    df_orders = df_orders.select("order_id", "order_purchase_timestamp", "customer_id", "snapshot_date")
    
    # df_orders left join df_customers to get customer address
    orders_customers = df_orders.join(df_customers, on="customer_id", how="inner")
    
    # For simplicity, pick a seller per order_id (if order has multiple items from different sellers, any 1 will do)
    order_items_dedupe = df_order_items.select("order_id", "seller_id").dropDuplicates(["order_id"])
    
    # orders_customers left join order_items_dedupe to get seller_id of the order
    orders_customers_sellers = orders_customers.join(order_items_dedupe, on="order_id", how="left")
    
    # orders_customers_sellers left join df_sellers to get seller address
    orders_customers_sellers = orders_customers_sellers.join(df_sellers, on="seller_id", how="left")
    
    # Create separate geolocation table for customer and seller (rename fields)
    geo_customer = df_geo.withColumnRenamed("geolocation_zip_code_prefix", "customer_zip_code_prefix") \
                         .withColumnRenamed("geolocation_lat", "customer_lat") \
                         .withColumnRenamed("geolocation_lng", "customer_lng")
    
    geo_seller = df_geo.withColumnRenamed("geolocation_zip_code_prefix", "seller_zip_code_prefix") \
                       .withColumnRenamed("geolocation_lat", "seller_lat") \
                       .withColumnRenamed("geolocation_lng", "seller_lng")
    
    # Join customer and seller coordinates
    df_with_customer_geo = orders_customers_sellers.join(F.broadcast(geo_customer), on="customer_zip_code_prefix", how="left")
    df_with_both_geo = df_with_customer_geo.join(F.broadcast(geo_seller), on="seller_zip_code_prefix", how="left")
    
    # Compute Haversine distance manually so no need library
    R = 6371.0  # Earth radius in km
    df_with_distance = df_with_both_geo.withColumn("delivery_distance",
        R * 2 * F.atan2(
            F.sqrt(
                F.sin((F.radians(col("customer_lat") - col("seller_lat")) / 2)) ** 2 +
                F.cos(F.radians(col("customer_lat"))) * 
                F.cos(F.radians(col("seller_lat"))) *
                F.sin((F.radians(col("customer_lng") - col("seller_lng")) / 2)) ** 2
            ),
            F.sqrt(1 - (
                F.sin((F.radians(col("customer_lat") - col("seller_lat")) / 2)) ** 2 +
                F.cos(F.radians(col("customer_lat"))) *
                F.cos(F.radians(col("seller_lat"))) *
                F.sin((F.radians(col("customer_lng") - col("seller_lng")) / 2)) ** 2
            ))
        )
    )
    
    # Compute Boolean variables
    df_final = df_with_distance.withColumn("same_zipcode", (col("customer_zip_code_prefix") == col("seller_zip_code_prefix")).cast("int")) \
                               .withColumn("same_city", (col("customer_city") == col("seller_city")).cast("int")) \
                               .withColumn("same_state", (col("customer_state") == col("seller_state")).cast("int"))
    
    # Choose only required columns
    selected_cols = [
        "order_id", "order_purchase_timestamp",
        "customer_zip_code_prefix", "customer_city", "customer_state",
        "customer_lat", "customer_lng",
        "seller_zip_code_prefix", "seller_city", "seller_state",
        "seller_lat", "seller_lng",
        "delivery_distance", "same_zipcode", "same_city", "same_state",
        "snapshot_date"
    ]
    
    df_final = df_final.select(selected_cols)
    
    # save silver table - IRL connect to database to write
    year_month = os.path.basename(order_file_path).replace("silver_olist_orders_", "").replace(".parquet", "")
    partition_name = "silver_shipping_infos_" + year_month + '.parquet'
    
    filepath = "datamart/silver/shipping_infos/" + partition_name
    df_final.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df_final

In [9]:
# Keep track of failures
failed_files = []
processed_files = []

order_files = sorted(glob.glob("datamart/silver/orders/silver_olist_orders_*.parquet"))

# Loop over files 
for idx, file_path in enumerate(order_files, 1):
    # define params
    basename = os.path.basename(file_path)  
    year_month = basename.replace("silver_olist_orders_", "").replace(".parquet", "")
    output_path = f"datamart/silver/shipping_infos/silver_shipping_infos_{year_month}.parquet"
    print(f"\n[{idx}/{len(order_files)}]  Processing {year_month} ({basename})...")
    
    # Skip if already exists
    if os.path.exists(output_path):
        print(f" Skipping {year_month} (already exists)")
        continue
    
    try:
        start_time = time.time()
        final_df = process_silver_shipping_infos(spark, file_path)
        # Verify row count
        row_count = final_df.count()
        duration = round(time.time() - start_time, 2)
        print(f"{row_count} rows in {duration}s")
        
        processed_files.append((year_month, row_count, duration)) # keep track of processed files

    except Exception as e:
        print(f" Failed on {year_month}: {e}")
        failed_files.append((year_month, str(e)))

# Summary
print("\n===== Processing Summary =====")
print(f" Successfully processed: {len(processed_files)} files")
for ym, r_count, duration in processed_files:
    print(f"  - {ym}: {r_count} rows in {duration}s")

if failed_files:
    print(f"\n Failed files: {len(failed_files)}")
    for ym, err in failed_files:
        print(f"  - {ym}: {err}")
else:
    print("\n All files processed successfully")


[1/634]  Processing 2016_09_04 (silver_olist_orders_2016_09_04.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_infos_2016_09_04.parquet
1 rows in 3.49s

[2/634]  Processing 2016_09_05 (silver_olist_orders_2016_09_05.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_infos_2016_09_05.parquet
1 rows in 2.8s

[3/634]  Processing 2016_09_13 (silver_olist_orders_2016_09_13.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_infos_2016_09_13.parquet
0 rows in 2.2s

[4/634]  Processing 2016_09_15 (silver_olist_orders_2016_09_15.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_infos_2016_09_15.parquet
1 rows in 2.9s

[5/634]  Processing 2016_10_02 (silver_olist_orders_2016_10_02.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_infos_2016_10_02.parquet
1 rows in 2.88s

[6/634]  Processing 2016_10_03 (silver_olist_orders_2016_10_03.parquet)...
saved to: datamart/silver/shipping_infos/silver_shipping_

In [11]:
# Inspect some output
df_shipping_infos = spark.read.parquet("datamart/silver/shipping_infos/silver_shipping_infos_2018_01_01.parquet")
df_shipping_infos.show(5)

+--------------------+------------------------+------------------------+--------------------+--------------+-------------------+-------------------+----------------------+-----------+------------+-------------------+-------------------+------------------+------------+---------+----------+-------------+
|            order_id|order_purchase_timestamp|customer_zip_code_prefix|       customer_city|customer_state|       customer_lat|       customer_lng|seller_zip_code_prefix|seller_city|seller_state|         seller_lat|         seller_lng| delivery_distance|same_zipcode|same_city|same_state|snapshot_date|
+--------------------+------------------------+------------------------+--------------------+--------------+-------------------+-------------------+----------------------+-----------+------------+-------------------+-------------------+------------------+------------+---------+----------+-------------+
|2ac00a6a0a00ee7f8...|     2018-01-01 16:39:50|                   41502|            salv

### Delivery_history

In [12]:
# Create silver directory to save delivery_history data
silver_delivery_history_directory = "datamart/silver/delivery_history/"
if not os.path.exists(silver_delivery_history_directory):
    os.makedirs(silver_delivery_history_directory)

In [13]:
def process_silver_delivery_history(spark, order_file_path):
    # Read all required data
    df_orders = spark.read.parquet(order_file_path)
    
    # Add computed columns. Note, datediff will return null if fone of the timestamps is null (i.e. not available)
    df_delivery = df_orders.select(
        "order_id",
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "snapshot_date"
    ).withColumn(
        "is_weekend", when(F.dayofweek("order_purchase_timestamp").isin([1, 7]), 1).otherwise(0)
    ).withColumn(
        "approval_duration", F.datediff("order_approved_at", "order_purchase_timestamp")
    ).withColumn(
        "processing_duration", F.datediff("order_delivered_carrier_date", "order_approved_at")
    ).withColumn(
        "ship_duration", F.datediff("order_delivered_customer_date", "order_delivered_carrier_date")
    ).withColumn(
        "act_days_to_deliver", F.datediff("order_delivered_customer_date", "order_purchase_timestamp")
    ).withColumn(
        "miss_delivery_sla", col("act_days_to_deliver") > 14
    )
    
    df_delivery = df_delivery.select(
        "order_id",
        "order_purchase_timestamp",
        "is_weekend",
        "approval_duration",
        "processing_duration",
        "ship_duration",
        "act_days_to_deliver",
        "miss_delivery_sla"
    )
    
    # save silver table - IRL connect to database to write
    year_month = os.path.basename(order_file_path).replace("silver_olist_orders_", "").replace(".parquet", "")
    partition_name = "silver_delivery_history_" + year_month + '.parquet'
    
    filepath = "datamart/silver/delivery_history/" + partition_name
    df_delivery.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)
    
    return df_delivery

In [14]:
# Keep track of failures
failed_files = []
processed_files = []

order_files = sorted(glob.glob("datamart/silver/orders/silver_olist_orders_*.parquet"))

# Loop over files 
for idx, file_path in enumerate(order_files, 1):
    # define params
    basename = os.path.basename(file_path)  
    year_month = basename.replace("silver_olist_orders_", "").replace(".parquet", "")
    output_path = f"datamart/silver/delivery_history/silver_delivery_history_{year_month}.parquet"
    print(f"\n[{idx}/{len(order_files)}]  Processing {year_month} ({basename})...")
    
    # Skip if already exists
    if os.path.exists(output_path):
        print(f" Skipping {year_month} (already exists)")
        continue
    
    try:
        start_time = time.time()
        final_df = process_silver_delivery_history(spark, file_path)
        # Verify row count
        row_count = final_df.count()
        duration = round(time.time() - start_time, 2)
        print(f"{row_count} rows in {duration}s")
        
        processed_files.append((year_month, row_count, duration)) # keep track of processed files

    except Exception as e:
        print(f" Failed on {year_month}: {e}")
        failed_files.append((year_month, str(e)))

# Summary
print("\n===== Processing Summary =====")
print(f" Successfully processed: {len(processed_files)} files")
for ym, r_count, duration in processed_files:
    print(f"  - {ym}: {r_count} rows in {duration}s")

if failed_files:
    print(f"\n Failed files: {len(failed_files)}")
    for ym, err in failed_files:
        print(f"  - {ym}: {err}")
else:
    print("\n All files processed successfully")


[1/634]  Processing 2016_09_04 (silver_olist_orders_2016_09_04.parquet)...
saved to: datamart/silver/delivery_history/silver_delivery_history_2016_09_04.parquet
1 rows in 1.05s

[2/634]  Processing 2016_09_05 (silver_olist_orders_2016_09_05.parquet)...
saved to: datamart/silver/delivery_history/silver_delivery_history_2016_09_05.parquet
1 rows in 1.07s

[3/634]  Processing 2016_09_13 (silver_olist_orders_2016_09_13.parquet)...
saved to: datamart/silver/delivery_history/silver_delivery_history_2016_09_13.parquet
0 rows in 1.01s

[4/634]  Processing 2016_09_15 (silver_olist_orders_2016_09_15.parquet)...
saved to: datamart/silver/delivery_history/silver_delivery_history_2016_09_15.parquet
1 rows in 1.0s

[5/634]  Processing 2016_10_02 (silver_olist_orders_2016_10_02.parquet)...
saved to: datamart/silver/delivery_history/silver_delivery_history_2016_10_02.parquet
1 rows in 1.02s

[6/634]  Processing 2016_10_03 (silver_olist_orders_2016_10_03.parquet)...
saved to: datamart/silver/delivery_

### Seller_performance

In [20]:
# Create silver directory to save seller_performance data
silver_seller_performance_directory = "datamart/silver/seller_performance/"
if not os.path.exists(silver_seller_performance_directory):
    os.makedirs(silver_seller_performance_directory)

In [21]:
def process_silver_seller_performance(spark, order_file_path: str):

    # extract the date string (YYYY_MM_DD) from the orders filename
    year_month = (os.path.basename(order_file_path).replace("silver_olist_orders_", "").replace(".parquet", ""))

    # load bronze reviews
    reviews_path  = os.path.join("datamart/bronze",
                                "order_reviews",
                                "bronze_olist_order_reviews.parquet")
    df_reviews = spark.read.parquet(reviews_path)
    print("loaded from:", reviews_path, "row count:", df_reviews.count())
    df_reviews = df_reviews.withColumn("review_score", col("review_score").cast(DoubleType()))

    # load silver order_items
    items_path = os.path.join("datamart/silver",
                              "order_items",
                              "silver_olist_order_items.parquet")
    df_items = spark.read.parquet(items_path)
    print("loaded from:", items_path, "row count:", df_items.count())

    # load silver orders 
    df_orders = spark.read.parquet(order_file_path) \
    .select("order_id", "snapshot_date", "order_purchase_timestamp", "order_delivered_customer_date")
    print("loaded from:", order_file_path, "row count:", df_orders.count())
    
    # load silver delivery_history 
    delivery_path = os.path.join(
        "datamart/silver",
        "delivery_history",
        f"silver_delivery_history_{year_month}.parquet")
    df_delivery = (spark.read.parquet(delivery_path).select("order_id", "miss_delivery_sla"))
    print("loaded from:", delivery_path, "row count:", df_delivery.count())

    # join tables
    df_joined = (df_orders
    .join(df_items,   on="order_id", how="inner")
    .join(df_reviews, on="order_id", how="left")
    .join(df_delivery,on="order_id", how="left")
    .withColumn("processing_time",
        datediff(col("order_delivered_customer_date"),
                 col("order_purchase_timestamp")).cast(DoubleType())))

    # perform aggregation
    df_performance = (df_joined
  .filter(col("snapshot_date").isNotNull())
  .groupBy("snapshot_date","seller_id")
  .agg(
    F.avg("review_score").alias("avg_rating"),
    F.avg(col("miss_delivery_sla").cast(DoubleType())).alias("avg_delay_rate"),
    F.avg("processing_time").alias("avg_processing_time")))
    
    # save silver table - IRL connect to database to write
    partition_name = "silver_seller_performance_" + year_month + ".parquet"
    filepath       = "datamart/silver/seller_performance/" + partition_name
    df_performance.write.mode("overwrite").parquet(filepath)
    print("saved to:", filepath)

    return df_performance

In [22]:
failed_files    = []
processed_files = []

order_files = sorted(glob.glob("datamart/silver/orders/silver_olist_orders_*.parquet"))

# Loop over each file
for idx, order_file in enumerate(order_files, 1):
    year_month  = os.path.basename(order_file) \
                        .replace("silver_olist_orders_", "") \
                        .replace(".parquet", "")
    out_path = f"datamart/silver/seller_performance/silver_seller_performance_{year_month}.parquet"

    print(f"\n[{idx}/{len(order_files)}] Processing {year_month}...")

    # Skip if output already exists
    if os.path.exists(out_path):
        print(f"  Skipping {year_month} (already exists)")
        continue

    try:
        start = time.time()
        df_perf = process_silver_seller_performance(spark, order_file)
        cnt = df_perf.count()
        dur = round(time.time() - start, 2)
        print(f"  → {cnt} sellers in {dur}s")
        processed_files.append((year_month, cnt, dur))

    except Exception as e:
        print(f"Failed on {year_month}: {e}")
        failed_files.append((year_month, str(e)))

# Summary
print("\n===== Processing Summary =====")
print(f" Successfully processed: {len(processed_files)} dates")
for ym, cnt, dur in processed_files:
    print(f"  • {ym}: {cnt} rows in {dur}s")

if failed_files:
    print(f"\n Failed files: {len(failed_files)}")
    for ym, err in failed_files:
        print(f"  • {ym}: {err}")
else:
    print("\n All files processed successfully")


[1/634] Processing 2016_09_04...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_09_04.parquet row count: 1
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_09_04.parquet row count: 1


25/06/22 18:11:10 ERROR Executor: Exception in task 3.0 in stage 24529.0 (TID 43325)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_09_04: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[2/634] Processing 2016_09_05...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_09_05.parquet row count: 1
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_09_05.parquet row count: 1


25/06/22 18:11:12 ERROR Executor: Exception in task 3.0 in stage 24548.0 (TID 43349)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_09_05: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[3/634] Processing 2016_09_13...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_09_13.parquet row count: 0
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_09_13.parquet row count: 0


25/06/22 18:11:13 ERROR Executor: Exception in task 3.0 in stage 24565.0 (TID 43371)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_09_13: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[4/634] Processing 2016_09_15...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_09_15.parquet row count: 1
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_09_15.parquet row count: 1


25/06/22 18:11:15 ERROR Executor: Exception in task 3.0 in stage 24586.0 (TID 43399)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_09_15: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[5/634] Processing 2016_10_02...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_02.parquet row count: 1
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_02.parquet row count: 1


25/06/22 18:11:16 ERROR Executor: Exception in task 3.0 in stage 24605.0 (TID 43425)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_02: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[6/634] Processing 2016_10_03...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_03.parquet row count: 8
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_03.parquet row count: 8


25/06/22 18:11:17 ERROR Executor: Exception in task 3.0 in stage 24624.0 (TID 43457)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_03: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[7/634] Processing 2016_10_04...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_04.parquet row count: 60
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_04.parquet row count: 60


25/06/22 18:11:19 ERROR Executor: Exception in task 3.0 in stage 24643.0 (TID 43489)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_04: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[8/634] Processing 2016_10_05...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_05.parquet row count: 42
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_05.parquet row count: 42


25/06/22 18:11:20 ERROR Executor: Exception in task 3.0 in stage 24660.0 (TID 43515)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_05: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[9/634] Processing 2016_10_06...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_06.parquet row count: 49
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_06.parquet row count: 49


25/06/22 18:11:21 ERROR Executor: Exception in task 3.0 in stage 24681.0 (TID 43553)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_06: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[10/634] Processing 2016_10_07...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_07.parquet row count: 45
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_07.parquet row count: 45


25/06/22 18:11:23 ERROR Executor: Exception in task 3.0 in stage 24700.0 (TID 43585)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_07: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[11/634] Processing 2016_10_08...
loaded from: datamart/bronze/order_reviews/bronze_olist_order_reviews.parquet row count: 104162
loaded from: datamart/silver/order_items/silver_olist_order_items.parquet row count: 112650
loaded from: datamart/silver/orders/silver_olist_orders_2016_10_08.parquet row count: 40
loaded from: datamart/silver/delivery_history/silver_delivery_history_2016_10_08.parquet row count: 40


25/06/22 18:11:24 ERROR Executor: Exception in task 3.0 in stage 24717.0 (TID 43611)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.Whol

Failed on 2016_10_08: [CAST_INVALID_INPUT] The value '2018-02-28 11:26:23' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
jdk.internal.reflect.GeneratedMethodAccessor147.invoke(Unknown Source)


[12/634] Processing 2016_10_09...


KeyboardInterrupt: 

In [18]:
perf = spark.read.parquet("datamart/silver/seller_performance/silver_seller_performance_12_01_2018.parquet")
perf.show(5)

+-------------+--------------------+-----------------+--------------+-------------------+
|snapshot_date|           seller_id|       avg_rating|avg_delay_rate|avg_processing_time|
+-------------+--------------------+-----------------+--------------+-------------------+
|   2018-01-12|989becdce12ebc398...|              5.0|           0.0|               14.0|
|   2018-01-12|994f04b3718c2bab3...|              5.0|           0.0|                5.0|
|   2018-01-12|76d5af76d0271110f...|4.333333333333333|           0.0|  9.333333333333334|
|   2018-01-12|080102cd0a76b09e0...|              5.0|           0.0|               12.0|
|   2018-01-12|a1043bafd471dff53...|              5.0|           1.0|               21.0|
+-------------+--------------------+-----------------+--------------+-------------------+
only showing top 5 rows



### Concentration

In [23]:
# Create silver directory to save concentration data
silver_concentration_directory = "datamart/silver/concentration/"
if not os.path.exists(silver_concentration_directory):
    os.makedirs(silver_concentration_directory)

In [24]:
def process_silver_concentration(spark, shipping_infos_path):
    shipping_info_df = spark.read.parquet(shipping_infos_path)
    
    # Define combinations for each region
    combos = [
        ("customer", "customer_city",  "city"),
        ("customer", "customer_state", "state"),
        ("seller",   "seller_city",    "city"),
        ("seller",   "seller_state",   "state"),
    ]
    
    concentration_dfs = []
    for actor, col_name, level in combos:
        # build df with snapshot_date, type, granularity_level, region
        region_df = (
            shipping_info_df
            .select("snapshot_date", F.col(col_name).alias("region"))
            .withColumn("type", F.lit(actor))
            .withColumn("granularity_level", F.lit(level))
        )
        
        # count orders per snapshot, type, granularity_level, region
        count_df = (
            region_df
            .groupBy("snapshot_date", "type", "granularity_level", "region")
            .agg(F.count("*").alias("region_count"))
        )
        
        # window (rolling frame) to get total for all regions per snapshot, type, level
        window_spec = Window.partitionBy("snapshot_date", "type", "granularity_level")
        pct_df = (
            count_df
            .withColumn("total_count", F.sum("region_count").over(window_spec))
            .withColumn("concentration", F.expr("region_count / total_count * 100"))
            .select("snapshot_date", "granularity_level", "type", "region", "concentration")
        )
        
        concentration_dfs.append(pct_df)
    
    # Union 
    final_concentration_df = concentration_dfs[0]
    for part_df in concentration_dfs[1:]: final_concentration_df = final_concentration_df.unionByName(part_df)
    
    # save silver table - IRL connect to database to write
    year_month = os.path.basename(shipping_infos_path).replace("silver_shipping_infos_", "").replace(".parquet", "")
    partition_name = "silver_concentration_" + year_month + ".parquet"
    filepath       = "datamart/silver/concentration/" + partition_name
    final_concentration_df.write.mode("overwrite").parquet(filepath)
    print("saved to:", filepath)
    
    return final_concentration_df

In [25]:
# Keep track of failures
failed_files    = []
processed_files = []

shipping_files = sorted(glob.glob("datamart/silver/shipping_infos/silver_shipping_infos_*.parquet"))

# Loop over each file
for idx, file_path in enumerate(shipping_files, 1):
    # define params
    basename    = os.path.basename(file_path)
    year_month  = basename.replace("silver_shipping_infos_", "").replace(".parquet", "")
    output_path = f"datamart/silver/concentration/silver_concentration_{year_month}.parquet"
    print(f"\n[{idx}/{len(shipping_files)}] Processing {year_month} ({basename})...")

    # Skip if output already exists
    if os.path.exists(output_path):
        print(f" Skipping {year_month} (already exists)")
        continue

    try:
        start_time = time.time()
        final_df = process_silver_concentration(spark, file_path)
        # Verify row count
        row_count = final_df.count()
        duration  = round(time.time() - start_time, 2)
        print(f"{row_count} rows in {duration}s")

        processed_files.append((year_month, row_count, duration))

    except Exception as e:
        print(f"Failed on {year_month}: {e}")
        failed_files.append((year_month, str(e)))

# Summary
print("\n===== Processing Summary =====")
print(f" Successfully processed: {len(processed_files)} files")
for ym, r_count, dur in processed_files:
    print(f"  - {ym}: {r_count} rows in {dur}s")

if failed_files:
    print(f"\n Failed files: {len(failed_files)}")
    for ym, err in failed_files:
        print(f"  - {ym}: {err}")
else:
    print("\n All files processed successfully")


[1/634] Processing 2016_09_04 (silver_shipping_infos_2016_09_04.parquet)...
saved to: datamart/silver/concentration/silver_concentration_2016_09_04.parquet
4 rows in 1.76s

[2/634] Processing 2016_09_05 (silver_shipping_infos_2016_09_05.parquet)...
saved to: datamart/silver/concentration/silver_concentration_2016_09_05.parquet
4 rows in 1.43s

[3/634] Processing 2016_09_13 (silver_shipping_infos_2016_09_13.parquet)...
saved to: datamart/silver/concentration/silver_concentration_2016_09_13.parquet
0 rows in 1.02s

[4/634] Processing 2016_09_15 (silver_shipping_infos_2016_09_15.parquet)...
saved to: datamart/silver/concentration/silver_concentration_2016_09_15.parquet
4 rows in 1.46s

[5/634] Processing 2016_10_02 (silver_shipping_infos_2016_10_02.parquet)...
saved to: datamart/silver/concentration/silver_concentration_2016_10_02.parquet
4 rows in 1.39s

[6/634] Processing 2016_10_03 (silver_shipping_infos_2016_10_03.parquet)...
saved to: datamart/silver/concentration/silver_concentrati

In [27]:
# Inspect some output
df_conc = spark.read.parquet("datamart/silver/concentration/silver_concentration_2018_01_01.parquet")
df_conc.show(5, truncate=False)

+-------------+-----------------+--------+-----------+----------------+
|snapshot_date|granularity_level|type    |region     |concentration   |
+-------------+-----------------+--------+-----------+----------------+
|2018-01-01   |city             |customer|nova iguacu|1.36986301369863|
|2018-01-01   |city             |customer|itu        |1.36986301369863|
|2018-01-01   |city             |customer|viamao     |1.36986301369863|
|2018-01-01   |city             |customer|rio claro  |1.36986301369863|
|2018-01-01   |city             |customer|corumbatai |1.36986301369863|
+-------------+-----------------+--------+-----------+----------------+
only showing top 5 rows


In [28]:
df_conc.printSchema()

root
 |-- snapshot_date: date (nullable = true)
 |-- granularity_level: string (nullable = true)
 |-- type: string (nullable = true)
 |-- region: string (nullable = true)
 |-- concentration: double (nullable = true)



In [29]:
# Validate that concentrations sum to ~100% per group
df_conc.groupBy("snapshot_date", "type", "granularity_level") \
       .agg(F.sum("concentration").alias("total_pct")) \
       .show(truncate=False)

+-------------+--------+-----------------+------------------+
|snapshot_date|type    |granularity_level|total_pct         |
+-------------+--------+-----------------+------------------+
|2018-01-01   |customer|city             |100.00000000000004|
|2018-01-01   |seller  |city             |100.0             |
|2018-01-01   |customer|state            |100.0             |
|2018-01-01   |seller  |state            |100.00000000000001|
+-------------+--------+-----------------+------------------+



## Build Gold Table (Features)

In [None]:
# snapshot_date_str = "2023-01-01"

# start_date_str = "2023-01-01"
# end_date_str = "2024-12-01"

In [None]:
gold_root = "datamart/gold"
os.makedirs(gold_root, exist_ok=True)
print(f"Gold root directory: {gold_root}")

gold_feature_directory = "datamart/gold/feature_store/"
if not os.path.exists(gold_feature_directory):
    os.makedirs(gold_feature_directory)

Gold root directory: datamart/gold


In [None]:
silver_directory = "datamart/silver"
gold_directory = "datamart/gold"

def read_silver_table(table, silver_db, spark):
    """
    Helper function to read all partitions of a silver table
    """
    folder_path = os.path.join(silver_db, table)
    files_list = [os.path.join(folder_path, os.path.basename(f)) for f in glob.glob(os.path.join(folder_path, '*'))]
    df = spark.read.option("header", "true").parquet(*files_list)
    return df

# cust_df = read_silver_table('customers', silver_directory, spark)
# geo_df = read_silver_table('geolocation', silver_directory, spark)
# items_df = read_silver_table('order_items', silver_directory, spark)
# # to be changed
# logistic_df = read_silver_table('order_items', silver_directory, spark)
# prod_df = read_silver_table('products', silver_directory, spark)
# sellers_df = read_silver_table('sellers', silver_directory, spark)
orders_df = read_silver_table('orders', silver_directory, spark)

# print(f"Snapshot date for this run: {snapshot_date_str}")
snapshot_date_str = "2016-09-05"

gold_processing_feature.process_feature_gold_table(snapshot_date_str, silver_directory, gold_directory, orders_df, spark)

# gold_processing_feature.process_feature_gold_table(snapshot_date_str, silver_directory, gold_directory, 
#                           cust_df, geo_df, items_df, logistic_df, orders_df, prod_df, sellers_df, spark)

                                                                                

saved to: datamart/gold/gold_feature_store_2016_09_05.parquet
Feature gold table processing completed for snapshot date: 2016-09-05


In [None]:
# silver_directory = "datamart/silver"
# gold_directory = "datamart/gold/features"

# cust_path = silver_directory + "/customers/silver_olist_customers.parquet"
# geo_path = silver_directory + "/geolocation/silver_olist_geolocation.parquet"
# items_path = silver_directory + "/items/silver_olist_order_items.parquet"
# logistics_path = silver_directory + "/logistics/silver_olist_order_payments.parquet"
# products_path = silver_directory + "/products/silver_olist_products.parquet"
# sellers_path = silver_directory + "/sellers/silver_olist_sellers.parquet"

# gold_processing_feature.process_feature_gold_table(snapshot_date_str, silver_directory, gold_directory, 
#                           cust_path, geo_path, items_path, logistics_path,
#                           products_path, sellers_path, spark)

In [None]:
df_silver = spark.read.parquet("datamart/gold/gold_feature_store_2016_09_05.parquet")
df_silver.show(5)

+--------------------+------------------------+
|         customer_id|order_purchase_timestamp|
+--------------------+------------------------+
|683c54fc24d40ee9f...|              2016-09-05|
+--------------------+------------------------+



## Inspect Feature Store

## Build Gold Table (Label)

In [None]:
# Create gold datalake
silver_directory = "datamart/silver"
gold_directory = "datamart/gold"

In [None]:
partitions_list = ['2017-10-04']
y= gold_label_processing.process_gold_label(silver_directory, gold_directory, partitions_list, spark)
orders = y.toPandas()

print(f"Number of rows in label store: {orders.shape[0]}")
#orders.groupby('snapshot_date').size()

Building label store...


Saving labels: 100%|██████████| 1/1 [00:00<00:00,  6.13it/s]


Label store Completed
Number of rows in label store: 96478


In [None]:
def read_silver_table(table, silver_directory, spark):
    """
    Helper function to read all partitions of a silver table
    """
    folder_path = os.path.join(silver_directory, table)
    files_list = [os.path.join(folder_path, os.path.basename(f)) for f in glob.glob(os.path.join(folder_path, '*'))]
    df = spark.read.option("header", "true").parquet(*files_list)
    return df

gold_directory = "datamart/gold"
order_df = read_silver_table('label_store', gold_directory, spark)
order_df = order_df.toPandas()
order_df

Unnamed: 0,order_id,miss_delivery_sla,snapshot_date
0,95bfa2a85ef50d3192609d8f29b92cf9,0,2017-10-04
1,c3fd670b03599718895218d479f660b6,1,2017-10-04
2,b6c70f4b37438a78c820423809997c20,1,2017-10-04
3,ab53b19e9f59776c6556ebf49e85a52c,0,2017-10-04
4,9913ce9487d390ef37cd3b6cc3883f0e,1,2017-10-04
...,...,...,...
151,d36b13fdc087b62c490a9db5c0e0a913,0,2017-10-04
152,d4304f4104fca54e2a93b03e5b04962b,0,2017-10-04
153,d9e98b1f6961932f22bf340d0153bbad,0,2017-10-04
154,f495e955026183e7f6bbb3dac79b88e6,0,2017-10-04


In [None]:
order_df['order_purchase_timestamp'].min()
#max_date = '2018-09-03'
#min_date = '2016-09-04'


Timestamp('2016-09-04 21:15:19')

In [None]:
def read_silver_table(table, silver_directory, spark):
    """
    Helper function to read all partitions of a silver table
    """
    folder_path = os.path.join(silver_directory, table)
    files_list = [os.path.join(folder_path, os.path.basename(f)) for f in glob.glob(os.path.join(folder_path, '*'))]
    df = spark.read.option("header", "true").parquet(*files_list)
    return df

############################
# Label Store
############################
def build_label_store(sla, df):
    """
    Function to build label store
    """
    ####################
    # Create labels
    ####################

    # get customer at mob
    df = df.filter(col("order_status") == 'delivered')

    # get label
    df = df.withColumn("order_purchase_timestamp", to_date(col("order_purchase_timestamp")))
    df = df.withColumn("snapshot_date", col("order_purchase_timestamp"))
    df = df.withColumn("miss_delivery_sla", when(col("order_delivered_customer_date") > date_add(col("snapshot_date"), sla), 1).otherwise(0))

    # select columns to save
    df = df.select("order_id", "miss_delivery_sla", "snapshot_date")

    return df

############################
# Pipeline
############################

def process_gold_label(silver_directory, gold_directory, partitions_list, spark):
    """
    Wrapper function to build all gold tables
    """
    # Read silver tables
    orders_df = read_silver_table('orders', silver_directory, spark)

    # Build label store
    print("Building label store...")
    df_label = build_label_store(14, orders_df)

    for date_str in tqdm(partitions_list, total=len(partitions_list), desc="Saving labels"):
        partition_name = date_str.replace('-','_') + '.parquet'
        label_filepath = os.path.join(gold_directory, 'label_store', partition_name)
        df_label.filter(col('snapshot_date') == date_str).write.mode('overwrite').parquet(label_filepath)
        #df_label_filtered = df_label.filter(col('snapshot_date') == date_str)

    print("Label store Completed")

    return df_label

In [None]:
def process_gold_label(silver_directory, gold_directory, partitions_list, spark):
    """
    Wrapper function to build all gold tables
    """
    # Read silver tables
    orders_df = read_silver_table('orders', silver_directory, spark)

    # Build label store
    print("Building label store...")
    df_label = build_label_store(14, orders_df)

    for date_str in tqdm(partitions_list, total=len(partitions_list), desc="Saving labels"):
        partition_name = date_str.replace('-','_') + '.parquet'
        label_filepath = os.path.join(gold_directory, 'label_store', partition_name)
        df_label.filter(col('snapshot_date') == date_str).write.mode('overwrite').parquet(label_filepath)
        #df_label_filtered = df_label.filter(col('snapshot_date') == date_str)

    print("Label store Completed")

    return df_label

In [None]:
start_date_str = ['2017-01-01']

print("Building gold feature tables...")
# Create gold datalake
silver_directory = "datamart/silver"
gold_directory = "datamart/gold"

if not os.path.exists(gold_directory):
    os.makedirs(gold_directory)

# Build gold tables
y = process_gold_label(silver_directory, gold_directory, start_date_str, spark)

# Check for the rows ingested
y_pdf = y.toPandas()
y_count = y_pdf.shape[0]
print(f"Number of rows in label store: {y_pdf.shape[0]}")

print(f"Gold feature tables built successfully from start date: {start_date_str}")

Building gold feature tables...
Building label store...


Saving labels: 100%|██████████| 1/1 [00:00<00:00,  2.19it/s]


Label store Completed
Number of rows in label store: 96478
Gold feature tables built successfully from start date: ['2017-01-01']


## Inspect Label Store

## Stop Spark Session

In [30]:
# End spark session
spark.stop()

print('\n\n---completed job---\n\n')



---completed job---


