In [1]:
import argparse
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

import utils.data_processing_bronze_table as bronze_processing
import utils.data_processing_silver_table as silver_processing

## Set up PySpark session

In [2]:
print('\n\n---starting job---\n\n')

# Build (or reuse, via getOrCreate) a SparkSession running in local mode
spark = (
    pyspark.sql.SparkSession.builder
    .appName("olist_bronze_processing")
    .master("local[*]")
    .getOrCreate()
)

# Restrict Spark logging to ERROR so warnings do not clutter the cell output
spark.sparkContext.setLogLevel("ERROR")



---starting job---




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/17 06:16:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Build Bronze Table

**Important note:** The `datamart` folder is created in a different location depending on whether the pipeline is run via the `main.py` script or via this Jupyter notebook.

* This Jupyter notebook will create the datamart folder inside `scripts` folder and output the bronze tables there.
* When you run the main.py script, the datamart folder will be created inside `app` folder (i.e. root) and output the bronze tables there.

**TODO:** Hold a team meeting to agree on a single `datamart` location.

I built the bronze tables by running the `main.py` script, so the Silver Table code below reads the bronze tables from the `app` folder (i.e. `../datamart/bronze/`).

In [6]:
# Create bronze root directory
# The path is relative to the notebook's working directory; exist_ok=True keeps
# the cell safe to re-run when the folder already exists.
bronze_root = "datamart/bronze"
os.makedirs(bronze_root, exist_ok=True)
print(f"Bronze root directory: {bronze_root}")

Bronze root directory: datamart/bronze


In [None]:
# Process all Olist datasets
# Each helper comes from utils.data_processing_bronze_table and receives the bronze
# root folder plus the active Spark session. Orders are handled in the next cell.
print("\nProcessing Olist datasets...")
bronze_processing.process_olist_customers_bronze(bronze_root, spark)
bronze_processing.process_olist_geolocation_bronze(bronze_root, spark)
bronze_processing.process_olist_order_items_bronze(bronze_root, spark)
bronze_processing.process_olist_order_payments_bronze(bronze_root, spark)
bronze_processing.process_olist_order_reviews_bronze(bronze_root, spark)
bronze_processing.process_olist_products_bronze(bronze_root, spark)
bronze_processing.process_olist_sellers_bronze(bronze_root, spark)
bronze_processing.process_product_cat_translation_bronze(bronze_root, spark)

In [None]:
# Process orders with monthly partitioning
# NOTE(review): the partitioning itself happens inside the bronze helper, which is
# not visible in this notebook - confirm the output layout there.
bronze_processing.process_olist_orders_bronze(bronze_root, spark)

In [5]:
# Inspect some output
# NOTE: the path is hard-coded relative to the parent folder (instead of using
# bronze_root) because of the datamart location discrepancy described above.
# TODO: switch back to bronze_root once the discrepancy is resolved.
df_bronze = spark.read.parquet("../datamart/bronze/customers/bronze_olist_customers.parquet")
df_bronze.show(5)

# Confirmed: the bronze customers parquet can be read back

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|503840d4f2a1a7609...|ffc4233210eac4ec1...|                   14811|          araraquara|            SP|
|52e73a5d0a1d4c56b...|b43530186123fb6d9...|                   62625|               missi|            CE|
|16cb62869f9719571...|c3cc321141423ab8a...|                   55560|           barreiros|            PE|
|4979ba0e6037e4b28...|80768413a59684f1e...|                   29307|cachoeiro de itap...|            ES|
|11ec4bc0610184925...|bd836cf4fce7f808b...|                   22420|      rio de janeiro|            RJ|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



## Build Silver Table

**Important note:** The `datamart` folder is created in a different location depending on whether the pipeline is run via the `main.py` script or via this Jupyter notebook.

* This Jupyter notebook will create the datamart folder inside `scripts` folder and output the silver tables there.
* When you run the main.py script, the datamart folder will be created inside `app` folder (i.e. root) and output the silver tables there.

**TODO:** Hold a team meeting to agree on a single `datamart` location.

In [3]:
# Create silver root directory
# The path is relative to the notebook's working directory; exist_ok=True keeps
# the cell safe to re-run when the folder already exists.
silver_root = "datamart/silver"
os.makedirs(silver_root, exist_ok=True)
print(f"Silver root directory: {silver_root}")

Silver root directory: datamart/silver


In [4]:
# Create all required output directories

# Silver directory to save customer data
silver_cust_directory = "datamart/silver/customers/"

# Silver directory to save seller data
silver_sell_directory = "datamart/silver/sellers/"

# Silver directory to save geolocation data
silver_geo_directory = "datamart/silver/geolocation/"

# exist_ok=True replaces the separate os.path.exists() check: it is race-free,
# idempotent on re-run, and matches how the bronze/silver root folders are created.
for silver_table_directory in (
    silver_cust_directory,
    silver_sell_directory,
    silver_geo_directory,
):
    os.makedirs(silver_table_directory, exist_ok=True)

In [5]:
# Process all bronze tables into silver
# Bronze inputs are read from ../datamart/bronze/ (the tables written by main.py),
# while silver outputs go to the directories created in the previous cell.
print("\nProcessing bronze tables...")

# Keep each returned value in a named variable: the cell no longer ends on a bare
# call (which printed a stray DataFrame[...] repr) and the results stay inspectable.
silver_customers_df = silver_processing.process_silver_olist_customers("../datamart/bronze/customers/", silver_cust_directory, spark)
silver_sellers_df = silver_processing.process_silver_olist_sellers("../datamart/bronze/sellers/", silver_sell_directory, spark)
silver_geolocation_df = silver_processing.process_silver_olist_geolocation("../datamart/bronze/geolocation/", silver_geo_directory, spark)
# add more below


Processing bronze tables...
loaded from: ../datamart/bronze/customers/bronze_olist_customers.parquet row count: 99441
Number of duplicated 'customer_id': 0


                                                                                

saved to: datamart/silver/customers/silver_olist_customers.parquet
loaded from: ../datamart/bronze/sellers/bronze_olist_sellers.parquet row count: 3095
Number of duplicated 'seller_id': 0
saved to: datamart/silver/sellers/silver_olist_sellers.parquet
loaded from: ../datamart/bronze/geolocation/bronze_olist_geolocation.parquet row count: 1000325


                                                                                

saved to: datamart/silver/geolocation/silver_olist_geolocation.parquet


DataFrame[geolocation_zip_code_prefix: string, geolocation_lat: double, geolocation_lng: double]

In [None]:
# Process orders with monthly partitioning
# add more below

In [6]:
# Inspect some output
# Reads back the silver geolocation parquet written by the processing cell above
df_silver = spark.read.parquet("datamart/silver/geolocation/silver_olist_geolocation.parquet")
df_silver.show(5)

# Confirmed: the silver geolocation parquet can be read back

+---------------------------+-------------------+-------------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|
+---------------------------+-------------------+-------------------+
|                      49290|-11.274805005391439|-37.790795516967776|
|                      49630|-10.605308055877686|-37.113027572631836|
|                      55445|   -8.5616774559021|  -35.8295783996582|
|                      57051| -9.655002400681779| -35.73440123893119|
|                      57085| -9.558634171119103| -35.73914117079515|
+---------------------------+-------------------+-------------------+
only showing top 5 rows



### Build Customer Table

In [10]:
# Create silver directory to save customer data
# exist_ok=True replaces the separate os.path.exists() check: it is race-free,
# idempotent on re-run, and matches how the bronze/silver root folders are created.
silver_cust_directory = "datamart/silver/customers/"
os.makedirs(silver_cust_directory, exist_ok=True)

In [11]:
def process_silver_olist_customers(bronze_directory, silver_directory, spark):
    """Build the silver customers table from the bronze customers parquet.

    Reads ``bronze_olist_customers.parquet`` from ``bronze_directory``, casts every
    customer column to string, prints how many ``customer_id`` values are
    duplicated, left-pads ``customer_zip_code_prefix`` with zeros to five
    characters, and overwrites ``silver_olist_customers.parquet`` in
    ``silver_directory``.

    Args:
        bronze_directory: Folder holding the bronze parquet (trailing slash optional).
        silver_directory: Folder to write the silver parquet to (trailing slash optional).
        spark: Active SparkSession used to read the bronze table.

    Returns:
        The cleaned Spark DataFrame that was written to the silver table.
    """
    # connect to bronze table
    # os.path.join tolerates a directory passed without its trailing slash
    filepath = os.path.join(bronze_directory, "bronze_olist_customers.parquet")
    df = spark.read.parquet(filepath)

    # Count once and reuse: the casts and padding below never add or drop rows,
    # so a second full count() action would only repeat the scan.
    total_rows = df.count()
    print('loaded from:', filepath, 'row count:', total_rows)

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "customer_id": StringType(),
        "customer_unique_id": StringType(),
        "customer_zip_code_prefix": StringType(),
        "customer_city": StringType(),
        "customer_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Check customer_id duplicates (total rows - distinct ids)
    distinct_rows = df.select("customer_id").distinct().count()
    duplicates_customer_id = total_rows - distinct_rows
    print(f"Number of duplicated 'customer_id': {duplicates_customer_id}")

    # Add missing leading zero (prefixes shorter than 5 characters are left-padded)
    df = df.withColumn(
        "customer_zip_code_prefix",
        F.lpad(col("customer_zip_code_prefix"), 5, "0")
    )

    # save silver table - IRL connect to database to write
    filepath = os.path.join(silver_directory, "silver_olist_customers.parquet")
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)

    return df

In [13]:
# Run function manually to test
# The bronze directory is hard-coded to ../datamart/bronze/customers/ for now
# (TODO: amend after our path discrepancies are resolved)
df = process_silver_olist_customers("../datamart/bronze/customers/",silver_cust_directory, spark)

loaded from: ../datamart/bronze/customers/bronze_olist_customers.parquet row count: 99441
Number of duplicated 'customer_id': 0
saved to: datamart/silver/customers/silver_olist_customers.parquet


In [14]:
# Check schema enforced: every customer column should be reported as string
df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [15]:
# Check missing leading zero padded: group rows by the length of the zip prefix;
# a single group with length 5 means every prefix was padded correctly
df.groupBy(F.length("customer_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5|99441|
+------+-----+



### Build Seller Table

In [16]:
# Create silver directory to save seller data
# exist_ok=True replaces the separate os.path.exists() check: it is race-free,
# idempotent on re-run, and matches how the bronze/silver root folders are created.
silver_sell_directory = "datamart/silver/sellers/"
os.makedirs(silver_sell_directory, exist_ok=True)

In [17]:
def process_silver_olist_sellers(bronze_directory, silver_directory, spark):
    """Build the silver sellers table from the bronze sellers parquet.

    Reads ``bronze_olist_sellers.parquet`` from ``bronze_directory``, casts every
    seller column to string, prints how many ``seller_id`` values are duplicated,
    left-pads ``seller_zip_code_prefix`` with zeros to five characters, and
    overwrites ``silver_olist_sellers.parquet`` in ``silver_directory``.

    Args:
        bronze_directory: Folder holding the bronze parquet (trailing slash optional).
        silver_directory: Folder to write the silver parquet to (trailing slash optional).
        spark: Active SparkSession used to read the bronze table.

    Returns:
        The cleaned Spark DataFrame that was written to the silver table.
    """
    # connect to bronze table
    # os.path.join tolerates a directory passed without its trailing slash
    filepath = os.path.join(bronze_directory, "bronze_olist_sellers.parquet")
    df = spark.read.parquet(filepath)

    # Count once and reuse: the casts and padding below never add or drop rows,
    # so a second full count() action would only repeat the scan.
    total_rows = df.count()
    print('loaded from:', filepath, 'row count:', total_rows)

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "seller_id": StringType(),
        "seller_zip_code_prefix": StringType(),
        "seller_city": StringType(),
        "seller_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Check seller_id duplicates (total rows - distinct ids)
    distinct_rows = df.select("seller_id").distinct().count()
    duplicates_seller_id = total_rows - distinct_rows
    print(f"Number of duplicated 'seller_id': {duplicates_seller_id}")

    # Add missing leading zero (prefixes shorter than 5 characters are left-padded)
    df = df.withColumn(
        "seller_zip_code_prefix",
        F.lpad(col("seller_zip_code_prefix"), 5, "0")
    )

    # save silver table - IRL connect to database to write
    filepath = os.path.join(silver_directory, "silver_olist_sellers.parquet")
    df.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)

    return df

In [18]:
# Run function manually to test
# The bronze directory is hard-coded to ../datamart/bronze/sellers/ for now
# (TODO: amend after our path discrepancies are resolved)
df = process_silver_olist_sellers("../datamart/bronze/sellers/",silver_sell_directory, spark)

loaded from: ../datamart/bronze/sellers/bronze_olist_sellers.parquet row count: 3095
Number of duplicated 'seller_id': 0
saved to: datamart/silver/sellers/silver_olist_sellers.parquet


In [19]:
# Check schema enforced: every seller column should be reported as string
df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



In [20]:
# Check missing leading zero padded: group rows by the length of the zip prefix;
# a single group with length 5 means every prefix was padded correctly
df.groupBy(F.length("seller_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5| 3095|
+------+-----+



### Build Geolocation Table

In [21]:
# Create silver directory to save geolocation data
# exist_ok=True replaces the separate os.path.exists() check: it is race-free,
# idempotent on re-run, and matches how the bronze/silver root folders are created.
silver_geo_directory = "datamart/silver/geolocation/"
os.makedirs(silver_geo_directory, exist_ok=True)

In [10]:
def process_silver_olist_geolocation(bronze_directory, silver_directory, spark):
    """Build the silver geolocation table: one centroid row per zip code prefix.

    Reads ``bronze_olist_geolocation.parquet`` from ``bronze_directory``, enforces
    column types, left-pads ``geolocation_zip_code_prefix`` with zeros to five
    characters, then collapses the rows to one per prefix by averaging latitude
    and longitude. The deduplicated frame is what gets written to
    ``silver_olist_geolocation.parquet`` in ``silver_directory`` and returned.

    Args:
        bronze_directory: Folder holding the bronze parquet (trailing slash optional).
        silver_directory: Folder to write the silver parquet to (trailing slash optional).
        spark: Active SparkSession used to read the bronze table.

    Returns:
        Spark DataFrame with columns ``geolocation_zip_code_prefix``,
        ``geolocation_lat`` and ``geolocation_lng`` (city and state are not kept
        by the aggregation).
    """
    # connect to bronze table
    # os.path.join tolerates a directory passed without its trailing slash
    filepath = os.path.join(bronze_directory, "bronze_olist_geolocation.parquet")
    df = spark.read.parquet(filepath)
    print('loaded from:', filepath, 'row count:', df.count())

    # clean data: enforce schema / data type
    # Dictionary specifying columns and their desired datatypes
    column_type_map = {
        "geolocation_zip_code_prefix": StringType(),
        "geolocation_lat": FloatType(),
        "geolocation_lng": FloatType(),
        "geolocation_city": StringType(),
        "geolocation_state": StringType(),
    }

    for column, new_type in column_type_map.items():
        df = df.withColumn(column, col(column).cast(new_type))

    # Add missing leading zero (prefixes shorter than 5 characters are left-padded)
    df = df.withColumn(
        "geolocation_zip_code_prefix",
        F.lpad(col("geolocation_zip_code_prefix"), 5, "0")
    )

    # Deduplicate zipcodes by just taking the centroid (mean of lat,lng).
    # Note: the averaged columns come out as double, not float.
    df_dedupe = df.groupBy("geolocation_zip_code_prefix").agg(
        F.avg("geolocation_lat").alias("geolocation_lat"),
        F.avg("geolocation_lng").alias("geolocation_lng")
    )

    # save silver table - IRL connect to database to write
    # Write the deduplicated frame (previously the un-deduplicated `df` was saved,
    # so the persisted table did not match the frame returned to the caller).
    filepath = os.path.join(silver_directory, "silver_olist_geolocation.parquet")
    df_dedupe.write.mode("overwrite").parquet(filepath)
    print('saved to:', filepath)

    return df_dedupe

In [11]:
# Run function manually to test
# The bronze directory is hard-coded to ../datamart/bronze/geolocation/ for now
# (TODO: amend after our path discrepancies are resolved)
df = process_silver_olist_geolocation("../datamart/bronze/geolocation/",silver_geo_directory, spark)

loaded from: ../datamart/bronze/geolocation/bronze_olist_geolocation.parquet row count: 1000325


                                                                                

saved to: datamart/silver/geolocation/silver_olist_geolocation.parquet


In [12]:
# Check schema enforced on the returned (deduplicated) frame: string zip prefix
# plus the averaged latitude / longitude columns
df.printSchema()

root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)



In [13]:
# Check missing leading zero padded: group rows by the length of the zip prefix;
# a single group with length 5 means every prefix was padded correctly
df.groupBy(F.length("geolocation_zip_code_prefix").alias("length")).count().show()

+------+-----+
|length|count|
+------+-----+
|     5|19177|
+------+-----+



In [14]:
# Sanity check: every geolocation_zip_code_prefix should appear exactly once.
# Count the rows per prefix, then list any prefix occurring more than once
# (an empty table means the deduplication worked).
zip_prefix_counts = df.groupBy("geolocation_zip_code_prefix").agg(F.count("*").alias("count"))
zip_prefix_counts.filter("count > 1").show()

+---------------------------+-----+
|geolocation_zip_code_prefix|count|
+---------------------------+-----+
+---------------------------+-----+



## Build Gold Table (Features)

## Inspect Feature Store

## Build Gold Table (Label)

## Inspect Label Store

## Stop Spark Session

In [1]:
# End spark session
# Requires the `spark` object created in the session set-up cell to exist in the
# current kernel; running this cell on a fresh kernel raises NameError.
spark.stop()

print('\n\n---completed job---\n\n')

NameError: name 'spark' is not defined