In [1]:
import findspark
# Called before any pyspark import in the cells that follow.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [4]:
# Connection settings for the source Postgres database. Each value can be
# overridden through an environment variable so real credentials never have to
# be committed with the notebook; the fallbacks are the values this notebook
# originally hardcoded.
USER_NAME = os.environ.get("SOURCE_DB_USER", "sourcedb1")
PASS_WORD = os.environ.get("SOURCE_DB_PASSWORD", "sourcedb1")
HOST_NAME = os.environ.get("SOURCE_DB_HOST", "postgres-source")
DB_NAME = os.environ.get("SOURCE_DB_NAME", "sourcedb")

# URL-encode the user and password so that characters such as "@", ":" or "/"
# in an overridden credential cannot corrupt the connection URL.
CONN_STRING = (
    f"postgresql+psycopg2://{quote_plus(USER_NAME)}:{quote_plus(PASS_WORD)}"
    f"@{HOST_NAME}/{DB_NAME}"
)
engine = create_engine(CONN_STRING)

In [5]:
# Spark session settings, consumed by the SparkSession builder cell.
master = "local"                # Spark master URL passed to .master()
appName = "pyspark_postgres"    # application name passed to .appName()

In [6]:
# Build the Spark session (or reuse an existing one) from `master` and `appName`.
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

21/09/10 05:49:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Customers Table

**TODO:** consider reading this table directly with Spark instead of going through pandas.

In [8]:
# Load the whole ecommerce.customers table into a pandas DataFrame.
customers_df = pd.read_sql(
    sql='SELECT * FROM ecommerce.customers',
    con=engine,
)

In [9]:
# Convert the pandas frame into a Spark DataFrame. No schema is passed here,
# so the column types are whatever Spark derives from the pandas data.
customers_sdf = spark.createDataFrame(customers_df)

In [10]:
# Show the schema of the Spark DataFrame built from the pandas frame.
print(customers_sdf.schema)

StructType(List(StructField(customer_id,StringType,true),StructField(customer_unique_id,StringType,true),StructField(customer_zip_code_prefix,StringType,true),StructField(customer_city,StringType,true),StructField(customer_state,StringType,true)))


Reading in raw CSV file

In [7]:
# Explicit schema for the customers CSV: five columns, each a nullable string.
schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("customer_unique_id", StringType(), True),
    StructField("customer_zip_code_prefix", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
])

In [12]:
# Read the raw customers CSV with the explicit schema; the first line of the
# file is treated as a header.
customers_csv_sdf = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("/home/jovyan/source-data/olist_customers_dataset.csv")
)

Are the two DataFrames equivalent? Yes — both the schema comparison and the row-by-row comparison below return `True`.

In [18]:
# Compare the schema of the Postgres-sourced DataFrame with the CSV-sourced one.
customers_sdf.schema == customers_csv_sdf.schema

True

In [20]:
# Collect every row of both DataFrames into local Python lists and compare
# them; list equality is positional, so identical row order is required too.
customers_sdf.collect() == customers_csv_sdf.collect()

21/09/10 05:54:53 WARN TaskSetManager: Stage 3 contains a task of very large size (9655 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

True

In [21]:
def isEqual(df1, df2):
    """Report whether two DataFrames have equal schemas and equal collected rows.

    The schemas are compared first; rows are only collected when the schemas
    match. The collected row lists are compared with ``!=``, so the result
    depends on row order, and both DataFrames are fully materialised through
    ``collect()``.

    Args:
        df1: First DataFrame (any object exposing ``schema`` and ``collect()``).
        df2: Second DataFrame, same requirements.

    Returns:
        bool: ``True`` when both the schemas and the collected rows are equal,
        ``False`` otherwise.
    """
    schemas_differ = df1.schema != df2.schema
    if schemas_differ:
        return False
    rows_differ = df1.collect() != df2.collect()
    return not rows_differ

In [22]:
# Run the schema-and-rows check on the Postgres-sourced and CSV-sourced frames.
isEqual(customers_sdf, customers_csv_sdf)

21/09/10 05:56:44 WARN TaskSetManager: Stage 5 contains a task of very large size (9655 KiB). The maximum recommended task size is 1000 KiB.


True

## Orders Table

In [10]:
# Load the whole ecommerce.orders table into a pandas DataFrame.
orders_df = pd.read_sql(
    sql='SELECT * FROM ecommerce.orders',
    con=engine,
)

In [11]:
# Convert the pandas frame into a Spark DataFrame. No schema is passed here,
# so the column types are whatever Spark derives from the pandas data.
orders_sdf = spark.createDataFrame(orders_df)

In [12]:
# Show the schema of the Spark DataFrame built from the pandas frame.
print(orders_sdf.schema)

StructType(List(StructField(order_id,StringType,true),StructField(customer_id,StringType,true),StructField(order_status,StringType,true),StructField(order_purchase_timestamp,TimestampType,true),StructField(order_approved_at,TimestampType,true),StructField(order_delivered_carrier_date,TimestampType,true),StructField(order_delivered_customer_date,TimestampType,true),StructField(order_estimated_delivery_date,TimestampType,true)))


## Geolocation Table

In [13]:
# Load the whole ecommerce.geolocation table into a pandas DataFrame.
geolocation_df = pd.read_sql(
    sql='SELECT * FROM ecommerce.geolocation',
    con=engine,
)

In [14]:
# Convert the pandas frame into a Spark DataFrame. No schema is passed here,
# so the column types are whatever Spark derives from the pandas data.
geolocation_sdf = spark.createDataFrame(geolocation_df)

In [15]:
# Show the schema of the Spark DataFrame built from the pandas frame.
print(geolocation_sdf.schema)

StructType(List(StructField(geolocation_zip_code_prefix,StringType,true),StructField(geolocation_lat,DoubleType,true),StructField(geolocation_lng,DoubleType,true),StructField(geolocation_city,StringType,true),StructField(geolocation_state,StringType,true)))


## Output as Parquet

In [17]:
# Write the customers DataFrame as snappy-compressed Parquet.
# mode='overwrite' keeps this cell re-runnable: with the writer's default mode
# the write fails as soon as the target path already exists.
customers_sdf.write.parquet(
    '/home/jovyan/filesystem/customers.parquet',
    mode='overwrite',
    compression='snappy',
)

21/09/08 07:14:40 WARN TaskSetManager: Stage 1 contains a task of very large size (9655 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [18]:
# Write the orders DataFrame as snappy-compressed Parquet.
# mode='overwrite' keeps this cell re-runnable: with the writer's default mode
# the write fails as soon as the target path already exists.
orders_sdf.write.parquet(
    '/home/jovyan/filesystem/orders.parquet',
    mode='overwrite',
    compression='snappy',
)

21/09/08 07:14:45 WARN TaskSetManager: Stage 2 contains a task of very large size (12593 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [19]:
# Write the geolocation DataFrame as snappy-compressed Parquet.
# mode='overwrite' keeps this cell re-runnable: with the writer's default mode
# the write fails as soon as the target path already exists.
geolocation_sdf.write.parquet(
    '/home/jovyan/filesystem/geolocation.parquet',
    mode='overwrite',
    compression='snappy',
)

21/09/08 07:14:49 WARN TaskSetManager: Stage 3 contains a task of very large size (46472 KiB). The maximum recommended task size is 1000 KiB.
                                                                                