In [None]:
# %pip install -q -r /home/jovyan/requirements.txt

In [5]:
import os
import sys

import pendulum
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    FloatType,
    IntegerType,
    ArrayType,
    TimestampType
)

# Make the project package importable before loading local helpers.
sys.path.insert(1, "/home/jovyan/work")

from src.helpers.gen_order import OrderGen

In [6]:
from pyspark.sql.functions import explode, col


In [None]:
# Input files handed to the order generator.
customers_file = "/home/jovyan/data/datasets/customers.json"
products_file = "/home/jovyan/data/datasets/products.json"

# Start an order generator and produce a single record from it.
order_generator = OrderGen()
data = order_generator.generate(
    products_file=products_file,
    customers_file=customers_file,
)

In [None]:
# NOTE(review): calling Faker.seed(0) here would make the fake values reproducible, but it
# seeds the random generator shared by all Faker instances — confirm before enabling.
# Faker.seed(0)
fake = Faker("pt_BR")  # built once; constructing a Faker on every call is needlessly slow

# Not read by fake_date(): the `start_date="-30d"` keyword below is a separate literal.
start_date = pendulum.from_format("2024-08-01", "YYYY-MM-DD", tz="America/Fortaleza")


def fake_date():
    """Return a random date between 30 days ago and yesterday.

    Returns:
        The value produced by ``Faker.date_between`` for the relative
        range ``"-30d"`` .. ``"-1d"``.
    """
    return fake.date_between(start_date="-30d", end_date="-1d")



# Build the raw dataset: one generated record per iteration.
json_dataset = []
for _ in range(20000):
    json_dataset.append(
        order_generator.generate(
            customers_file=customers_file,
            products_file=products_file,
        )
    )
print(len(json_dataset))

In [None]:
# Step 1: get the active SparkSession, creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("Create DataFrame from List of Dictionaries")
    .getOrCreate()
)

In [None]:
# TODO(review): this cell originally began an unfinished `with open` statement, which is a
# SyntaxError. Presumably it was meant to persist json_dataset to disk — confirm and implement.
# Preview a couple of records instead of echoing the entire list.
json_dataset[:2]

## Schema

In [None]:
# Customer struct: every field is a required (non-nullable) string.
customer_schema = StructType(
    [
        StructField(field_name, StringType(), nullable=False)
        for field_name in ("id", "created_at", "name", "email", "tax_id")
    ]
)

In [None]:
# (field name, Spark type) pairs of one order item; all fields are non-nullable.
_item_fields = (
    ("reference_id", StringType()),
    ("name", StringType()),
    ("categoria", StringType()),
    ("unit_price", FloatType()),
    ("quantity", IntegerType()),
)
item_schema = StructType(
    [
        StructField(field_name, field_type, nullable=False)
        for field_name, field_type in _item_fields
    ]
)

In [None]:
# `holder` struct nested under `pix`.
_pix_holder_schema = StructType(
    [
        StructField("name", StringType(), nullable=False),
        StructField("tax_id", StringType(), nullable=False),
    ]
)

# `pix` struct: two identifiers plus the holder.
_pix_schema = StructType(
    [
        StructField("notification_id", StringType(), nullable=False),
        StructField("end_to_end_id", StringType(), nullable=False),
        StructField("holder", _pix_holder_schema, nullable=False),
    ]
)

# Payment method of a charge: a nullable type tag and a nullable `pix` struct.
payment_method = StructType(
    [
        StructField("type", StringType(), nullable=True),
        StructField("pix", _pix_schema, nullable=True),
    ]
)
# `amount.summary`: three required string fields.
_amount_summary_schema = StructType(
    [
        StructField(field_name, StringType(), nullable=False)
        for field_name in ("total", "paid", "refunded")
    ]
)

# `amount`: value and currency plus the summary struct.
_amount_schema = StructType(
    [
        StructField("value", StringType(), nullable=False),
        StructField("currency", StringType(), nullable=False),
        StructField("summary", _amount_summary_schema, nullable=False),
    ]
)

# A single charge; only `paid_at` and `description` may be null.
_charge_schema = StructType(
    [
        StructField("id", StringType(), nullable=False),
        StructField("reference_id", StringType(), nullable=False),
        StructField("status", StringType(), nullable=False),
        StructField("created_at", StringType(), nullable=False),
        StructField("paid_at", StringType(), nullable=True),
        StructField("description", StringType(), nullable=True),
        StructField("amount", _amount_schema, nullable=False),
        StructField("payment_method", payment_method, nullable=False),
    ]
)

# An order carries an array of charges.
charges_schema = ArrayType(_charge_schema)

In [None]:
# Shipping address: (field name, nullable) pairs; every field is a string.
_shipping_fields = (
    ("street", False),
    ("number", False),
    ("complement", True),
    ("locality", True),
    ("city", True),
    ("region_code", True),
    ("country", True),
    ("postal_code", False),
)
shipping_schema = StructType(
    [
        StructField(field_name, StringType(), nullable=is_nullable)
        for field_name, is_nullable in _shipping_fields
    ]
)

In [None]:
# The whole JSON document: (field name, Spark type) pairs, all non-nullable.
_order_fields = (
    ("id", StringType()),
    ("reference_id", StringType()),
    ("created_at", StringType()),
    ("shipping", shipping_schema),
    ("items", ArrayType(item_schema)),
    ("customer", customer_schema),
    ("charges", charges_schema),
)
schema = StructType(
    [
        StructField(field_name, field_type, nullable=False)
        for field_name, field_type in _order_fields
    ]
)

## Read RAW Data

In [None]:
# Step 3: build the raw DataFrame from the generated records, enforcing the explicit schema.
df0 = spark.createDataFrame(
    data=json_dataset,
    schema=schema,
)

In [None]:
# Explode both arrays so each row holds one `charge` and one `item` of its order,
# then drop the original array columns.
df = (
    df0
    .withColumn("charge", explode("charges"))
    .withColumn("item", explode("items"))
    .drop("charges", "items")
)

In [None]:
# Inspect the exploded schema (expect `charge` and `item` struct columns).
df.printSchema()

In [None]:
%%time
# Overwrite any previous export so the cell can be re-run: the default save mode
# raises an error when the target path already exists.
df.write.mode("overwrite").parquet('/home/jovyan/data/volumes/jupyter/original.parquet')

In [None]:
%%time
# df = spark.read.parquet('/home/jovyan/data/volumes/jupyter/original.parquet')

In [None]:
# df.show(truncate=False)
# Collect the Spark DataFrame into pandas, then display five random rows.
pandas_df = df.toPandas()
pandas_df.sample(n=5)

## Tables

### OrderItems

In [None]:
# Columns of the order-items table, projected from the order id and the exploded `item` struct.
order_item_columns = [
    col("id").alias("order_id"),
    col("item.reference_id").alias("product_reference"),
    col("item.unit_price").cast('decimal(10,2)').alias('unit_price'),
    col("item.quantity").cast('integer').alias('quantity'),
]

# Duplicates are removed after the projection.
order_items_df = df.select(*order_item_columns).dropDuplicates()

order_items_df.show(truncate=False)
order_items_df.printSchema()

### Vendas (Sales)

In [None]:
# Sales table: promote nested charge/customer values to top-level columns, drop the
# structs, de-duplicate, then inline the shipping fields.
vendas_df = (
    df
    .withColumn("created_at", col("created_at").cast(TimestampType()))
    .withColumn("customer_id", col("customer.id"))
    .withColumn("paid_at", col("charge.paid_at").cast(TimestampType()))
    .withColumn("status", col("charge.status"))
    .withColumn("payment_method", col("charge.payment_method.type"))
    .withColumn("payment_total", col("charge.amount.summary.total").cast("decimal(10,2)"))
    .drop("charge", "customer", "item")
    .dropDuplicates()
    .select("*", "shipping.*")
    .drop("shipping")
    .withColumn("number", col("number").cast(IntegerType()))
)

vendas_df.printSchema()

# Show only a handful of columns to keep the preview readable.
preview_columns = ["id", "customer_id", "paid_at", "payment_method", "payment_total", "region_code"]
vendas_df.select(*preview_columns).show(truncate=True)


In [None]:
from pyspark.sql.functions import col, explode, window, sum as _sum

# Total revenue per tumbling 30-day window of payment time.
# The watermark must name the event-time column that exists in vendas_df (`paid_at`);
# the previous "timestamp" column is not part of that DataFrame.
windowed_stream = (
    vendas_df.withWatermark("paid_at", "1 day")
    .groupBy(window(col("paid_at"), "30 days").alias("window"))
    .agg(_sum("payment_total").alias("total_revenue"))
    # Cast the window struct to a string column.
    .withColumn("window", col("window").cast(StringType()))
)
windowed_stream.show(truncate=False)

In [None]:
# Display the Python type of the aggregated result.
type(windowed_stream)