In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import datetime
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Spark session
spark = SparkSession.builder \
        .appName("test") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.jars.packages",
                "org.postgresql:postgresql:42.7.4,"
                "org.apache.hadoop:hadoop-azure:3.3.6,"
                "org.apache.hadoop:hadoop-azure-datalake:3.3.6,"
                "io.delta:delta-spark_2.13:4.0.0") \
        .getOrCreate()

# ADLS Gen2 configuration
storage_account_name = "firstprojectde"
storage_account_key = os.getenv("AZURE_STORAGE_KEY")
container_name = "steps"

# Set Hadoop configuration for ADLS Gen2
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key
)

# Base ADLS path
adls_base_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/"

# Previous dayâ€™s date (e.g., 2025-08-27 on August 28)
yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

#test
yesterday = "2025-10-20"




In [2]:
sales_df.show()

+-------+-----------+--------+-------------------+------------+----------+--------------------+--------------------+--------+-------------------+
|sale_id|customer_id|store_id|          timestamp|total_amount|      date|          store_name|             address|    town|number_of_employees|
+-------+-----------+--------+-------------------+------------+----------+--------------------+--------------------+--------+-------------------+
| 110000|      C1466|    S008|2025-10-20 08:55:00|      507.39|2025-10-20|Store Murphy, Car...|    1143 Samuel Isle|   Gabes|                 13|
| 110001|      C1206|    S013|2025-10-20 20:32:00|      384.35|2025-10-20|Store Potter-Thom...|7715 Alice Mount ...|    Sfax|                 17|
| 110002|      C1479|    S007|2025-10-20 18:42:00|      375.06|2025-10-20|Store Cunningham-...|0466 Martinez Poi...|Monastir|                 18|
| 110003|      C1309|    S002|2025-10-20 09:08:00|      644.51|2025-10-20|    Store Reed Group|731 Goodwin Cente...|    Sfax

In [3]:
sales_df.count()

50

In [5]:
from src.modeling.io_utils import read_parquet
from src.utils.config import get_config


config = get_config()

date = "2025-10-20"

In [6]:
sales_df = read_parquet(spark, config, "raw", "transactions", "sales").dropDuplicates(["sale_id"])


In [8]:
sales_df = read_parquet(spark, config, "cleaned", "transactions", "sales").dropDuplicates(["sale_id"])

In [7]:
sales_df.count()

2300

In [7]:
sales_df.select("timestamp").distinct().show()


+--------------------+
|           timestamp|
+--------------------+
|2025-09-02 20:34:...|
|2025-09-03 07:34:...|
|2025-09-03 05:36:...|
|2025-09-03 15:40:...|
|2025-09-03 03:08:...|
| 2025-08-20 10:51:00|
| 2025-08-02 17:15:00|
| 2025-08-17 08:25:00|
| 2025-08-22 18:02:00|
| 2025-08-09 09:01:00|
| 2025-08-06 14:35:00|
| 2025-08-04 09:04:00|
|2025-09-03 02:11:...|
|2025-09-03 12:02:...|
| 2025-09-04 07:04:56|
|2025-09-03 02:46:...|
| 2025-08-19 12:24:00|
| 2025-08-21 20:13:00|
| 2025-08-19 15:21:00|
| 2025-08-18 14:09:00|
+--------------------+
only showing top 20 rows


In [None]:


sales_df = read_parquet(spark, config, "cleaned", "transactions", "sales", date=date).dropDuplicates(["sale_id"])
sales_details_df = read_parquet(spark, config, "cleaned", "transactions", "sales_details", date=date).dropDuplicates(["detail_id"])
payments_df = read_parquet(spark, config, "cleaned", "transactions", "payments", date=date).dropDuplicates(["payment_id"])

In [3]:
sales_df.count()

50

In [7]:
sales_df.show()

+-------+-----------+--------+-------------------+------------+----------+--------------------+--------------------+--------+-------------------+
|sale_id|customer_id|store_id|          timestamp|total_amount|      date|          store_name|             address|    town|number_of_employees|
+-------+-----------+--------+-------------------+------------+----------+--------------------+--------------------+--------+-------------------+
| 110000|      C1466|    S008|2025-10-20 08:55:00|      507.39|2025-10-20|Store Murphy, Car...|    1143 Samuel Isle|   Gabes|                 13|
| 110001|      C1206|    S013|2025-10-20 20:32:00|      384.35|2025-10-20|Store Potter-Thom...|7715 Alice Mount ...|    Sfax|                 17|
| 110002|      C1479|    S007|2025-10-20 18:42:00|      375.06|2025-10-20|Store Cunningham-...|0466 Martinez Poi...|Monastir|                 18|
| 110003|      C1309|    S002|2025-10-20 09:08:00|      644.51|2025-10-20|    Store Reed Group|731 Goodwin Cente...|    Sfax

In [4]:
from src.modeling.modeling_utils import create_dim_product, create_dim_store, create_dim_date, create_dim_payment_method, create_fact_sales

dim_payment_method_existing = read_parquet(spark, config, "curated", "analytics", "dim_payment_method")
            
# Create new dimension data
dim_product_new = create_dim_product(sales_details_df)
dim_store_new = create_dim_store(sales_df)
dim_date_new = create_dim_date(sales_df)
dim_payment_method_new = create_dim_payment_method(payments_df, dim_payment_method_existing)

# Create fact table


In [13]:
dim_payment_method_existing.show()

+---------+------+------------------+
|method_id|method|       description|
+---------+------+------------------+
|        1|Crypto|Payment via Crypto|
|        2|Cheque|Payment via Cheque|
|        3|  Card|  Payment via Card|
|        4|  Cash|  Payment via Cash|
|        5|Mobile|Payment via Mobile|
+---------+------+------------------+



In [34]:
sales_alias = sales_df.select(
    F.col("sale_id"),
    F.col("store_id"),
    F.col("total_amount"),
    F.to_date(F.col("timestamp")).alias("sale_date")
).alias("s")

details_alias = sales_details_df.alias("d")
payments_alias = payments_df.select(F.col("sale_id"), F.col("method")).alias("p")
date_alias = dim_date_new.alias("dt")
payment_dim_alias = dim_payment_method_new.alias("pm")

# Use LEFT joins temporarily to see which join is causing the issue
result = details_alias.join(
    sales_alias, on="sale_id", how="left"  # Changed to left
).join(
    payments_alias, on="sale_id", how="left"  # Changed to left
).join(
    date_alias, F.col("s.sale_date") == F.col("dt.date"), "left"  # Changed to left
).join(
    payment_dim_alias, F.col("p.method") == F.col("pm.method"), "left"  # Changed to left
)





In [35]:
result.count()

188

In [36]:
ntija = result.select(
        F.col("d.sale_id"),
        F.col("d.product_id"),
        F.col("s.store_id"),
        F.col("dt.date_id"),
        F.col("pm.method_id"),
        F.col("d.quantity"),
        F.col("d.unit_price"),
        (F.col("d.unit_price") * F.col("d.quantity")).alias("line_amount"),
        F.col("d.VAT"),
        F.col("s.sale_date").alias("date")
    )

In [37]:
ntija.count()

188

In [7]:
fact_sales = create_fact_sales(sales_df, sales_details_df, payments_df, dim_date_new, dim_payment_method_new)

In [8]:
fact_sales.count()

188

In [9]:
dim_product_new.count()

91

In [10]:
dim_store_new.count()

20

In [11]:
dim_date_new.count()

1

In [12]:
dim_payment_method_new.count()

0