# 1. Upload Data

In [0]:
# Echo the session object that the read cells below use via `spark.read`.
# NOTE(review): `spark` is not defined anywhere in the visible notebook, so it is
# presumably injected by the notebook runtime — confirm before running elsewhere.
spark


In [0]:
# Location of the Excel workbook; the sheet-listing cell and both sheet-reading
# cells below pass this value to `.load(path)`.
path = "/Volumes/workspace/default/bops/BOPS data.xlsx"


In [0]:
# List the sheets contained in the workbook (the "listSheets" operation of the
# excel reader) so the sheet names used in the read cells can be checked by eye.
sheets = spark.read.format("excel").options(operation="listSheets").load(path)

display(sheets)


In [0]:
# Read the "B&M Sales" sheet of the workbook into `bm`.
bm_read_options = {
    "headerRows": 1,              # first row = column names
    "dataAddress": "B&M Sales",   # sheet name only, i.e. the whole sheet
    "inferSchema": True,          # infer column types
}
bm = spark.read.format("excel").options(**bm_read_options).load(path)

# Preview only the first five rows.
display(bm.limit(5))

In [0]:

# Read the "Online Sales" sheet of the workbook into `online`.
online_read_options = {
    "headerRows": 1,                 # first row = column names
    "dataAddress": "Online Sales",   # sheet name only, i.e. the whole sheet
    "inferSchema": True,             # infer column types
}
online = spark.read.format("excel").options(**online_read_options).load(path)

# Preview only the first five rows.
display(online.limit(5))


In [0]:
from pyspark.sql import functions as F

# Target type for each column that is cast explicitly:
#   sales      -> double (easier for stats/plots)
#   after, usa -> int    (binary flags made explicitly int, per the original note)
bm_cast_types = {"sales": "double", "after": "int", "usa": "int"}

# Always start from the raw `bm` frame so re-running this cell gives the same result.
bm_clean = (
    bm.withColumnRenamed("id (store)", "id_store")   # drop the space/parentheses from the name
      .withColumn("date", F.to_date(F.col("date")))  # keep date-only
)

# Apply the casts one column at a time, in the order listed above.
for bm_column, bm_type in bm_cast_types.items():
    bm_clean = bm_clean.withColumn(bm_column, F.col(bm_column).cast(bm_type))


In [0]:
# Duplicate check: print the total row count next to the number of distinct
# (id_store, date) pairs. If the two numbers differ, some pair occurs more than once.
bm_total_rows = bm_clean.count()
print("rows:", bm_total_rows)

bm_distinct_keys = bm_clean.select("id_store", "date").distinct()
print("unique (id_store, date):", bm_distinct_keys.count())


In [0]:
# Target type for each column that is cast explicitly:
#   sales        -> double (easier for stats/plots)
#   after, close -> int
online_cast_types = {"sales": "double", "after": "int", "close": "int"}

# Always start from the raw `online` frame so re-running this cell gives the same result.
# `F` is the pyspark.sql.functions alias imported in the B&M cleaning cell above.
online_clean = (
    online.withColumnRenamed("id (DMA)", "id_dma")      # drop the space/parentheses from the name
          .withColumn("date", F.to_date(F.col("date")))  # keep date-only
)

# Apply the casts one column at a time, in the order listed above.
for online_column, online_type in online_cast_types.items():
    online_clean = online_clean.withColumn(online_column, F.col(online_column).cast(online_type))


In [0]:
# Duplicate check: print the total row count next to the number of distinct
# (id_dma, date) pairs. If the two numbers differ, some pair occurs more than once.
online_total_rows = online_clean.count()
print("rows:", online_total_rows)

online_distinct_keys = online_clean.select("id_dma", "date").distinct()
print("unique (id_dma, date):", online_distinct_keys.count())


In [0]:
# Persist both cleaned frames as Delta tables. Mode "overwrite" replaces the
# contents of a table that already exists under the same name.
bm_clean.write.saveAsTable("bm_sales_clean", format="delta", mode="overwrite")
online_clean.write.saveAsTable("online_sales_clean", format="delta", mode="overwrite")