In [0]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

In [0]:
# One-time bootstrap of the silver table. IF NOT EXISTS makes this a no-op on
# every run after the first, so the cell is safe to re-execute. Note that an
# already existing table keeps whatever schema it had; this statement does not
# alter it.
create_manual_transactions_ddl = """
CREATE TABLE IF NOT EXISTS budget_cata.silver.manual_transactions (
  Date date,
  Amount double,
  Description string,
  Type string,
  cleaned_desc string,
  category string,
  location string,
  note string,
  bank string,
  payment string,
  transaction_id string,
  updated_at timestamp
)
USING DELTA
"""
spark.sql(create_manual_transactions_ddl)

DataFrame[]

In [0]:
import os
import sys

# Make the project's `utils` package importable. The path is built relative to
# the current working directory (three levels up) -- assumes the notebook is
# run from its own folder; TODO confirm for scheduled/job runs.
project_path = os.path.join(os.getcwd(), "..", "..", "..")

# Guard the append so re-running this cell does not keep adding the same entry
# to sys.path.
if project_path not in sys.path:
    sys.path.append(project_path)

from utils.manual_inserts import *

In [0]:
# Fully qualified name of the Delta table this notebook maintains, plus a
# DataFrame over its current contents. Later cells read from df_tgt and write
# back to tgt.
tgt = "budget_cata.silver.manual_transactions"
df_tgt = spark.read.table(tgt)

In [0]:
# Schema evolution: a target table that predates the metadata columns gets
# them added here. Each column is checked and added independently, in the
# order listed.
required_metadata_cols = {
    "transaction_id": "STRING",
    "updated_at": "TIMESTAMP",
}

# Column names are compared exactly as the table reports them.
existing_cols = set(spark.table(tgt).columns)

for col_name, col_type in required_metadata_cols.items():
    if col_name not in existing_cols:
        spark.sql(f"ALTER TABLE {tgt} ADD COLUMNS ({col_name} {col_type})")

In [0]:
# Columns that describe the transaction itself. The metadata columns
# (transaction_id, updated_at) are left out of this projection and are
# (re)computed further down.
business_cols = [
    "Date",
    "Type",
    "category",
    "note",
    "Amount",
    "cleaned_desc",
    "Description",
    "location",
    "bank",
    "payment",
]

df_base = df_tgt.select(business_cols)

# manual_inserts is provided by utils.manual_inserts (star import above).
# According to the original author's note it returns the existing rows plus
# the new manual inserts, without duplicates -- that contract is not visible
# from this notebook, so verify against the helper before relying on it.
df_union = manual_inserts(df_base)

In [0]:
# Derive a deterministic transaction_id for every row: the SHA-256 hex digest
# of bank, Date, Amount and Description joined with "||". The same inputs
# always produce the same id, which is what the later anti-join relies on.
#
# NOTE(review): only bank and Description are coalesced to "". concat_ws skips
# NULL inputs, so a NULL Date or Amount shortens the key instead of leaving an
# empty slot. Changing that would change ids already stored in the table, so
# it is left as-is here -- confirm Date/Amount are never NULL.
id_key_parts = [
    coalesce(col("bank"), lit("")),
    col("Date").cast("string"),
    col("Amount").cast("string"),
    coalesce(col("Description"), lit("")),
]
id_digest = sha2(concat_ws("||", *id_key_parts), 256)

df_union = df_union.withColumn("transaction_id", id_digest)

In [0]:
# Only keep the rows that are NEW vs target (based on transaction_id).
#
# The target is re-read here instead of reusing df_tgt: df_tgt was created
# before the schema-evolution cell, so on a run where ALTER TABLE actually
# added transaction_id that earlier DataFrame does not expose the column and
# selecting it would fail.
tgt_current = spark.table(tgt)

# Rows written before transaction_id existed hold NULL in that column. Give
# them the same hash the "Compute transaction_id" cell assigns (this expression
# must stay identical to that cell) so they count as already present. Without
# this, a df_union that also carries the existing rows would have every such
# legacy row appended a second time.
fallback_id = sha2(concat_ws("||",
    coalesce(col("bank"), lit("")),
    col("Date").cast("string"),
    col("Amount").cast("string"),
    coalesce(col("Description"), lit(""))
), 256)

df_existing_ids = (
    tgt_current
    .select(coalesce(col("transaction_id"), fallback_id).alias("transaction_id"))
    .distinct()
)

# left_anti keeps only df_union rows whose id is absent from the target;
# updated_at records when this run staged them.
df_new_only = (
    df_union
    .join(df_existing_ids, on="transaction_id", how="left_anti")
    .withColumn("updated_at", current_timestamp())
)

In [0]:
# Append-only load: rows already in the target are never rewritten, so a plain
# append is enough as long as manual rows are never corrected after the fact
# (corrections would need a MERGE instead).
# Re-running this cell alone, without recomputing df_new_only first, would
# append the same rows again.
append_writer = df_new_only.write.format("delta").mode("append")
append_writer.saveAsTable(tgt)