In [23]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Create the Spark session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("CSV Import Example")
    .getOrCreate()
)

# Locality -> SA2 coding index. The first row is treated as the header and
# column types are inferred from the data.
# NOTE(review): inferSchema may type POSTCODE as an integer, which would drop
# leading zeros (e.g. 0800) — confirm against printSchema().
postcodes_df = spark.read.csv(
    "../data/income/2024 Locality to 2021 SA2 Coding Index.csv",
    header=True,
    inferSchema=True,
)

# SA2-level income table, read with the same header / schema-inference settings.
income_df = spark.read.csv(
    "../data/income/sa2_income.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Source column with the 2020 median total personal income per SA2.
# The backticks are part of the reference: the header contains a dot ("excl."),
# which Spark would otherwise interpret as nested-field access.
MEDIAN_INCOME_COL = "`Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)`"

# Cast the join key to string on the postcode side so both sides compare as text.
postcodes_df = postcodes_df.withColumn(
    "SA2_CODE_2021",
    F.col("SA2_CODE_2021").cast("string"),
)

# Income side: shorten the verbose SA2 headers, cast the key to string, and
# keep only rows whose median income is non-zero (rows where the comparison
# evaluates to NULL are dropped by the filter as well).
income_clean = (
    income_df
    .withColumnRenamed("Statistical Areas Level 2 2021 code", "SA2_CODE_2021")
    .withColumnRenamed("Statistical Areas Level 2 2021 name", "SA2_NAME_2021")
    .withColumn("SA2_CODE_2021", F.col("SA2_CODE_2021").cast("string"))
    .filter(F.col(MEDIAN_INCOME_COL) != 0)
)

# Left join keeps every row of the locality index; rows without a matching
# income record carry NULL income. The income table's SA2 name column is
# dropped after the join.
merged_df = (
    postcodes_df
    .join(income_clean, on="SA2_CODE_2021", how="left")
    .drop(income_clean.SA2_NAME_2021)
)

# One row per postcode. Despite the output column name, the value is the
# *mean* of the SA2 median incomes across that postcode's joined rows.
# NOTE(review): if the locality index has several rows for the same
# (POSTCODE, SA2) pair, this mean is weighted by locality count — confirm
# that is intended.
postcode_key = F.col("POSTCODE").alias("postcode")
mean_income = F.avg(F.col(MEDIAN_INCOME_COL)).alias("median_total_income_2020")
result_df = merged_df.groupBy(postcode_key).agg(mean_income)

# Inspect the first 100 rows and the resulting schema.
result_df.show(100, truncate=False)
result_df.printSchema()

# Persist the per-postcode result as CSV with a header row, replacing any
# previous output at this path.
result_df.write.mode("overwrite").csv(
    "../data/curated/merged_postcode_income.csv",
    header=True,
)


+--------+-------------+------------------------+
|postcode|SA2_CODE_2021|median_total_income_2020|
+--------+-------------+------------------------+
|2347    |110041205    |45533.0                 |
|2338    |110041201    |40419.0                 |
|2100    |122031429    |67756.0                 |
|4053    |302021033    |56271.0                 |
|4300    |310041298    |52536.0                 |
|4287    |311011305    |45369.0                 |
|4207    |311021307    |45129.0                 |
|4154    |301031019    |69264.0                 |
|5720    |406021141    |47941.0                 |
|5482    |405031121    |34929.0                 |
|7254    |602031058    |48942.0                 |
|3019    |213031347    |47576.0                 |
|3869    |205041094    |49079.0                 |
|3238    |217031476    |36510.0                 |
|3006    |206041509    |NULL                    |
|3570    |202031032    |48282.0                 |
|3938    |214021384    |41760.0                 |


In [25]:
# Count the joined rows that have no median-income value.
income_is_missing = F.col(
    "`Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)`"
).isNull()
missing_count = merged_df.where(income_is_missing).count()
print(missing_count)

1699


In [26]:
# Load the consumer table; first row is the header, column types are inferred.
consumer_df = spark.read.csv(
    "../data/tables/merchant_data/tbl_consumer.csv",
    header=True,
    inferSchema=True,
)