In [5]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("CSV Import Example")
    .getOrCreate()
)

# Both inputs have a header row; column types are inferred from the data.
# NOTE(review): inferSchema presumably types all-digit columns (e.g. POSTCODE)
# as integers, which would drop any leading zeros — confirm against the raw file.

# Locality / postcode -> SA2 (2021) coding index.
postcodes_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/income/2024 Locality to 2021 SA2 Coding Index.csv")
)

# Income table keyed by SA2.
income_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/income/sa2_income.csv")
)

In [6]:
from pyspark.sql import functions as F

# Name of the SA2 code column used as the join key on both sides.
sa2_key = "SA2_CODE_2021"

# Income side: shorten the two long SA2 headers to the names used by the
# locality index, then cast the key to string so both sides compare as text.
income_clean = (
    income_df
    .withColumnRenamed("Statistical Areas Level 2 2021 code", sa2_key)
    .withColumnRenamed("Statistical Areas Level 2 2021 name", "SA2_NAME_2021")
    .withColumn(sa2_key, F.col(sa2_key).cast("string"))
)

# Locality side: same string cast on the join key.
postcodes_df = postcodes_df.withColumn(sa2_key, postcodes_df[sa2_key].cast("string"))

# Left join keeps every locality row, including those with no income match.
# Both inputs carry SA2_NAME_2021, so the income table's copy is dropped by
# column reference to leave a single, unambiguous name column.
merged_df = (
    postcodes_df
    .join(income_clean, on=sa2_key, how="left")
    .drop(income_clean["SA2_NAME_2021"])
)

# Columns kept in the curated output. The income header contains a dot
# ("excl."), so it stays wrapped in backticks to be read as one column name.
output_columns = [
    "POSTCODE",
    "LOCALITY_NAME",
    "STATE",
    "SA2_CODE_2021",
    "SA2_NAME_2021",
    "`Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)`",
]
result_df = merged_df.selectExpr(*output_columns)

# Preview the first 100 rows without truncating wide cells.
result_df.show(n=100, truncate=False)

# Export with a header row, replacing any previous export at this path.
(
    result_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("../data/curated/merged_postcode_income.csv")
)


+--------+------------------+-----+-------------+--------------------+-----------------------------------------------------------------------------------------------------+
|POSTCODE|LOCALITY_NAME     |STATE|SA2_CODE_2021|SA2_NAME_2021       |Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)|
+--------+------------------+-----+-------------+--------------------+-----------------------------------------------------------------------------------------------------+
|2902    |RIVERLEA          |ACT  |801111140    |ACT - South West    |68987                                                                                                |
|2913    |KINLYSIDE         |ACT  |801041117    |Gungahlin - West    |NULL                                                                                                 |
|2618    |PARKWOOD          |ACT  |801011144    |West Belconnen      |NULL                                                             