## Cleaning the Weekly Rental Data

### Create Spark Session

In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession, functions as F

# Create Spark session
spark = (
    SparkSession.builder.appName('Weekly Rental Listings Preprocessing')
    .config("spark.sql.repl.eagerEval.enabled", True)  # display full dataframe in console
    .config("spark.sql.parquet.cacheMetadata", "true") # cache parquet metadata
    .config("spark.sql.session.timeZone", "Etc/UTC") # set timezone to UTC
    # memory configurations - hopefully will reduce crashing
    .config("spark.driver.memory", "4g") # set driver memory
    .config("spark.executor.memory", "4g") # set executor memory
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/22 00:54:00 WARN Utils: Your hostname, WhiteChonk, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/22 00:54:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/22 00:54:00 WARN Utils: Your hostname, WhiteChonk, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/22 00:54:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLe

In [2]:
# set download path
data_path = "../data/raw/domain/rental_listings_*.csv"

# download data from raw/domain
sdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
    .withColumn("source_file", F.input_file_name()) # add source file column
)

25/09/22 00:54:03 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: ../data/raw/domain/rental_listings_*.csv.
java.io.FileNotFoundException: File ../data/raw/domain/rental_listings_*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.apache

In [3]:
# extract year and quarter from filename using regex
sdf = (
    sdf.withColumn("year", F.regexp_extract(F.col("source_file"), r"_(\d{4})_", 1).cast("int"))
    .withColumn("quarter", F.regexp_extract(F.col("source_file"), r"_(\d{2}).csv", 1).cast("int"))
    .drop("source_file")
)

In [4]:
sdf.cache

25/09/22 00:54:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


<bound method DataFrame.cache of +-----------+------------+------------+------------+--------------------+-----------------+----------------+------------------+---------+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+----+-------+
|age_0_to_19|age_20_to_39|age_40_to_59| age_60_plus|        

In [5]:
sdf.printSchema()

root
 |-- age_0_to_19: string (nullable = true)
 |-- age_20_to_39: string (nullable = true)
 |-- age_40_to_59: string (nullable = true)
 |-- age_60_plus: string (nullable = true)
 |-- agency_name: string (nullable = true)
 |-- agent_name: string (nullable = true)
 |-- appointment_only: string (nullable = true)
 |-- avg_days_on_market: string (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- car_spaces: string (nullable = true)
 |-- description: string (nullable = true)
 |-- family_percentage: string (nullable = true)
 |-- features_list: string (nullable = true)
 |-- first_listed_date: string (nullable = true)
 |-- full_address: string (nullable = true)
 |-- image_urls: string (nullable = true)
 |-- inspection_text: string (nullable = true)
 |-- land_area: string (nullable = true)
 |-- last_sold_date: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- listing_status: string (nullable = true)
 |-- listing_tag: strin

In [6]:
sdf.show(1, truncate=100, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------
 age_0_to_19         | 0.142523363                                                                                          
 age_20_to_39        | 0.5                                                                                                  
 age_40_to_59        | 0.226635516                                                                                          
 age_60_plus         | 0.130841121                                                                                          
 agency_name         | RT Edgar - Northside                                                                                 
 agent_name          | Lily Passarelli                                                                                      
 appointment_only    | False                                                                                                


### Schemas

We need to first fix the schemas, as spark inferred that most are string types. Let's fix this by casting the features to the correct datatypes

In [7]:
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, DoubleType, 
    DateType, BooleanType, TimestampType
)

# define the expected schema
defined_schema = StructType([
    StructField("age_0_to_19", DoubleType(), True),
    StructField("age_20_to_39", DoubleType(), True),
    StructField("age_40_to_59", DoubleType(), True),
    StructField("age_60_plus", DoubleType(), True),
    StructField("agency_name", StringType(), True),
    StructField("agent_name", StringType(), True),
    StructField("appointment_only", BooleanType(), True),
    StructField("avg_days_on_market", DoubleType(), True),
    StructField("bathrooms", IntegerType(), True),
    StructField("bedrooms", IntegerType(), True),
    StructField("car_spaces", IntegerType(), True),
    StructField("description", StringType(), True),
    StructField("family_percentage", DoubleType(), True),
    StructField("features_list", StringType(), True),
    StructField("first_listed_date", TimestampType(), True),
    StructField("full_address", StringType(), True),
    StructField("image_urls", StringType(), True),
    StructField("inspection_text", StringType(), True),
    StructField("land_area", DoubleType(), True),
    StructField("last_sold_date", DateType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("listing_status", StringType(), True),
    StructField("listing_tag", StringType(), True),
    StructField("listing_url", StringType(), True),
    StructField("long_term_resident", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("median_rent_price", IntegerType(), True),
    StructField("median_sold_price", IntegerType(), True),
    StructField("number_of_photos", IntegerType(), True),
    StructField("number_sold", IntegerType(), True),
    StructField("owner_percentage", DoubleType(), True),
    StructField("postcode", IntegerType(), True),
    StructField("property_features", StringType(), True),
    StructField("property_id", IntegerType(), True),
    StructField("property_type", StringType(), True),
    StructField("rental_price", StringType(), True),
    StructField("renter_percentage", DoubleType(), True),
    StructField("schools", StringType(), True),
    StructField("single_percentage", DoubleType(), True),
    StructField("state_abbreviation", StringType(), True),
    StructField("street", StringType(), True),
    StructField("street_number", StringType(), True),
    StructField("structured_features", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("unit_number", StringType(), True),
    StructField("updated_date", DateType(), True),
    StructField("url", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("quarter", IntegerType(), True)
])

In [8]:
# Cast columns to the defined schema where possible
# if casting fails, the value will be set to null
casts = []
for field in defined_schema.fields:
    if isinstance(field.dataType, (StringType, IntegerType, DoubleType, DateType, BooleanType, TimestampType)):
        casts.append(
            F.expr(
                f"try_cast(`{field.name}` AS {field.dataType.simpleString()})"
            ).alias(field.name)
        )
    else:
        casts.append(F.col(field.name))

In [9]:
sdf = sdf.select(*casts)
required_cols = [field.name for field in defined_schema.fields if not field.nullable]
sdf = sdf.dropna(subset=required_cols)

In [10]:
sdf.count()

31289

In [11]:
sdf.printSchema()

root
 |-- age_0_to_19: double (nullable = true)
 |-- age_20_to_39: double (nullable = true)
 |-- age_40_to_59: double (nullable = true)
 |-- age_60_plus: double (nullable = true)
 |-- agency_name: string (nullable = true)
 |-- agent_name: string (nullable = true)
 |-- appointment_only: boolean (nullable = true)
 |-- avg_days_on_market: double (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- car_spaces: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- family_percentage: double (nullable = true)
 |-- features_list: string (nullable = true)
 |-- first_listed_date: timestamp (nullable = true)
 |-- full_address: string (nullable = true)
 |-- image_urls: string (nullable = true)
 |-- inspection_text: string (nullable = true)
 |-- land_area: double (nullable = true)
 |-- last_sold_date: date (nullable = true)
 |-- latitude: double (nullable = true)
 |-- listing_status: string (nullable = true)
 |-- listing_tag: 

In [12]:
sdf.show(1, truncate=100, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------
 age_0_to_19         | 0.142523363                                                                                          
 age_20_to_39        | 0.5                                                                                                  
 age_40_to_59        | 0.226635516                                                                                          
 age_60_plus         | 0.130841121                                                                                          
 agency_name         | RT Edgar - Northside                                                                                 
 agent_name          | Lily Passarelli                                                                                      
 appointment_only    | false                                                                                                


### Weekly Rent

We now want to make sure we have a 'weekly rent' feature, which takes in the string in 'rental_price' (which is either weekly or monthly), and converts it into a rental price per week of data type 'double'

In [13]:
# see how many different types of values are in 'rental_price'
sdf = (
    sdf
      # 1. Extract numeric part and cast to double
      .withColumn(
          "price_value",
          F.regexp_replace("rental_price", "[^0-9.]", "")
           .cast("double")
      )
      # 2. Compute weekly rent: divide by 4 for monthly prices only
      .withColumn(
          "weekly_rent",
          F.when(
            F.col("rental_price").rlike("(?i)per month"),
            F.col("price_value") / F.lit(4)
          )
          .otherwise(F.col("price_value"))
      )
      # 3. (Optional) Drop the helper column
      .drop("price_value")
)

We also need to make sure that 'rental_price' must not be null, as the only other way of calculating the price is by imputation

In [14]:
# create an sdf of exceptions
# in this case: records where 'rental_price' is null
sdf_exceptions = sdf.filter(F.col("rental_price").isNull())

In [15]:
sdf_exceptions.count()

25/09/22 00:54:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2025_06.csv
25/09/22 00:54:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2024_12.csv
25/09/22 00:54:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2023_09.csv
25/09/22 00:54:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file:

17338

In [21]:
sdf_exceptions.show(5, truncate=100, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------
 age_0_to_19         | 0.06559406                                                                                           
 age_20_to_39        | 0.6311881                                                                                            
 age_40_to_59        | 0.225247532                                                                                          
 age_60_plus         | 0.0779702961                                                                                         
 agency_name         | Dingle Partners                                                                                      
 agent_name          | Shane Dangen                                                                                         
 appointment_only    | true                                                                                                 


In [16]:
sdf = sdf.filter(F.col("rental_price").isNotNull())

In [19]:
sdf.count()

25/09/22 00:54:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2023_03.csv
25/09/22 00:54:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2024_03.csv
25/09/22 00:54:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file: file:///home/zhanmusi/project-2-group-real-estate-industry-project-23-2025/data/raw/domain/rental_listings_2025_06.csv
25/09/22 00:54:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: rental_price
Expected: rental_price but found: 
CSV file:

13951

In [17]:
sdf.show(1, vertical=True, truncate=100)

-RECORD 0-------------------------------------------------------------------------------------------------------------------
 age_0_to_19         | 0.142523363                                                                                          
 age_20_to_39        | 0.5                                                                                                  
 age_40_to_59        | 0.226635516                                                                                          
 age_60_plus         | 0.130841121                                                                                          
 agency_name         | RT Edgar - Northside                                                                                 
 agent_name          | Lily Passarelli                                                                                      
 appointment_only    | false                                                                                                


In [20]:
sdf.show(4)

25/09/22 00:54:25 ERROR Executor: Exception in task 0.0 in stage 16.0 (TID 53)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 8 in cell [13]

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidInputInCastToNumberError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.ha

NumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "DOUBLE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 8 in cell [13]
