In [1]:

import os

# Environment variables for the local Java and Spark installations; they are
# assigned before the pyspark imports below, matching the original cell order.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-17-openjdk-amd64",
        "SPARK_HOME": "/home/koushik/spark",
    }
)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) a local session that uses every available core and a
# 4 GB driver heap.
spark = (
    SparkSession.builder
    .appName("7006SCN_Feature_Engineering")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Only warnings and errors are logged from here on.
spark.sparkContext.setLogLevel("WARN")

print("Spark Version:", spark.version)

26/02/26 08:09:27 WARN Utils: Your hostname, KoushikPC resolves to a loopback address: 127.0.1.2; using 10.255.255.254 instead (on interface lo)
26/02/26 08:09:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/26 08:09:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/26 08:09:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Version: 3.5.1


In [2]:
# Load the price-paid dataset from Parquet and report its size.
df = spark.read.parquet("/home/koushik/pp_parquet")

row_total = df.count()           # triggers a full scan of the dataset
column_total = len(df.columns)   # schema-only, no Spark job

print("Rows:", row_total)
print("Columns:", column_total)
df.show(n=3)

Rows: 30856185
Columns: 9
+-----+-------------------+--------+---------+-------+--------+-------------+---------+---------------+
|price|               date|postcode|prop_type|old_new|duration|         town| district|         county|
+-----+-------------------+--------+---------+-------+--------+-------------+---------+---------------+
|75000|1999-11-19 00:00:00| DY5 4PZ|        D|      N|       F|BRIERLEY HILL|   DUDLEY|  WEST MIDLANDS|
|49995|1999-10-28 00:00:00| DN6 7UP|        T|      Y|       F|    DONCASTER|DONCASTER|SOUTH YORKSHIRE|
|79995|1999-06-11 00:00:00| IG1 1YF|        T|      N|       F|       ILFORD|REDBRIDGE| GREATER LONDON|
+-----+-------------------+--------+---------+-------+--------+-------------+---------+---------------+
only showing top 3 rows



In [3]:
# Column names and types first, then summary statistics
# (count / mean / stddev / min / max).
df.printSchema()

summary_stats = df.describe()
summary_stats.show()

root
 |-- price: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- postcode: string (nullable = true)
 |-- prop_type: string (nullable = true)
 |-- old_new: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- town: string (nullable = true)
 |-- district: string (nullable = true)
 |-- county: string (nullable = true)



26/02/26 08:10:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------+--------+---------+--------+--------+--------------+---------+--------+
|summary|            price|postcode|prop_type| old_new|duration|          town| district|  county|
+-------+-----------------+--------+---------+--------+--------+--------------+---------+--------+
|  count|         30856185|30856185| 30856185|30856185|30856185|      30856185| 30856185|30856185|
|   mean|233159.2306302286|    NULL|     NULL|    NULL|    NULL|          NULL|     NULL|    NULL|
| stddev|955949.1723705888|    NULL|     NULL|    NULL|    NULL|          NULL|     NULL|    NULL|
|    min|                1| AL1 1AJ|        D|       N|       F|ABBOTS LANGLEY|ABERCONWY|    AVON|
|    max|        900000000|YO91 1RT|        T|       Y|       U| YSTRAD MEURIG|     YORK|    YORK|
+-------+-----------------+--------+---------+--------+--------+--------------+---------+--------+



                                                                                

In [4]:
from pyspark.sql.functions import year

# Derive the calendar year of each sale from its timestamp.
sale_year = year("date")
df = df.withColumn("year", sale_year)

# Spot-check the derived column next to its source.
df.select(["date", "year"]).show(n=5)

+-------------------+----+
|               date|year|
+-------------------+----+
|1999-11-19 00:00:00|1999|
|1999-10-28 00:00:00|1999|
|1999-06-11 00:00:00|1999|
|1999-03-01 00:00:00|1999|
|1999-12-09 00:00:00|1999|
+-------------------+----+
only showing top 5 rows



In [5]:
# Remove the raw `date` timestamp column; the `year` column derived from it
# in the previous cell is kept.
df = df.drop("date")

In [6]:
# Print the number of distinct values in each categorical column.
# Every iteration runs its own Spark job over the dataset.
columns_to_profile = ("prop_type", "old_new", "duration", "district", "county")

for c in columns_to_profile:
    distinct_total = df.select(c).distinct().count()
    print(c, ":", distinct_total)

                                                                                

prop_type : 5
old_new : 2
duration : 3


                                                                                

district : 467
county : 132


In [7]:
from pyspark.sql.functions import log1p

# Regression target: log(1 + price).
log_price_expr = log1p("price")
df = df.withColumn("log_price", log_price_expr)

In [8]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Categorical inputs and the intermediate column names derived from them.
categorical_cols = ["prop_type", "old_new", "duration", "district", "county"]
index_cols = [c + "_index" for c in categorical_cols]
vector_cols = [c + "_vec" for c in categorical_cols]

# One multi-column StringIndexer (Spark >= 3.0) replaces five single-column
# ones: fitting it gathers the label frequencies for all columns together
# instead of scanning the full dataset once per column. The output columns
# and their contents are unchanged.
# handleInvalid="keep" places unseen/NULL values in an extra bucket instead
# of raising at transform time.
indexers = [
    StringIndexer(
        inputCols=categorical_cols,
        outputCols=index_cols,
        handleInvalid="keep",
    )
]

# Likewise a single multi-column OneHotEncoder producing the same `*_vec`
# columns as the per-column encoders did.
encoders = [
    OneHotEncoder(inputCols=index_cols, outputCols=vector_cols)
]

# Concatenate the one-hot vectors with the numeric `year` column into the
# final sparse feature vector.
assembler = VectorAssembler(
    inputCols=vector_cols + ["year"],
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + encoders + [assembler])

# Keep the fitted model in a variable so the exact same encoding can be
# re-applied (or persisted) later instead of being discarded after one use.
pipeline_model = pipeline.fit(df)
df_features = pipeline_model.transform(df)

df_features.select("features", "log_price").show(3, truncate=False)

                                                                                

+-----------------------------------------------------+------------------+
|features                                             |log_price         |
+-----------------------------------------------------+------------------+
|(610,[2,5,7,40,480,609],[1.0,1.0,1.0,1.0,1.0,1999.0])|11.225256725762893|
|(610,[0,6,7,33,488,609],[1.0,1.0,1.0,1.0,1.0,1999.0])|10.819698281210112|
|(610,[0,5,7,62,477,609],[1.0,1.0,1.0,1.0,1.0,1999.0])|11.289731912405976|
+-----------------------------------------------------+------------------+
only showing top 3 rows



In [9]:
# Keep only what a model needs: the assembled feature vector and the target.
model_columns = ["features", "log_price"]
df_model = df_features.select(*model_columns)

final_row_count = df_model.count()
print("Final Dataset Rows:", final_row_count)
df_model.show(n=3)

Final Dataset Rows: 30856185
+--------------------+------------------+
|            features|         log_price|
+--------------------+------------------+
|(610,[2,5,7,40,48...|11.225256725762893|
|(610,[0,6,7,33,48...|10.819698281210112|
|(610,[0,5,7,62,47...|11.289731912405976|
+--------------------+------------------+
only showing top 3 rows



In [10]:
# Persist the model-ready dataset as Parquet, replacing any previous output
# at the same path.
features_writer = df_model.write.mode("overwrite")
features_writer.parquet("/home/koushik/pp_features")

                                                                                