In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every later cell, or reuse the one that
# already exists in this process.
spark = (
    SparkSession.builder
    .appName("Housing price prediction pipeline")
    .getOrCreate()
)

In [2]:
# Read the housing CSV: the first row supplies the column names and Spark
# infers each column's type from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/kc_house_data.csv")
)

# Print the inferred column names and types.
data.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)



In [60]:

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DoubleType

# Remove the columns that are not used as model inputs further down.
dropped_columns = ['id', 'date', 'sqft_living15', 'sqft_lot15', 'zipcode', 'lat', 'long', 'yr_renovated']
filtered_data = data.drop(*dropped_columns)

# One output row: for each remaining column, the number of NULL entries.
null_counts = [
    F.count(F.when(F.col(name).isNull(), 1)).alias(name)
    for name in filtered_data.columns
]
filtered_data.select(null_counts).show()

# One output row: for each remaining column, the number of entries equal to 0.
# Only the bedrooms/bathrooms zeros are removed by the later filter; zeros in
# the other columns stay in the data.
zero_counts = [
    F.count(F.when(F.col(name) == 0, 1)).alias(name)
    for name in filtered_data.columns
]
filtered_data.select(zero_counts).show()

+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+
|price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|
+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+
|    0|       0|        0|          0|       0|     0|         0|   0|        0|    0|         0|            0|       0|
+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+

+-----+--------+---------+-----------+--------+------+----------+-----+---------+-----+----------+-------------+--------+
|price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront| view|condition|grade|sqft_above|sqft_basement|yr_built|
+-----+--------+---------+-----------+--------+------+----------+-----+---------+-----+----------+-------------+--------+
|    0|      13|       10|  

In [61]:
# Keep a row only when both its bedroom count and its bathroom count are non-zero.
has_bedrooms = F.col("bedrooms") != 0
has_bathrooms = F.col("bathrooms") != 0
data_cleaned = filtered_data.where(has_bedrooms & has_bathrooms)

In [62]:
# Seeded random split with weights 0.8 / 0.2 for training and testing.
train_data, test_data = data_cleaned.randomSplit(weights=[0.8, 0.2], seed=24)

# Report the number of rows that landed in each part.
for label, subset in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(label, subset.count())

Train size:  17288
Test size:  4309


In [63]:
# List the distinct `grade` values present in the training split.
(
    train_data
    .select('grade')
    .distinct()
    .show()
)

+-----+
|grade|
+-----+
|   12|
|   13|
|    6|
|    5|
|    9|
|    4|
|    8|
|    7|
|   10|
|   11|
+-----+



In [81]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

# Integer-coded columns that are one-hot encoded; each one produces a
# "<name>_vec" vector column.
categorical_cols = ['waterfront', 'view', 'condition', 'grade']
encoders = OneHotEncoder(
    inputCols=categorical_cols,
    outputCols=[f"{name}_vec" for name in categorical_cols],
    # NOTE(review): with dropLast=False no category slot is dropped, so each
    # encoded group is collinear with the regression intercept (assuming the
    # default fitIntercept) -- confirm this is intended.
    dropLast=False,
)

# Model inputs, in the order they are concatenated into the feature vector:
# raw numeric columns plus the encoded vectors.
feature_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront_vec', 'view_vec', 'condition_vec', 'grade_vec',
    'sqft_above', 'sqft_basement', 'yr_built',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Linear regression that predicts `price` from the assembled vector.
lr = LinearRegression(labelCol='price', featuresCol='features')

# Encode -> assemble -> regress, fitted on the training split only.
pipeline = Pipeline(stages=[encoders, assembler, lr])
pipeline_model = pipeline.fit(train_data)

In [82]:
# Run the held-out rows through the fitted pipeline; the evaluation cell reads
# the resulting `prediction` column.
test_predictions = pipeline_model.transform(dataset=test_data)

In [83]:
from pyspark.ml.evaluation import RegressionEvaluator

# Mean absolute error between the true `price` and the model's `prediction`
# on the test split.
evaluator_mae = RegressionEvaluator(
    metricName='mae',
    labelCol='price',
    predictionCol='prediction',
)
mae = evaluator_mae.evaluate(test_predictions)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 132483.95305184484
