In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName("CSCI316_Project1_Regression_4_2")
    .getOrCreate()
)

# Record the Spark release alongside the outputs.
print("Spark version:", spark.version)


Spark version: 4.0.2


In [4]:
# Directory on the mounted Drive that holds the cleaned dataset as Parquet files.
PARQUET_DIR = "/content/drive/MyDrive/cleaned_parquet"

import os

# Sanity check before loading: keep only the entries whose name ends in ".parquet".
parquet_files = list(
    filter(lambda entry: entry.endswith(".parquet"), os.listdir(PARQUET_DIR))
)
print("Parquet files found:", len(parquet_files))
print(parquet_files[:20])


Parquet files found: 200
['part-00172-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00194-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00185-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00188-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00196-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00178-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00169-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00182-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00193-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00159-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00199-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00167-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00195-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c000.snappy.parquet', 'part-00161-1fd552f4-5ba5-48c4-ad91-1cb3b7309ccc-c0

In [5]:
# Read the Parquet files under PARQUET_DIR into a single Spark DataFrame.
df = spark.read.parquet(PARQUET_DIR)
print("Loaded parquet")

Loaded parquet


In [6]:
# First look at the raw data: column types, total row count, and a few records.
print("Schema:")
df.printSchema()

# count() is an action over the whole dataset.
total_rows = df.count()
print("\nRow count:", total_rows)

print("\nSample rows:")
df.show(n=5, truncate=False)


Schema:
root
 |-- transaction_id: string (nullable = true)
 |-- procedure_id: integer (nullable = true)
 |-- trans_group_id: integer (nullable = true)
 |-- trans_group_ar: string (nullable = true)
 |-- trans_group_en: string (nullable = true)
 |-- procedure_name_ar: string (nullable = true)
 |-- procedure_name_en: string (nullable = true)
 |-- instance_date: string (nullable = true)
 |-- property_type_id: integer (nullable = true)
 |-- property_type_ar: string (nullable = true)
 |-- property_type_en: string (nullable = true)
 |-- property_sub_type_id: integer (nullable = true)
 |-- property_sub_type_ar: string (nullable = true)
 |-- property_sub_type_en: string (nullable = true)
 |-- property_usage_ar: string (nullable = true)
 |-- property_usage_en: string (nullable = true)
 |-- reg_type_id: integer (nullable = true)
 |-- reg_type_ar: string (nullable = true)
 |-- reg_type_en: string (nullable = true)
 |-- area_id: integer (nullable = true)
 |-- area_name_ar: string (nullable = true)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [7]:
from pyspark.sql import functions as F

# Parse the "dd-MM-yyyy" text in instance_date into a real date column.
parsed_date = F.to_date(F.col("instance_date"), "dd-MM-yyyy")
df2 = df.withColumn("instance_date_dt", parsed_date)

# Show the raw string next to its parsed value to confirm the format is right.
df2.select("instance_date", "instance_date_dt").show(10, truncate=False)

# Derive the calendar features used later as numeric inputs.
# Column name -> Spark function that extracts it from the parsed date.
calendar_parts = {"year": F.year, "month": F.month, "dow": F.dayofweek}
for part_name, extract in calendar_parts.items():
    df2 = df2.withColumn(part_name, extract("instance_date_dt"))

date_preview_cols = ["instance_date", "instance_date_dt", "year", "month", "dow"]
df2.select(date_preview_cols).show(10, truncate=False)


+-------------+----------------+
|instance_date|instance_date_dt|
+-------------+----------------+
|05-07-2010   |2010-07-05      |
|24-02-2009   |2009-02-24      |
|14-08-2007   |2007-08-14      |
|04-09-2018   |2018-09-04      |
|02-07-2007   |2007-07-02      |
|26-10-2009   |2009-10-26      |
|25-07-2011   |2011-07-25      |
|01-03-2010   |2010-03-01      |
|29-03-2010   |2010-03-29      |
|13-06-2018   |2018-06-13      |
+-------------+----------------+
only showing top 10 rows
+-------------+----------------+----+-----+---+
|instance_date|instance_date_dt|year|month|dow|
+-------------+----------------+----+-----+---+
|05-07-2010   |2010-07-05      |2010|7    |2  |
|24-02-2009   |2009-02-24      |2009|2    |3  |
|14-08-2007   |2007-08-14      |2007|8    |3  |
|04-09-2018   |2018-09-04      |2018|9    |3  |
|02-07-2007   |2007-07-02      |2007|7    |2  |
|26-10-2009   |2009-10-26      |2009|10   |2  |
|25-07-2011   |2011-07-25      |2011|7    |2  |
|01-03-2010   |2010-03-01      |2

In [8]:
from pyspark.sql import functions as F

# Target, numeric inputs and categorical inputs considered for the regression task.
label_col = "meter_sale_price"
numeric_cols = ["procedure_area", "has_parking", "year", "month", "dow"]
cat_cols = [
    "area_name_en", "nearest_metro_en", "nearest_mall_en",
    "nearest_landmark_en", "property_type_en", "rooms_en",
]

needed_cols = [label_col] + numeric_cols + cat_cols

# One aggregate per column: sum of a 0/1 "is null" flag = number of missing values.
missing_exprs = []
for name in needed_cols:
    null_flag = F.col(name).isNull().cast("int")
    missing_exprs.append(F.sum(null_flag).alias(name))

# The aggregation returns a single row; turn it into {column: null_count}.
missing_row2 = df2.select(missing_exprs).collect()[0].asDict()
missing_row2

{'meter_sale_price': 0,
 'procedure_area': 0,
 'has_parking': 0,
 'year': 0,
 'month': 0,
 'dow': 0,
 'area_name_en': 0,
 'nearest_metro_en': 6837,
 'nearest_mall_en': 6872,
 'nearest_landmark_en': 1627,
 'property_type_en': 0,
 'rooms_en': 418}

In [12]:
from pyspark.sql import functions as F

label_col = "meter_sale_price"

numeric_cols = ["procedure_area", "has_parking", "year", "month", "dow"]
cat_cols = [
    "area_name_en", "nearest_metro_en", "nearest_mall_en",
    "nearest_landmark_en", "property_type_en", "rooms_en",
]

feature_cols = numeric_cols + cat_cols
needed_cols = [label_col] + feature_cols

# Keep only the modelling columns, then drop rows whose price or area is not
# strictly positive (rows where either value is null are dropped as well).
price_is_positive = F.col(label_col) > 0
area_is_positive = F.col("procedure_area") > 0
df_reg = df2.select(needed_cols).filter(price_is_positive & area_is_positive)

# Missing categoricals become the literal "unknown"; missing has_parking becomes 0.
fill_map = dict.fromkeys(cat_cols, "unknown")
df_reg = df_reg.fillna(fill_map).fillna({"has_parking": 0})

# Report how many rows the filters removed.
print("Rows before:", df2.count())
print("Rows after :", df_reg.count())

preview_cols = [label_col, "procedure_area", "area_name_en", "nearest_metro_en", "nearest_mall_en"]
df_reg.select(preview_cols).show(5, truncate=False)



Rows before: 30173
Rows after : 30173
+----------------+--------------+------------------+--------------------------+------------------+
|meter_sale_price|procedure_area|area_name_en      |nearest_metro_en          |nearest_mall_en   |
+----------------+--------------+------------------+--------------------------+------------------+
|5711.51         |151.8         |Burj Khalifa      |Business Bay Metro Station|Dubai Mall        |
|2653.16         |76.0          |Al Warsan First   |Rashidiya Metro Station   |City Centre Mirdif|
|3142.93         |356.3         |Al Thanayah Fourth|Nakheel Metro Station     |Marina Mall       |
|3653.13         |159.3         |Al Goze Fourth    |Noor Bank Metro Station   |Dubai Mall        |
|1772.12         |531.29        |Al Thanayah Fourth|Nakheel Metro Station     |Marina Mall       |
+----------------+--------------+------------------+--------------------------+------------------+
only showing top 5 rows


In [13]:
from pyspark.sql import functions as F

# Number of distinct values in each categorical column, as {column: count}.
distinct_exprs = [F.countDistinct(name).alias(name) for name in cat_cols]
distinct_counts = df_reg.agg(*distinct_exprs).collect()[0].asDict()

distinct_counts

{'area_name_en': 67,
 'nearest_metro_en': 38,
 'nearest_mall_en': 6,
 'nearest_landmark_en': 14,
 'property_type_en': 2,
 'rooms_en': 12}

In [14]:
# Summary statistics for the label followed by every feature column.
summary_cols = [label_col] + feature_cols
df_reg.select(summary_cols).describe().show()

+-------+-----------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----------------+------------------+--------------------+----------------+--------+
|summary| meter_sale_price|    procedure_area|        has_parking|              year|             month|               dow|   area_name_en| nearest_metro_en|   nearest_mall_en| nearest_landmark_en|property_type_en|rooms_en|
+-------+-----------------+------------------+-------------------+------------------+------------------+------------------+---------------+-----------------+------------------+--------------------+----------------+--------+
|  count|            30173|             30173|              30173|             30173|             30173|             30173|          30173|            30173|             30173|               30173|           30173|   30173|
|   mean|23615.96894077486|146.69104928247134| 0.7473237662811123|2012.9983428893381|6.6813376197262455|

# Regression

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler


label_col = "meter_sale_price"

numeric_cols = ["procedure_area", "has_parking", "year", "month", "dow"]
cat_cols = [
    "area_name_en", "nearest_metro_en", "nearest_mall_en",
    "nearest_landmark_en", "property_type_en", "rooms_en",
]

# Intermediate column names produced by the indexing and one-hot stages.
idx_names = [f"{c}_idx" for c in cat_cols]
ohe_names = [f"{c}_ohe" for c in cat_cols]

# Stage 1: one StringIndexer per categorical column (string -> numeric index).
# handleInvalid="keep" keeps rows with values not seen during fitting instead of failing.
indexers = [
    StringIndexer(inputCol=src, outputCol=dst, handleInvalid="keep")
    for src, dst in zip(cat_cols, idx_names)
]

# Stage 2: one-hot encode all index columns in a single encoder.
encoder = OneHotEncoder(inputCols=idx_names, outputCols=ohe_names, handleInvalid="keep")

# Stage 3: concatenate numeric columns and one-hot vectors into one feature vector.
assembler = VectorAssembler(inputCols=numeric_cols + ohe_names, outputCol="features_raw")

# Stage 4: standardise every feature (centre on the mean, scale to unit std).
scaler = StandardScaler(
    inputCol="features_raw", outputCol="features", withMean=True, withStd=True
)



In [16]:
# Echo the feature lists that the pipeline stages were built from.
print("Numeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['procedure_area', 'has_parking', 'year', 'month', 'dow']
Categorical columns: ['area_name_en', 'nearest_metro_en', 'nearest_mall_en', 'nearest_landmark_en', 'property_type_en', 'rooms_en']


In [17]:
# 80/20 hold-out split with a fixed seed.
train_df, test_df = df_reg.randomSplit([0.8, 0.2], seed=42)

# Persist both halves. Without this, every later fit/transform re-reads the Parquet
# files and recomputes the split from scratch; caching also pins the exact rows in
# each half for the rest of the notebook. cache() is lazy; the count() calls below
# are the actions that materialise it.
train_df = train_df.cache()
test_df = test_df.cache()

print("Train:", train_df.count(), "Test:", test_df.count())


Train: 24263 Test: 5910


## Model 1 — Linear Regression (Baseline)

In [18]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Baseline: ordinary linear regression on the scaled feature vector.
lr = LinearRegression(featuresCol="features", labelCol=label_col)

# Full pipeline: index -> one-hot -> assemble -> scale -> model.
pipe_lr = Pipeline(stages=[*indexers, encoder, assembler, scaler, lr])
model_lr = pipe_lr.fit(train_df)
pred_lr = model_lr.transform(test_df)

# Evaluators shared by every model cell below; they differ only in the metric.
rmse_eval, mae_eval, r2_eval = (
    RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "mae", "r2")
)

# Score the held-out predictions with each evaluator, in the same order.
rmse, mae, r2 = (
    evaluator.evaluate(pred_lr) for evaluator in (rmse_eval, mae_eval, r2_eval)
)

print(f"Linear Regression -> RMSE={rmse:.2f} | MAE={mae:.2f} | R2={r2:.4f}")

lr_preview_cols = [label_col, "prediction", "procedure_area", "area_name_en", "nearest_metro_en"]
pred_lr.select(lr_preview_cols).show(5, truncate=False)

Linear Regression -> RMSE=312022.94 | MAE=39021.79 | R2=0.0718
+----------------+-------------------+--------------+------------------+---------------------+
|meter_sale_price|prediction         |procedure_area|area_name_en      |nearest_metro_en     |
+----------------+-------------------+--------------+------------------+---------------------+
|269.87          |7426.639403459172  |100.17        |Wadi Al Safa 5    |unknown              |
|504.98          |14747.630819908192 |1221.39       |Al Thanayah Fourth|Nakheel Metro Station|
|542.9           |20913.540456543837 |745.11        |Wadi Al Safa 6    |unknown              |
|705.64          |-418.6678913261203 |510.02        |Al Thanayah Fourth|Nakheel Metro Station|
|826.4           |-12711.041560363552|291.56        |Al Thanayah Fourth|Damac Properties     |
+----------------+-------------------+--------------+------------------+---------------------+
only showing top 5 rows


## Model 2 — Decision Tree Regression

In [19]:
from pyspark.ml.regression import DecisionTreeRegressor

# Single regression tree, depth-limited, with a fixed seed.
dt = DecisionTreeRegressor(featuresCol="features", labelCol=label_col, maxDepth=8, seed=42)

# Same preprocessing stages as the baseline, with the tree as the final stage.
pipe_dt = Pipeline(stages=[*indexers, encoder, assembler, scaler, dt])
model_dt = pipe_dt.fit(train_df)
pred_dt = model_dt.transform(test_df)

# Score the held-out predictions with the shared evaluators.
rmse_dt, mae_dt, r2_dt = (
    evaluator.evaluate(pred_dt) for evaluator in (rmse_eval, mae_eval, r2_eval)
)

print(f"Decision Tree -> RMSE={rmse_dt:.2f} | MAE={mae_dt:.2f} | R2={r2_dt:.4f}")

Decision Tree -> RMSE=43988.91 | MAE=4250.37 | R2=0.9816


### Model validation analysis

In [20]:
# Correlation between the label and each numeric feature, one line per feature.
for col_name in ("procedure_area", "has_parking", "year", "month", "dow"):
    corr = df_reg.corr("meter_sale_price", col_name)
    print(col_name, "correlation:", corr)

procedure_area correlation: -0.029957118001162505
has_parking correlation: 0.03226628907222292
year correlation: -0.0078085660188522196
month correlation: 0.003540467002043718
dow correlation: 0.04151403601482724


In [None]:
# Interpretation of the correlations printed in the previous cell.
print("Linear relationships are weak thats why LR underperformed")

Linear relationships are weak thats why LR underperformed


In [21]:
# Overfitting check: score the fitted tree on its own training data and compare
# against the held-out R2 computed earlier.
pred_train_dt = model_dt.transform(train_df)

print("Train R2:")
print(r2_eval.evaluate(pred_train_dt))

print("Test R2:")
print(r2_dt)

Train R2:
0.9653089127155318
Test R2:
0.9815524317738921


In [None]:
# Interpretation of the train-vs-test R2 comparison printed in the previous cell.
print("Results show that there is no major overfitting and the tree has generalized well")

Results show that there is no major overfitting and the tree has generalized well


## Model 3 — RandomForest

In [22]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest: 80 trees of depth at most 10, with a fixed seed.
rf = RandomForestRegressor(
    featuresCol="features", labelCol=label_col, numTrees=80, maxDepth=10, seed=42
)

# Same preprocessing stages as the other models, with the forest as the final stage.
pipe_rf = Pipeline(stages=[*indexers, encoder, assembler, scaler, rf])
model_rf = pipe_rf.fit(train_df)
pred_rf = model_rf.transform(test_df)

# Score the held-out predictions with the shared evaluators.
rmse_rf, mae_rf, r2_rf = (
    evaluator.evaluate(pred_rf) for evaluator in (rmse_eval, mae_eval, r2_eval)
)

print(f"Random Forest -> RMSE={rmse_rf:.2f} | MAE={mae_rf:.2f} | R2={r2_rf:.4f}")

Random Forest -> RMSE=60903.11 | MAE=5290.98 | R2=0.9646


In [23]:
# Preview actual vs predicted for each model, in order: linear, tree, forest.
for model_preds in (pred_lr, pred_dt, pred_rf):
    model_preds.select(label_col, "prediction").show(5)

+----------------+-------------------+
|meter_sale_price|         prediction|
+----------------+-------------------+
|          269.87|  7426.639403459172|
|          504.98| 14747.630819908192|
|           542.9| 20913.540456543837|
|          705.64| -418.6678913261203|
|           826.4|-12711.041560363552|
+----------------+-------------------+
only showing top 5 rows
+----------------+------------------+
|meter_sale_price|        prediction|
+----------------+------------------+
|          269.87| 7791.114618443687|
|          504.98|1248.0702727272726|
|           542.9| 3343.613353293414|
|          705.64|1248.0702727272726|
|           826.4| 5887.941458167331|
+----------------+------------------+
only showing top 5 rows
+----------------+------------------+
|meter_sale_price|        prediction|
+----------------+------------------+
|          269.87| 7308.485728140845|
|          504.98|1441.1824985522994|
|           542.9|2776.8805277700585|
|          705.64|1492.64507003

## Manual CV - Decision Tree

---



In [24]:
from pyspark.sql.functions import rand, floor

# Number of cross-validation folds.
k = 10

# Assign each training row to a fold in 0..k-1 using a seeded uniform draw.
# The frame is cached because the CV loops filter it twice per fold and each
# pipeline fit scans it several times. Without persistence the random fold column
# is re-evaluated on every scan, so a train slice and its validation slice are not
# guaranteed to come from the same assignment. Caching fixes the assignment once
# and avoids recomputing the upstream plan.
df_cv = train_df.withColumn("fold", floor(rand(seed=42) * k)).cache()

# Fold-size check; this is also the action that materialises the cache.
df_cv.groupBy("fold").count().show()

+----+-----+
|fold|count|
+----+-----+
|   0| 2391|
|   7| 2440|
|   6| 2419|
|   9| 2343|
|   5| 2481|
|   1| 2481|
|   3| 2415|
|   8| 2368|
|   2| 2395|
|   4| 2530|
+----+-----+



In [26]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Same tree configuration as the hold-out experiment.
dt = DecisionTreeRegressor(featuresCol="features", labelCol=label_col, maxDepth=8, seed=42)

# The pipeline definition does not depend on the fold, so build it once.
pipe = Pipeline(stages=[*indexers, encoder, assembler, scaler, dt])

rmse_scores = []

for fold in range(k):
    # Rows of the current fold are held out; all other folds are used for training.
    in_val = df_cv["fold"] == fold
    train_fold = df_cv.filter(~in_val)
    val_fold = df_cv.filter(in_val)

    # Refit the whole pipeline (preprocessing included) on the training folds only.
    model = pipe.fit(train_fold)
    preds = model.transform(val_fold)

    rmse = rmse_eval.evaluate(preds)
    rmse_scores.append(rmse)
    print(f"Fold {fold} RMSE: {rmse:.2f}")

print("Average RMSE:", sum(rmse_scores)/len(rmse_scores))

Fold 0 RMSE: 18633.36
Fold 1 RMSE: 108073.11
Fold 2 RMSE: 82140.80
Fold 3 RMSE: 31562.31
Fold 4 RMSE: 41666.85
Fold 5 RMSE: 144988.07
Fold 6 RMSE: 157469.06
Fold 7 RMSE: 62500.64
Fold 8 RMSE: 67545.96
Fold 9 RMSE: 44690.91
Average RMSE: 75927.10663991304


## Manual CV - Linear Regression

---

In [27]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Same linear-regression configuration as the hold-out experiment.
lr = LinearRegression(featuresCol="features", labelCol=label_col)

# The pipeline definition does not depend on the fold, so build it once.
pipe = Pipeline(stages=[*indexers, encoder, assembler, scaler, lr])

rmse_lr_scores = []

for fold in range(k):
    # Rows of the current fold are held out; all other folds are used for training.
    in_val = df_cv["fold"] == fold
    train_fold = df_cv.filter(~in_val)
    val_fold = df_cv.filter(in_val)

    # Refit the whole pipeline (preprocessing included) on the training folds only.
    model = pipe.fit(train_fold)
    preds = model.transform(val_fold)

    rmse = rmse_eval.evaluate(preds)
    rmse_lr_scores.append(rmse)
    print(f"LR Fold {fold} RMSE: {rmse:.2f}")

print("LR Average RMSE:", sum(rmse_lr_scores)/len(rmse_lr_scores))

LR Fold 0 RMSE: 273007.23
LR Fold 1 RMSE: 342460.01
LR Fold 2 RMSE: 340887.24
LR Fold 3 RMSE: 272975.02
LR Fold 4 RMSE: 247539.88
LR Fold 5 RMSE: 313628.03
LR Fold 6 RMSE: 428906.59
LR Fold 7 RMSE: 217027.85
LR Fold 8 RMSE: 274601.32
LR Fold 9 RMSE: 246549.54
LR Average RMSE: 295758.2704787035


## Manual CV - Random Forest

---

In [28]:
from pyspark.ml.regression import RandomForestRegressor

# Same random-forest configuration as the hold-out experiment.
rf = RandomForestRegressor(
    featuresCol="features", labelCol=label_col, numTrees=80, maxDepth=10, seed=42
)

# The pipeline definition does not depend on the fold, so build it once.
pipe = Pipeline(stages=[*indexers, encoder, assembler, scaler, rf])

rmse_rf_scores = []

for fold in range(k):
    # Rows of the current fold are held out; all other folds are used for training.
    in_val = df_cv["fold"] == fold
    train_fold = df_cv.filter(~in_val)
    val_fold = df_cv.filter(in_val)

    # Refit the whole pipeline (preprocessing included) on the training folds only.
    model = pipe.fit(train_fold)
    preds = model.transform(val_fold)

    rmse = rmse_eval.evaluate(preds)
    rmse_rf_scores.append(rmse)
    print(f"RF Fold {fold} RMSE: {rmse:.2f}")

print("RF Average RMSE:", sum(rmse_rf_scores)/len(rmse_rf_scores))

RF Fold 0 RMSE: 32573.59
RF Fold 1 RMSE: 52809.36
RF Fold 2 RMSE: 89267.25
RF Fold 3 RMSE: 35494.23
RF Fold 4 RMSE: 47731.92
RF Fold 5 RMSE: 128219.94
RF Fold 6 RMSE: 123140.47
RF Fold 7 RMSE: 54043.65
RF Fold 8 RMSE: 75034.83
RF Fold 9 RMSE: 48008.39
RF Average RMSE: 68632.36410951514


## Ensemble Model

---

In [29]:
from pyspark.sql.functions import monotonically_increasing_id

# Tag every row with an id so per-model predictions can be joined back together.
# monotonically_increasing_id() derives ids from the partition layout, so they are
# only a safe join key if every consumer sees the same materialised rows. Cache the
# tagged frames so the ids are computed once instead of once per downstream plan.
train_df = train_df.withColumn("row_id", monotonically_increasing_id()).cache()
test_df  = test_df.withColumn("row_id", monotonically_increasing_id()).cache()

# cache() is lazy: run one action per frame now so the ids are fixed before any
# bagged model reads them.
for tagged_split in (train_df, test_df):
    tagged_split.count()

In [30]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Number of bootstrap models in the bagged ensemble.
B = 10

# Base learner: same tree configuration as the stand-alone decision tree.
dt_base = DecisionTreeRegressor(
    featuresCol="features", labelCol=label_col, maxDepth=8, seed=42
)

# The pipeline definition is identical for every bag, so build it once.
pipe = Pipeline(stages=[*indexers, encoder, assembler, scaler, dt_base])

pred_list = []

for b in range(B):
    # Bootstrap resample of the training set (with replacement), a different seed per bag.
    boot_train = train_df.sample(withReplacement=True, fraction=1.0, seed=100 + b)
    model = pipe.fit(boot_train)

    # Keep only the join key and this bag's prediction under a unique column name.
    bag_prediction = F.col("prediction").alias(f"pred_{b}")
    preds = model.transform(test_df).select("row_id", bag_prediction)

    pred_list.append(preds)
    print(f"Bag model {b+1}/{B} done")

Bag model 1/10 done
Bag model 2/10 done
Bag model 3/10 done
Bag model 4/10 done
Bag model 5/10 done
Bag model 6/10 done
Bag model 7/10 done
Bag model 8/10 done
Bag model 9/10 done
Bag model 10/10 done


In [31]:
# Line up the B prediction columns side by side, matching rows on row_id.
bagged = pred_list[0]
for extra_preds in pred_list[1:B]:
    bagged = bagged.join(extra_preds, on="row_id", how="inner")

# Attach the true label from the test set, again matching on row_id.
true_labels = test_df.select("row_id", label_col)
bagged = bagged.join(true_labels, on="row_id", how="inner")

# Ensemble prediction = arithmetic mean of the B per-model predictions.
pred_cols = [F.col(f"pred_{i}") for i in range(B)]
bagged = bagged.withColumn("prediction", sum(pred_cols) / F.lit(B))

# Score the averaged prediction with the shared evaluators.
rmse_bag, mae_bag, r2_bag = (
    evaluator.evaluate(bagged) for evaluator in (rmse_eval, mae_eval, r2_eval)
)

print(f"Bagging -> RMSE={rmse_bag:.2f} | MAE={mae_bag:.2f} | R2={r2_bag:.4f}")

Bagging -> RMSE=44032.79 | MAE=4191.56 | R2=0.9815


Decision Tree Regression significantly outperformed Linear Regression on every reported metric (RMSE, MAE, and R²) and was therefore selected as the final regression model.