# Prediction

Set `path` in the cell below to the location of your data (a sample path is pre-filled):

In [None]:
# Location of the parquet part-file that is loaded with spark.read.parquet.
# The value is a sample path; replace it with the path to your own copy of the data.
path = (
    "/Users/rossfleming/Documents/GitHub/bigdata_hw3/"
    "exploitation_zone/idealista.parquet/"
    "part-00000-5b53f1a9-dc1d-44b9-96a5-9d8ba51dd7a3-c000.snappy.parquet"
)

In [9]:
import pandas as pd
import glob
import re

from pyspark.sql import SparkSession
from pyspark import  SparkConf
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier



In [17]:
import os

# Set JAVA_HOME (here: an OpenJDK path under /opt/homebrew) before the session is built.
os.environ['JAVA_HOME'] = '/opt/homebrew/opt/openjdk'

appName = "app"
master = "local[*]"  # local mode; "*" lets Spark use every available core

# Create the session only when this kernel does not already hold a `spark`
# global, so re-running the cell keeps the existing session.
if 'spark' not in globals():
    conf = SparkConf().setAppName(appName).setMaster(master)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [18]:
# Show the SparkSession object as this cell's output.
spark

In [20]:
# Read the parquet file located at `path` into the Spark DataFrame `data`.
data = spark.read.parquet(path)


In [37]:
# Print each column's name, type and nullability for the loaded DataFrame.
data.printSchema()

root
 |-- bathrooms: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- exterior: boolean (nullable = true)
 |-- floor: integer (nullable = true)
 |-- has360: boolean (nullable = true)
 |-- has3dtour: boolean (nullable = true)
 |-- haslift: boolean (nullable = true)
 |-- hasplan: boolean (nullable = true)
 |-- hasstaging: boolean (nullable = true)
 |-- hasvideo: boolean (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- newdevelopment: boolean (nullable = true)
 |-- numphotos: long (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- showaddress: boolean (nullable = true)
 |-- size: double (nullable = true)
 |-- topnewdevelopment: boolean (nullable = true)
 |-- hasparkingspace: boolean (nullable = true)
 |-- isparkingspaceincludedinprice: boolean (nullable = true)
 |-- year: integer (nullable = true)
 |-- 2013_per_meter: double (nullable = true)
 |-- 2013_used_per_meter: double (nullable = true)
 |-- 2013_

In [59]:
# Open an MLflow run; the log_model / log_param / log_metric calls in the
# following cells are issued while this run is active.
# NOTE(review): no matching mlflow.end_run() is visible in this part of the
# notebook — confirm the run is closed once the experiment is finished.
mlflow.start_run()

<ActiveRun: >

In [75]:
# Randomly split the rows 80/20 into training and validation sets, seeded with 42.
split_weights = [0.8, 0.2]
train_data, validation_data = data.randomSplit(split_weights, seed=42)


In [76]:

# Every column of `data` except the label is used as a model input.
feature_cols = [name for name in data.columns if name != 'price_classification']

# Pack the input columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# This cell rebinds train_data / validation_data to their transformed versions,
# so on a second execution they already carry a "features" column and the
# assembler would fail because its output column exists. Dropping that column
# first (a no-op when it is absent) makes the cell safe to re-run.
train_data = assembler.transform(train_data.drop("features"))
validation_data = assembler.transform(validation_data.drop("features"))


In [84]:


# Set up a logistic-regression estimator that reads the assembled "features"
# vector and the "price_classification" label, then fit it on the training split.
lr = LogisticRegression(
    labelCol='price_classification',
    featuresCol='features',
)
lr_model = lr.fit(train_data)

# Store the fitted model in MLflow and record which model type was trained.
mlflow.spark.log_model(lr_model, "logistic-regression-model")
mlflow.log_param("model_type", "logistic_regression")


'logistic_regression'

In [85]:

# Apply the logistic-regression model to the validation split.
predictions_lr = lr_model.transform(validation_data)
evaluator_lr = MulticlassClassificationEvaluator(labelCol="price_classification", metricName="accuracy")

# Evaluate with the evaluator built in this cell. The bare name `evaluator`
# used here before is not defined in the visible notebook, so the cell only
# worked from leftover kernel state and raises NameError on a fresh kernel.
accuracy_lr = evaluator_lr.evaluate(predictions_lr)

# NOTE(review): the decision-tree evaluation cell logs to the same "accuracy"
# key in the same MLflow run — consider distinct metric names per model.
mlflow.log_metric("accuracy", accuracy_lr)

print(accuracy_lr)

0.9282899921197794


In [86]:

# Train the Decision Tree model
# Fit a decision-tree classifier on the same "features" vector and
# "price_classification" label used for the logistic regression.
dt = DecisionTreeClassifier(
    labelCol='price_classification',
    featuresCol='features',
)
dt_model = dt.fit(train_data)

# Store the fitted tree in MLflow along with the maxDepth / maxBins values it reports.
mlflow.spark.log_model(dt_model, "decision-tree-model")
mlflow.log_param("maxDepth", dt_model.getMaxDepth())
mlflow.log_param("maxBins", dt_model.getMaxBins())

# Score the validation split; the predictions are evaluated in the next cell.
predictions_dt = dt_model.transform(validation_data)

In [88]:
# Accuracy evaluator for the decision-tree predictions.
evaluator_dt = MulticlassClassificationEvaluator(labelCol="price_classification", metricName="accuracy")

# Evaluate with the evaluator built in this cell. The bare name `evaluator`
# used here before is not defined in the visible notebook, so the cell only
# worked from leftover kernel state and raises NameError on a fresh kernel.
accuracy_dt = evaluator_dt.evaluate(predictions_dt)

# NOTE(review): the logistic-regression evaluation cell logs to the same
# "accuracy" key in the same MLflow run — consider distinct metric names per model.
mlflow.log_metric("accuracy", accuracy_dt)

print(accuracy_dt)

0.9204097714736013


In [90]:
# Keep the decision tree only when it is strictly more accurate on the
# validation split; on a tie the logistic regression is kept.
best_model = dt_model if accuracy_dt > accuracy_lr else lr_model

print(best_model)

LogisticRegressionModel: uid=LogisticRegression_6e6360ea7862, numClasses=2, numFeatures=123
