In [51]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

import findspark
findspark.init()

import sys
import seaborn as sns 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings

from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression


#from pyspark.ml.recommendation import ALS, ALSModel
#from pyspark.mllib.recommendation import MatrixFactorizationModel
from pyspark.sql import Row, SQLContext,SparkSession
import os

parking = "TresCrucesShopping"
hadoop_home = "/opt/hadoop/hadoop-3.3.2"

os.environ['HADOOP_USER_NAME'] = "hadoop"
os.environ['HADOOP_CONF_DIR'] = hadoop_home + "/etc/hadoop"
os.environ['YARN_CONF_DIR'] = hadoop_home + "/etc/hadoop"

In [2]:
warnings.filterwarnings("ignore")

In [3]:
master = "local[*]"
#master = "yarn"

# Spark session & context
spark = (SparkSession
         .builder
         .master(master)
         .appName('MLIB-RL-Simple')
         .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

22/08/04 22:21:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/04 22:21:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/04 22:21:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
spark

In [10]:
df_parking_machine_learning = spark \
    .read \
    .parquet("hdfs://hadoop-namenode:9000/machineLearning/Parkings/"+parking+"/year=2022/month=7/")

In [13]:
df_parking_machine_learning = df_parking_machine_learning.drop("parking_id", "hour", "minutes", "day").orderBy("datetime")

In [14]:
df_parking_machine_learning.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- device_id: string (nullable = true)
 |-- parking_temperature: double (nullable = true)
 |-- parking_humidity: double (nullable = true)
 |-- level_id: integer (nullable = true)
 |-- area_id: integer (nullable = true)
 |-- area_name: string (nullable = true)
 |-- area_total_spots: integer (nullable = true)
 |-- parking_weather_status: string (nullable = true)
 |-- parking_wind_speed: double (nullable = true)
 |-- parking_holiday_status: integer (nullable = true)
 |-- parking_holiday_type: string (nullable = true)
 |-- parking_closed: integer (nullable = true)
 |-- area_occupied_spots: integer (nullable = true)
 |-- area_occupation_percentage: integer (nullable = true)
 |-- area_available_spots: integer (nullable = true)
 |-- slot_id: integer (nullable = true)
 |-- slot_state: integer (nullable = true)



In [35]:
vectorAssembler = VectorAssembler(
                        inputCols = ["parking_temperature", "parking_humidity", 
                                     "area_total_spots", "parking_wind_speed",
                                     "parking_closed", "parking_holiday_status",
                                     "area_available_spots", "area_occupied_spots",
                                     "slot_state"],
                        outputCol = 'features')

In [36]:
va_df_parking = vectorAssembler.transform(df_parking_machine_learning)

In [37]:
va_df_parking = va_df_parking.select(['datetime', "level_id", "area_id", "slot_id", 'features', 'area_occupation_percentage'])
va_df_parking.show(3)



+-------------------+--------+-------+-------+--------------------+--------------------------+
|           datetime|level_id|area_id|slot_id|            features|area_occupation_percentage|
+-------------------+--------+-------+-------+--------------------+--------------------------+
|2022-07-01 18:49:00|       1|      1|      5|[10.01,87.0,10.0,...|                        80|
|2022-07-01 18:49:00|       1|      2|      1|[10.01,87.0,10.0,...|                        70|
|2022-07-01 18:49:00|       1|      2|      3|[10.01,87.0,10.0,...|                        70|
+-------------------+--------+-------+-------+--------------------+--------------------------+
only showing top 3 rows



                                                                                

In [38]:
train_df, test_df = va_df_parking.randomSplit([0.7, 0.3])

In [39]:
lr = LinearRegression(featuresCol = 'features', labelCol='area_occupation_percentage', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

[Stage 73:>                                                         (0 + 4) / 4]

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,-4.916784697362235,4.916784697371434,0.0]
Intercept: 49.994950244869614




In [40]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.269448
r2: 0.999723


In [41]:
train_df.describe().show()

                                                                                

+-------+------------------+------------------+------------------+--------------------------+
|summary|          level_id|           area_id|           slot_id|area_occupation_percentage|
+-------+------------------+------------------+------------------+--------------------------+
|  count|           1653246|           1653246|           1653246|                   1653246|
|   mean| 1.889034662718071|2.1105479765261794| 5.500019355861136|        49.700522487276544|
| stddev|0.7370708422474119|0.9934471083179036|2.8718371828594016|        16.195227684021123|
|    min|                 1|                 1|                 1|                         0|
|    max|                 3|                 4|                10|                       100|
+-------+------------------+------------------+------------------+--------------------------+



In [46]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select('datetime', "level_id", "area_id", "slot_id","prediction","area_occupation_percentage","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="area_occupation_percentage",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

                                                                                

+-------------------+--------+-------+-------+-----------------+--------------------------+--------------------+
|           datetime|level_id|area_id|slot_id|       prediction|area_occupation_percentage|            features|
+-------------------+--------+-------+-------+-----------------+--------------------------+--------------------+
|2022-07-01 18:49:00|       1|      1|      2|79.49565842911662|                        80|[10.01,87.0,10.0,...|
|2022-07-01 18:49:00|       1|      1|      5|79.49565842911662|                        80|[10.01,87.0,10.0,...|
|2022-07-01 18:49:00|       1|      1|      6|79.49565842911662|                        80|[10.01,87.0,10.0,...|
|2022-07-01 18:49:00|       1|      1|     10|79.49565842911662|                        80|[10.01,87.0,10.0,...|
|2022-07-01 18:49:00|       1|      2|      3|69.66208903438294|                        70|[10.01,87.0,10.0,...|
+-------------------+--------+-------+-------+-----------------+--------------------------+-----

[Stage 108:>                                                        (0 + 4) / 4]

R Squared (R2) on test data = 0.999723




In [47]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 10
objectiveHistory: [0.5000000000000009, 0.3629994541167867, 0.022008374073255794, 0.017991776472006324, 0.015628629925099596, 0.01560958276540908, 0.015608989092570302, 0.015608971466004382, 0.015608970942660573, 0.015608970927121373, 0.015608970926660276]


[Stage 112:>                                                        (0 + 1) / 1]

+------------------+
|         residuals|
+------------------+
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.3379109656170556|
|0.3379109656170556|
|0.3379109656170556|
|0.3379109656170556|
|0.3379109656170556|
|0.3379109656170556|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
|0.5043415708833834|
+------------------+
only showing top 20 rows



                                                                                

In [52]:
predictions = lr_model.transform(test_df)
predictions = predictions.select("datetime", "level_id", "area_id",
                   "slot_id","prediction","area_occupation_percentage",
                   "features") \
            .withColumnRenamed("prediction", "area_ocupation_percentage_target")


[Stage 128:>                                                        (0 + 1) / 1]

+-------------------+--------+-------+-------+--------------------------------+--------------------------+
|           datetime|level_id|area_id|slot_id|area_ocupation_percentage_target|area_occupation_percentage|
+-------------------+--------+-------+-------+--------------------------------+--------------------------+
|2022-07-01 18:49:00|       1|      1|      2|               79.49565842911662|                        80|
|2022-07-01 18:49:00|       1|      1|      5|               79.49565842911662|                        80|
|2022-07-01 18:49:00|       1|      1|      6|               79.49565842911662|                        80|
|2022-07-01 18:49:00|       1|      1|     10|               79.49565842911662|                        80|
|2022-07-01 18:49:00|       1|      2|      3|               69.66208903438294|                        70|
|2022-07-01 18:49:00|       1|      2|      6|               69.66208903438294|                        70|
|2022-07-01 18:49:00|       1|      2

                                                                                

In [None]:
predictions.drop("features").show()

In [None]:
predictions