In [1]:
import findspark
findspark.init()

import sys
import seaborn as sns 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings

from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.ml.evaluation import RegressionEvaluator


#from pyspark.ml.recommendation import ALS, ALSModel
#from pyspark.mllib.recommendation import MatrixFactorizationModel
from pyspark.sql import Row, SQLContext,SparkSession
import os

# Parking processed by this notebook; the value is reused further down in the
# Spark application name and in the HDFS output paths.
parking = "TresCrucesShopping"

# Root of the local Hadoop installation.
hadoop_home = "/opt/hadoop/hadoop-3.3.2"
# HADOOP_CONF_DIR and YARN_CONF_DIR are both pointed at the same etc/hadoop
# directory inside that installation.
hadoop_conf_dir = hadoop_home + "/etc/hadoop"

# Export the settings through the process environment.
# NOTE(review): HADOOP_USER_NAME is presumably the identity used against
# HDFS/YARN — confirm with the cluster setup.
os.environ.update(
    {
        'HADOOP_USER_NAME': "hadoop",
        'HADOOP_CONF_DIR': hadoop_conf_dir,
        'YARN_CONF_DIR': hadoop_conf_dir,
    }
)

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Spark master to connect to. "local[*]" runs Spark inside this process;
# switch to the commented value to run on YARN instead.
master = "local[*]"
#master = "yarn"

# Create the session (or reuse one that already exists in this process),
# naming the application after the parking being prepared.
session_builder = SparkSession.builder.master(master).appName('MLIB-DataPreparation-'+parking)
spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext and only log ERROR and above.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

22/08/03 20:57:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/03 20:57:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/03 20:57:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Bare expression: lets the notebook render the active SparkSession object
# as this cell's output.
spark

## Carga de datos

Se cargan los datos del día anterior. Nota: la celda siguiente lee desde HDFS la partición mensual completa `year=2022/month=7`.

In [5]:
# Load the raw parking readings from HDFS (Parquet).
# The directory is built from `parking`, the same variable used for the Spark
# app name and the output paths, so the input can never point at a different
# parking than the one the results are written under.
# Only the year=2022/month=7 directory is read.
df_parking = spark.read.parquet(
    "hdfs://hadoop-namenode:9000/data/Parkings/" + parking + "/year=2022/month=7/"
)

                                                                                

In [6]:
# Rows whose holiday fields are overridden regardless of the source values:
# readings taken on the 18th of July. The month is tested explicitly so the
# override cannot fire on the 18th of some other month if a different
# partition is ever loaded.
# NOTE(review): presumably a public holiday at the parking's location, with
# 'NL' being a holiday-type code already used in the data — confirm both.
is_forced_holiday = (col("month") == 7) & (col("day") == 18)

df_parking_cleaned = (
    df_parking
    # Calendar parts derived from each reading's own timestamp.
    .withColumn("year", year(col("device_timestamp")))
    .withColumn("month", month(col("device_timestamp")))
    .withColumn("day", dayofmonth(col("device_timestamp")))
    # Weather readings as floating point numbers.
    .withColumn("parking_temperature", col("parking_temperature").cast("double"))
    .withColumn("parking_humidity", col("parking_humidity").cast("double"))
    # Holiday override; every other row keeps its original values.
    .withColumn(
        "parking_holiday_status",
        when(is_forced_holiday, True).otherwise(col("parking_holiday_status")),
    )
    .withColumn(
        "parking_holiday_type",
        when(is_forced_holiday, 'NL').otherwise(col("parking_holiday_type")),
    )
    # Flags as integers. parking_holiday_status is cast after the override
    # above, so the overridden True values are converted as well.
    .withColumn("slot_state", col("slot_state").cast("integer"))
    .withColumn("parking_closed", col("parking_closed").cast("integer"))
    .withColumn("parking_holiday_status", col("parking_holiday_status").cast("integer"))
    # Remove columns that are not carried into the prepared dataset.
    # parking_latitude and parking_longitude are deliberately kept.
    .drop(
        "device_slots",
        "parking_address",
        "parking_description",
        "parking_weather_status_detailed",
        "parking_uuid",
        "parking_holiday_description",
        "level_name",
        "area_occupation",
        "device_timestamp",
    )
)

In [7]:
# Print the column names and types of the cleaned DataFrame as a sanity check
# on the casts and dropped columns applied above.
df_parking_cleaned.printSchema()

root
 |-- parking_name: string (nullable = true)
 |-- parking_id: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- parking_latitude: double (nullable = true)
 |-- parking_longitude: double (nullable = true)
 |-- parking_temperature: double (nullable = true)
 |-- parking_humidity: double (nullable = true)
 |-- level_id: integer (nullable = true)
 |-- area_id: integer (nullable = true)
 |-- area_name: string (nullable = true)
 |-- area_total_spots: integer (nullable = true)
 |-- parking_weather_status: string (nullable = true)
 |-- parking_wind_speed: double (nullable = true)
 |-- parking_holiday_status: integer (nullable = true)
 |-- parking_holiday_type: string (nullable = true)
 |-- parking_closed: integer (nullable = true)
 |-- minutes: integer (nullable = true)
 |-- area_occupied_spots: integer (nullable = true)
 |-- area_occupation_percentage: integer (nullable = true)
 |-- area_available_spots: integer (nullable = true)
 |-- slot_id: integer (nullable = true)

In [8]:
# Register the cleaned DataFrame as a temporary view so it can be queried
# with spark.sql under the name "tres_cruces_shopping".
df_parking_cleaned.createOrReplaceTempView("tres_cruces_shopping")

In [9]:
# Build the final dataset from the "tres_cruces_shopping" temp view.
#
# * SELECT DISTINCT applies to the whole select list, so this removes rows that
#   are identical in every column. Rows that share the
#   (year, month, day, hour, minutes, device_id, level_id, area_id, slot_id)
#   key but differ in any other column are all kept.
#   NOTE(review): if exactly one row per key is wanted, use
#   dropDuplicates([...]) on those key columns instead.
# * `datetime` is assembled from the row's own year/month/day/hour/minutes
#   columns rather than a fixed "2022-7-" prefix, so it always agrees with the
#   year/month/day values stored next to it.
# * parking_name is dropped from the result.
tres_cruces_shopping_cleaned = spark.sql("""
SELECT DISTINCT
    CAST(
        CONCAT(tcs.year, "-", tcs.month, "-", tcs.day, " ", tcs.hour, ":", tcs.minutes, ":00")
        AS timestamp
    ) AS datetime,
    tcs.*
FROM
    tres_cruces_shopping tcs
""").drop("parking_name")

In [11]:
# Persist the prepared dataset as Parquet, one directory level per
# year / month / day.
# NOTE(review): with Spark's default (static) partition overwrite mode,
# mode("overwrite") replaces everything already stored under this path, not
# only the partitions produced by this run — confirm that is intended.
ml_dataset_path = "hdfs://hadoop-namenode:9000/machineLearning/Parkings/"+parking+"/"

ml_writer = (
    tres_cruces_shopping_cleaned.write
    .format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
)
ml_writer.save(ml_dataset_path)

                                                                                

The next cell exists only to support integration with Google Colab.

In [12]:
# Second copy of the same dataset under the debugging location, partitioned
# only by year / month. coalesce(1) collapses the data into a single Spark
# partition before writing.
# NOTE(review): presumably done so the copy consists of few files and is easy
# to move elsewhere — confirm.
debug_dataset_path = "hdfs://hadoop-namenode:9000/machineLearningForDebugging/Parkings/"+parking+"/"

(
    tres_cruces_shopping_cleaned
    .coalesce(1)
    .write
    .format("parquet")
    .partitionBy("year", "month")
    .mode("overwrite")
    .save(debug_dataset_path)
)

                                                                                