In [1]:
import subprocess
import findspark
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import from_json
from pyspark.sql.functions import col,year,month,dayofmonth,coalesce,lit,from_json, hour
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
import os
# Set HADOOP_USER_NAME in this process's environment before the Spark session is built.
# NOTE(review): presumably this makes the HDFS writes run as the "hadoop" user — confirm.
os.environ["HADOOP_USER_NAME"] = "hadoop"

In [3]:
# Point findspark at the Spark installation under /opt/spark.
findspark.init(spark_home='/opt/spark')

In [4]:
# topics = subprocess.check_output("/opt/kafka/bin/kafka-topics.sh --list --bootstrap-server hadoop-namenode:9092", shell=True)
# topics = topics.split()
# topics = [ topic.decode("UTF-8") for topic in topics ]
# topics.pop()
# topics

In [5]:
# Spark session & context.
# The two Kafka connector jars are listed once and joined with the separator each
# setting expects (',' for spark.jars, ':' for the classpath-style settings).
_KAFKA_JARS = [
    'file:///opt/smart-parking/Python/Spark-Jars-Utils/spark-sql-kafka-0-10_2.12-3.2.1.jar',
    'file:///opt/smart-parking/Python/Spark-Jars-Utils/kafka-clients-3.1.0.jar',
]
_jars_comma = ','.join(_KAFKA_JARS)
_jars_colon = ':'.join(_KAFKA_JARS)

_builder = SparkSession.builder.master('local[*]').appName('TresCrucesShoppingHDFS')
_builder = _builder.config('spark.jars', _jars_comma)
_builder = _builder.config('spark.executor.extraClassPath', _jars_colon)
# NOTE(review): 'spark.executor.extraLibrary' is not a property listed in the Spark
# configuration reference (the documented name is 'spark.executor.extraLibraryPath').
# Kept exactly as before — confirm whether it is needed at all.
_builder = _builder.config('spark.executor.extraLibrary', _jars_colon)
_builder = _builder.config('spark.driver.extraClassPath', _jars_colon)
spark = _builder.getOrCreate()

# Keep the context handy and silence everything below ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

22/06/14 01:16:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Show the SparkSession object as this cell's output.
spark

In [7]:
# Open a streaming reader subscribed to a single Kafka topic.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "hadoop-namenode:9092")
    .option("subscribe", "TresCrucesShopping")
    .load()
)
# Cell output only: this projection is displayed but not assigned, so `df` itself
# still holds the columns returned by load().
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

DataFrame[key: string, value: string]

In [8]:
def CountOccupation(lista):
    """Count the occupied slots in a list of slot states.

    Args:
        lista: Iterable of slot states, or ``None``. An element counts as
            occupied when it compares equal to ``True``; ``False`` and
            ``None`` elements are ignored.

    Returns:
        int: Number of occupied slots; ``0`` when ``lista`` is ``None`` or empty.
    """
    # A null array column reaches a Python UDF as None; iterating it would raise
    # TypeError and fail the whole micro-batch, so treat it as "nothing occupied".
    if lista is None:
        return 0
    # len() over a comprehension is used instead of the builtin sum(): this notebook
    # star-imports pyspark.sql.functions, which rebinds the name `sum`.
    return len([estado for estado in lista if estado == True])

# Spark UDF around CountOccupation, declared to return an integer column.
countOcuppation = udf(CountOccupation, IntegerType())

In [9]:
def convertList(lista):
    """Count the elements of ``lista`` that compare equal to ``True``.

    NOTE(review): this is the same counting logic as ``CountOccupation`` and
    nothing visible in this notebook calls it — confirm whether it can be removed
    or whether a different conversion was intended.

    Args:
        lista: Iterable of slot states, or ``None``.

    Returns:
        int: Number of elements equal to ``True``; ``0`` when ``lista`` is
        ``None`` or empty.
    """
    # None (a null array) is not iterable; report zero instead of raising.
    if lista is None:
        return 0
    # Avoids the builtin sum(), which the pyspark.sql.functions star import shadows.
    return len([estado for estado in lista if estado == True])

# Rebinds `countOcuppation`, replacing the UDF created in the previous cell.
# No return type is passed here, so pyspark's default (StringType) applies and the
# count is exposed to Spark as a string column.
# NOTE(review): confirm whether the IntegerType variant was the intended one.
countOcuppation = udf(CountOccupation)

In [10]:
def percentagetoString(sp_total, sp_ocupied):
    """Translate an occupancy ratio into a coarse occupancy label.

    The percentage is ``int(sp_ocupied * 100 / sp_total)``, i.e. truncated
    toward zero before it is bucketed.

    Args:
        sp_total: Total number of spots.
        sp_ocupied: Number of occupied spots.

    Returns:
        str: ``"Empty"`` (0-25), ``"Almost Empty"`` (26-50), ``"Almost Full"``
        (51-75), ``"Full"`` (76-100), or ``"Other"`` when the percentage is
        outside 0-100 or cannot be computed (a ``None`` argument or a zero total).
    """
    # Null columns arrive as None and a zero total would divide by zero; both
    # previously raised and aborted the batch, so map them to the catch-all label.
    if sp_total is None or sp_ocupied is None or sp_total == 0:
        return "Other"

    percnt = int((sp_ocupied * 100) / sp_total)
    if percnt < 0 or percnt > 100:
        return "Other"
    if percnt <= 25:
        return "Empty"
    if percnt <= 50:
        return "Almost Empty"
    if percnt <= 75:
        return "Almost Full"
    return "Full"
    
# Spark UDF: (total spots, occupied spots) -> occupancy label string.
percentageOcuppation = udf(percentagetoString, StringType())

# Spark UDF: available spots = total spots - occupied spots.
# Null inputs reach Python as None, where `x - y` would raise TypeError and fail the
# batch; in that case the result is null instead.
areaAvailableSpots = udf(
    lambda x, y: None if x is None or y is None else x - y,
    IntegerType(),
)

In [13]:
def func(batch_df, batch_id):
    """Parse one Kafka micro-batch and append it to HDFS as partitioned Parquet.

    Meant to be used as a ``foreachBatch`` sink. The ``value`` column is cast to
    string and parsed as JSON, the device timestamp is split into date parts,
    per-area occupancy columns are derived, and ``device_slots`` is exploded
    into one row per slot.

    Args:
        batch_df: DataFrame with the micro-batch; must have a ``value`` column.
        batch_id: Micro-batch identifier passed by ``foreachBatch`` (not used).

    Returns:
        None. The resulting rows are printed with ``show()`` and, when there is
        at least one row, appended under
        ``hdfs://hadoop-namenode:9000/data/Parkings/``.
    """
    batch_df.persist()
    try:
        # Expected layout of the JSON payload carried in the Kafka `value`.
        tableSchema = (
            StructType()
            .add("parking_name", StringType())
            .add("parking_id", StringType())
            .add("parking_address", StringType())
            .add("parking_description", StringType())
            .add("device_timestamp", TimestampType())
            .add("device_id", StringType())
            .add("parking_latitude", DoubleType())
            .add("parking_longitude", DoubleType())
            .add("parking_temperature", StringType())
            .add("parking_humidity", StringType())
            .add("parking_uuid", StringType())
            .add("device_level_id", StringType())
            .add("device_area_id", StringType())
            .add("device_area_name", StringType())
            .add("device_spots", StringType())
            .add("device_slots", ArrayType(BooleanType()))
            .add("parking_closed", BooleanType())
        )

        # Keep only the parsed payload fields, flattened to top-level columns.
        parsed = (
            batch_df
            .withColumn("value", col("value").cast("string"))
            .select(from_json("value", tableSchema).alias("data_parsed"))
            .select("data_parsed.*")
        )

        # Split the timestamp into the parts used for partitioning, then drop it.
        df_partitioned = (
            parsed
            .withColumn("year", year(col("device_timestamp")))
            .withColumn("month", month(col("device_timestamp")))
            .withColumn("day", dayofmonth(col("device_timestamp")))
            .withColumn("hour", hour(col("device_timestamp")))
            .withColumn("minutes", minute(col("device_timestamp")))
            .drop(col("device_timestamp"))
        )

        # Statement order matters here: the occupancy columns read `device_spots`
        # before it is renamed, and `level_name` reads `level_id` while it is
        # still a string.
        df_partitioned = (
            df_partitioned
            .withColumnRenamed("device_area_name", "area_name")
            .withColumnRenamed("device_area_id", "area_id")
            .withColumnRenamed("device_level_id", "level_id")
            .withColumn("area_description", lit("short description"))
            .withColumn("area_occupied_spots", countOcuppation(col("device_slots")))
            .withColumn(
                "area_occupation",
                percentageOcuppation(
                    col("device_spots").cast("int"),
                    col("area_occupied_spots").cast("int"),
                ),
            )
            .withColumn(
                "area_available_spots",
                areaAvailableSpots(
                    col("device_spots").cast("int"),
                    col("area_occupied_spots").cast("int"),
                ),
            )
            .withColumnRenamed("device_spots", "area_total_spots")
            # `+` between columns is numeric addition in Spark and produced null
            # for these string operands; concat() builds the intended label.
            .withColumn("level_name", concat(lit("Piso "), col("level_id")))
            .withColumn("level_id", col("level_id").cast("int"))
            .withColumn("area_id", col("area_id").cast("int"))
            .withColumn("area_total_spots", col("area_total_spots").cast("int"))
        )

        # One output row per slot; posexplode positions start at 0, slot ids at 1.
        df_partitioned = (
            df_partitioned
            .select("*", posexplode("device_slots").alias("slot_id", "slot_state"))
            .withColumn("slot_id", col("slot_id") + 1)
        )

        df_partitioned.show()
        if not df_partitioned.rdd.isEmpty():
            (
                df_partitioned.write
                .mode("append")
                .format("parquet")
                .partitionBy("parking_name", "year", "month", "day", "hour")
                .save("hdfs://hadoop-namenode:9000/data/Parkings/")
            )

        # At the streaming level, foreachBatch only hands over fragments of the
        # data at a time. This function exists for the Hadoop integration, not
        # for the Backend; for aggregation, see the part of the notebook
        # dedicated to the aggregation code.
    finally:
        # Release the cached batch even when parsing or the HDFS write fails.
        batch_df.unpersist()

In [None]:
# Start the streaming query (one micro-batch every 120 seconds, each handed to
# `func`) and block this cell until the query terminates.
CHECKPOINT_DIRECTORY = 'file:///opt/smart-parking/Python/GetData/CommitLogHadoop'
(
    df.writeStream
    .trigger(processingTime='120 seconds')
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_DIRECTORY)
    .foreachBatch(func)
    .start()
    .awaitTermination()
)

+------------------+----------+--------------------+--------------------+----------------+----------------+-----------------+-------------------+----------------+--------------------+--------+-------+---------+----------------+--------------------+--------------+----+-----+---+----+-------+-----------------+-------------------+---------------+--------------------+----------+-------+----------+
|      parking_name|parking_id|     parking_address| parking_description|       device_id|parking_latitude|parking_longitude|parking_temperature|parking_humidity|        parking_uuid|level_id|area_id|area_name|area_total_spots|        device_slots|parking_closed|year|month|day|hour|minutes| area_description|area_occupied_spots|area_occupation|area_available_spots|level_name|slot_id|slot_state|
+------------------+----------+--------------------+--------------------+----------------+----------------+-----------------+-------------------+----------------+--------------------+--------+-------+------

                                                                                

## Código para agregar en el otro notebook:

In [None]:
#        
# .withColumn("level_occupation", percentageOcuppation(col("level_total_spots"), col("level_occupied_slots")))\
# .withColumn('area_name', merge_udf(dfalldata['area_name']).over(windowPartitionAgg))\
# .withColumn('device_address', merge_udf(dfalldata['device_address']).over(windowPartitionAgg))\
# .withColumn('area_occupation', merge_udf(dfalldata['area_occupation']).over(windowPartitionAgg))\
# .withColumn('spots', merge_udf(dfalldata['spots'].cast("string")).over(windowPartitionAgg))\
# .withColumn('slots', merge_udf(dfalldata['slots'].cast("string")).over(windowPartitionAgg))\
# .withColumn('area_occupied_slots', merge_udf(dfalldata['area_occupied_slots'].cast("string")).over(windowPartitionAgg))\
# .dropDuplicates((['level_id']))

# windowPartitionAgg  = Window.partitionBy("parking_name")
# df3 = df2.withColumn("level_total_spots", merge_udf_total(df2['level_total_spots'].cast("string")).over(windowPartitionAgg))\
# .withColumn("level_occupied_slots", merge_udf_total(df2['level_occupied_slots'].cast("string")).over(windowPartitionAgg))\
# .withColumn("level_occupation", merge_udf_total(df2['level_occupation']).over(windowPartitionAgg))\
# .withColumn('area_name', merge_udf_total(df2['area_name']).over(windowPartitionAgg))\
# .withColumn('device_address', merge_udf_total(df2['device_address']).over(windowPartitionAgg))\
# .withColumn('area_occupation', merge_udf_total(df2['area_occupation']).over(windowPartitionAgg))\
# .withColumn('spots', merge_udf_total(df2['spots'].cast("string")).over(windowPartitionAgg))\
# .withColumn('slots', merge_udf_total(df2['slots'].cast("string")).over(windowPartitionAgg))\
# .withColumn('area_occupied_slots', merge_udf_total(df2['area_occupied_slots'].cast("string")).over(windowPartitionAgg))\
# .dropDuplicates((['parking_name']))
# 

In [None]:
# Same source and sink as the "append" query earlier in the notebook, but with
# "update" output mode and no explicit trigger.
# NOTE(review): this reuses CHECKPOINT_DIRECTORY from that earlier cell — confirm
# that sharing one checkpoint location between both variants is intended.
(
    df.writeStream
    .outputMode("update")
    .option("checkpointLocation", CHECKPOINT_DIRECTORY)
    .foreachBatch(func)
    .start()
    .awaitTermination()
)