In [1]:
import sys
import time
import datetime

In [2]:
# Kafka topic the streaming reader subscribes to.
TOPIC_Step2_NAME = "Sahamyab-Tweets2"
# Kafka bootstrap server address (host:port) used by the streaming reader.
KAFKA_SERVER = "kafka-broker:29092"

In [3]:
import os

# Kafka integration guide:
# https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
#
# Ask spark-submit to resolve the Kafka source connector via --packages.
# This has to be in the environment before the SparkSession below is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2 "
    "pyspark-shell"
)

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Create (or reuse) the session on the standalone master. The application is
# capped at one core in total, with a single-core executor, and the SQL
# session time zone is set to Asia/Tehran.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step2_5-Count-Hashtags-Window")
    .config("spark.executor.memory", "1024mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.sql.session.timeZone", "Asia/Tehran")
    .getOrCreate()
)
    
    



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-462884e4-ebd1-456c-b3fb-0b256b2fe056;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 983ms :: artifacts dl 22ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.8-1 from central in [default]
	org.apache.commons#commons-pool2;2.6.2 from central in [default]


In [4]:
# Set the SparkContext log level to ERROR to keep the notebook output quiet.
spark.sparkContext.setLogLevel("ERROR")


In [5]:
# StructType reference: https://sparkbyexamples.com/spark/spark-sql-structtype-on-dataframe/
#
# Schema applied to the JSON payload of each Kafka message. Every field is a
# nullable string except `hashtags`, which is a nullable array of strings.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "id",
            "content",
            "sendTime",
            "sendTimePersian",
            "senderName",
            "senderUsername",
            "type",
        )
    ]
    + [StructField("hashtags", ArrayType(StringType()), True)]
)

In [6]:
# Streaming DataFrame over the Kafka topic, starting from the earliest
# available offsets and using an explicit consumer group id.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", TOPIC_Step2_NAME)
    .option("startingOffsets", "earliest")
    .option("kafka.group.id", "step2_5-count-hashtags-window")
    .load()
)

In [7]:
# Kafka delivers key/value as binary; cast both to strings first.
tweetsStringDF = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Parse the JSON value with `schema`, flatten the struct into top-level
# columns, then derive timestamp columns and the Persian date parts.
tweetsDF = (
    tweetsStringDF
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
    # NOTE(review): 'Z' is quoted as a literal in the pattern, so it is not
    # interpreted as a zone designator -- confirm that this parse followed by
    # from_utc_timestamp yields the intended local time.
    .withColumn(
        "timestamp",
        unix_timestamp("sendTime", "yyyy-MM-dd'T'HH:mm:ss'Z'").cast("timestamp"),
    )
    .withColumn(
        "persian_timestamp",
        from_utc_timestamp("timestamp", "Asia/Tehran").cast("timestamp"),
    )
    # NOTE(review): the month/day offsets (6, 9) look 1-based, while the year
    # uses start position 0 -- presumably treated like position 1; confirm.
    .withColumn("persianYear", col("sendTimePersian").substr(0, 4))
    .withColumn("persianMonth", col("sendTimePersian").substr(6, 2))
    .withColumn("persianDay", col("sendTimePersian").substr(9, 2))
)


In [None]:
# Hashtag counts per sliding event-time window.
#
# - A 10-minute watermark is declared on `persian_timestamp`.
# - Each tweet is exploded into one row per hashtag.
# - Rows are grouped by a 1-hour window sliding every 30 minutes and by hashtag.
# - Only hashtags seen more than twice in a window are kept, ordered by
#   newest window first and then by descending count.
#
# NOTE(review): the sink below uses "complete" output mode (needed for the
# global orderBy); verify whether the watermark has any effect on state
# retention in that mode.
windowedHashtagCounts = (
    tweetsDF
    .withWatermark("persian_timestamp", "10 minutes")
    .select("persian_timestamp", explode("hashtags").alias("hashtag"))
    .groupBy(
        # Reference the column by name so it resolves against the projected
        # frame rather than through the original `tweetsDF` handle.
        window(col("persian_timestamp"), "1 hours", "30 minutes"),
        "hashtag",
    )
    .count()
    .filter(col("count") > 2)
    .orderBy([col("window").desc(), col("count").desc()])
)

# Start the console sink and keep the StreamingQuery handle in `query`.
# Chaining .awaitTermination() onto .start() would bind `query` to None,
# leaving no handle to stop or inspect the running query.
query = (
    windowedHashtagCounts.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .option("numRows", "20")
    # The checkpoint directory is named after this application (Step2_5) so
    # it is not shared with a different query's checkpoint state.
    .option("checkpointLocation", "/opt/spark-apps/checkpoints/Step2_5-Count-Hashtags-Window")
    .start()
)

# Block this cell until the query is stopped or fails.
query.awaitTermination()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |67   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |67   |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |232  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |84   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |84   |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |249  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_



-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |95   |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |95   |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |260  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |104  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |104  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |269  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |114  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |114  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |279  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |124  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |124  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |289  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |hashtag  |count|
+------------------------------------------+---------+-----+
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غفارس    |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|نظرسنجی  |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شگویا    |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|پالایش   |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|وبملت    |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|میدکو    |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|شاخص_بورس|133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|فافق     |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|خودرو    |133  |
|{2021-10-08 22:00:00, 2021-10-08 23:00:00}|غصینو    |133  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|غفارس    |298  |
|{2021-10-08 21:30:00, 2021-10-08 22:30:00}|شاخص_



### Submit Sample Spark App in Pyspark Container Bash 

- Open a bash shell in the pyspark container:
```bash
docker exec -it pyspark bash
```
- cd /opt/spark-app/
- run this command :
```bash
unset PYSPARK_DRIVER_PYTHON
spark-submit --master spark://spark-master:7077 --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2 count_hashtags-window.py
export PYSPARK_DRIVER_PYTHON=python
```