In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

PYSPARK_SUBMIT_ARGS = """--num-executors 3 pyspark-shell"""

os.environ["PYSPARK_SUBMIT_ARGS"] = PYSPARK_SUBMIT_ARGS

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ivashnikov").getOrCreate()
sc = spark.sparkContext

In [2]:
from pyspark.sql.functions import countDistinct, col, explode, from_json, to_json, lit, struct, when, from_unixtime, to_date, date_format, window, sum, mean, count, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

In [3]:
# params
topic_in = 'lab03_input_data'
topic_out = 'dmitry_ivashnikov_lab03b_out'

kafka_params_in = {
    "kafka.bootstrap.servers": "spark-master-1:6667",
    "startingOffsets": "earliest",
    "subscribe": topic_in,
}
kafka_params_out = {
    "kafka.bootstrap.servers": "10.0.0.5:6667", 
    "checkpointLocation": "checkpoint",
}

# open_data
input_data = spark \
    .readStream \
    .format("kafka") \
    .options(**kafka_params_in) \
    .load()

# processing
clmns_in = [
    'category', 
    'event_type', 
    'item_id', 
    'item_price', 
    'timestamp', 
    'uid',
]
clmns_out = [
    'start_ts', 
    'end_ts', 
    'revenue', 
    'visitors', 
    'purchases', 
    'aov',
]

jsonSchema = StructType([StructField(clmn, StringType()) for clmn in clmns_in])
value_in = from_json(col('value').cast("string"), jsonSchema).alias('value')
clmns = list(map(lambda x: 'value.%s' % x, clmns_in))
timestamp = from_unixtime(col('timestamp') / 1000)

revenue = sum(when(col('event_type') == 'buy', col('item_price')).otherwise(0)).cast(LongType()).alias('revenue')
visitors = count(col('uid')).alias('visitors')
purchases = sum(when(col('event_type') == 'buy', 1).otherwise(0)).cast(LongType()).alias('purchases')
aov = (revenue / purchases).alias('aov')

start_ts = unix_timestamp(col('window.start'))
end_ts = unix_timestamp(col('window.end'))

value_out = to_json(struct(clmns_out)).alias('value')
topic = lit(topic_out).alias('topic')

agg_data = input_data \
    .select(value_in) \
    .select(*clmns) \
    .groupby(window(timestamp, '1 hour')) \
    .agg(revenue, visitors, purchases, aov) \
    .withColumn('start_ts', start_ts) \
    .withColumn('end_ts', end_ts) \
    .select(value_out, topic)
    
# write
# sq = agg_data \
#     .writeStream \
#     .format("kafka") \
#     .outputMode("Update") \
#     .trigger(processingTime="5 seconds") \
#     .options(**kafka_params_out).start()
# sq.awaitTermination()

In [25]:
kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1:6667",
    "startingOffsets": "earliest",
    "subscribe": 'dmitry_ivashnikov',
}




spark \
    .read \
    .format("kafka") \
    .options(**kafka_params) \
    .load() \
    .select(col('value').cast('string')) \
    .show(20, 70)    

+-----+
|value|
+-----+
+-----+



In [21]:
df.show()

+--------------------+
|               value|
+--------------------+
|{"event_type": "v...|
|{"event_type": "b...|
|{"event_type": "v...|
|{"event_type": "b...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "b...|
|{"event_type": "v...|
|{"event_type": "b...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "v...|
|{"event_type": "b...|
|{"event_type": "b...|
|{"event_type": "v...|
+--------------------+
only showing top 20 rows



In [407]:
sss = sum(when(col('event_type') == 'buy', col('item_price')).otherwise(0))

In [424]:
agg_data = tmp \
    .select(value_in) \
    .select(*clmns) \
#     .groupby(window(timestamp, '1 hour')) \
#     .agg(revenue, visitors, purchases, aov) \
#     .withColumn('start_ts', start_ts) \
#     .withColumn('end_ts', end_ts) \
#     .select(value_out, topic)

agg_data.show()

+--------------------+----------+--------------------+----------+-------------+--------------------+
|            category|event_type|             item_id|item_price|    timestamp|                 uid|
+--------------------+----------+--------------------+----------+-------------+--------------------+
|Entertainment-equ...|       buy|Entertainment-equ...|      2529|1577865600000|40b29579-e845-45c...|
|    Everyday-jewelry|       buy|  Everyday-jewelry-0|      4320|1577865660000|                null|
|             Cameras|      view|           Cameras-1|      1856|1577865720000|ab0e7dd1-5899-488...|
|             Luggage|      view|           Luggage-7|      4975|1577865720000|                null|
|       Mobile-phones|      view|    Mobile-phones-10|      3981|1577865780000|d1206327-0e9f-410...|
|    Everyday-jewelry|       buy|  Everyday-jewelry-0|      4786|1577865780000|                null|
|             Cameras|      view|          Cameras-15|       312|1577865780000|6d5212d9-73e

In [426]:
agg_data.groupby().agg(count(col('event_type') == 'view')).show()

+--------------------------+
|count((event_type = view))|
+--------------------------+
|                      1000|
+--------------------------+



In [401]:
agg_data.filter('uid != "null"').count()

462

In [402]:
agg_data.count()

1000

In [403]:
agg_data.groupby().agg(count(col('uid'))).show()

+----------+
|count(uid)|
+----------+
|       462|
+----------+



In [362]:
tmp \
    .select(col('value').cast('string')) \
    .show(20, 70) 

AttributeError: 'int' object has no attribute 'select'

In [292]:
tmp.show()

+---+-----+-----+---------+------+---------+-------------+
|key|value|topic|partition|offset|timestamp|timestampType|
+---+-----+-----+---------+------+---------+-------------+
+---+-----+-----+---------+------+---------+-------------+



In [428]:
kafka_params = {
     "kafka.bootstrap.servers": "10.0.0.5:6667",
    "startingOffsets": "earliest",
    "subscribe": 'dmitry_ivashnikov_lab03b_out',
}

spark \
    .read \
    .format("kafka") \
    .options(**kafka_params) \
    .load().count()

0

In [423]:
kafka_params = {
     "kafka.bootstrap.servers": "10.0.0.5:6667",
    "startingOffsets": "earliest",
    "subscribe": 'dmitry_ivashnikov_lab03b_out',
}

spark \
    .read \
    .format("kafka") \
    .options(**kafka_params) \
    .load() \
    .select(col('value').cast('string')) \
    .show(50, 120)   

+-------------------------------------------------------------------------------------------------------------------+
|                                                                                                              value|
+-------------------------------------------------------------------------------------------------------------------+
|  {"start_ts":1577901600,"end_ts":1577905200,"revenue":51954,"visitors":21,"purchases":49,"aov":1060.2857142857142}|
|  {"start_ts":1577883600,"end_ts":1577887200,"revenue":94244,"visitors":45,"purchases":103,"aov":914.9902912621359}|
|  {"start_ts":1577869200,"end_ts":1577872800,"revenue":96379,"visitors":32,"purchases":87,"aov":1107.8045977011495}|
| {"start_ts":1577880000,"end_ts":1577883600,"revenue":139741,"visitors":42,"purchases":98,"aov":1425.9285714285713}|
|   {"start_ts":1577872800,"end_ts":1577876400,"revenue":86863,"visitors":52,"purchases":99,"aov":877.4040404040404}|
|  {"start_ts":1577876400,"end_ts":1577880000,"revenue":

In [None]:
{"start_ts":1577865600,"end_ts":1577869200,"revenue":80271,"visitors":46,"purchases":28,"aov":2866.8214285714284}
{"start_ts":1577869200,"end_ts":1577872800,"revenue":96379,"visitors":32,"purchases":40,"aov":2409.475}

In [246]:
def kill_all():
    streams = spark.builder.getOrCreate().streams.active
    if streams:
        for s in streams:
            desc = s.lastProgress["sources"][0]["description"]
            s.stop()
            print("Stopped {s}".format(s=desc))

In [415]:
kill_all()

Stopped KafkaV2[Subscribe[dmitry_ivashnikov]]


In [429]:
spark.stop()