In [5]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

PYSPARK_SUBMIT_ARGS = """--num-executors 3 pyspark-shell"""

os.environ["PYSPARK_SUBMIT_ARGS"] = PYSPARK_SUBMIT_ARGS

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ivashnikov").getOrCreate()
sc = spark.sparkContext

spark.conf.set("spark.sql.session.timeZone", "GMT")

In [None]:
spark.sparkContext.getConf().getAll()

In [2]:
from pyspark.sql.functions import countDistinct, col, explode, from_json, from_unixtime, to_date, date_format, max as spark_max, from_utc_timestamp
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

In [19]:
topic_name = 'dmitry_ivashnikov'
offset = 'earliest'

kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1:6667",
    "startingOffsets": """ { "%s": { "0": %s } } """ % (topic_name, offset) if offset.isdigit() else offset,
    "endingOffsets": "latest",
    "subscribe": topic_name,
}


json_clmns = [
    'category', 
    'event_type', 
    'item_id', 
    'item_price', 
    'timestamp', 
    'uid',
]

jsonSchema = StructType(
    [StructField(clmn, StringType()) for clmn in json_clmns]
)

value = from_json(col('value').cast("string"), jsonSchema)
clmns = list(map(lambda x: 'value.%s' % x, json_clmns))
date = date_format(
    to_date(from_unixtime(col('timestamp') / 1000)), 
    'yyyyMMdd'
)


input_data = spark \
    .read \
    .format("kafka") \
    .options(**kafka_params) \
    .load() \
    .select(value.alias('value')) \
    .select(*clmns) \
    .withColumn('date', date)

In [20]:
input_data.show()

+--------------------+----------+--------------------+----------+-------------+--------------------+--------+
|            category|event_type|             item_id|item_price|    timestamp|                 uid|    date|
+--------------------+----------+--------------------+----------+-------------+--------------------+--------+
|           Computers|      view|        Computers-14|      3551|1588233600000|83952311b9d949463...|20200430|
|  Kitchen-appliances|       buy|Kitchen-appliances-2|      4174|1588233600000|5f7be40206c4f5b17...|20200430|
|           Computers|      view|         Computers-7|      4052|1588233600000|f488c78dd79e9be52...|20200430|
|  Kitchen-appliances|       buy|Kitchen-appliance...|      3630|1588233780000|abf2ffd0532a4ed8b...|20200430|
|             Luggage|      view|           Luggage-9|      4490|1588233780000|84ecbcc7fb9c56273...|20200430|
|           Computers|      view|         Computers-0|      3050|1588233780000|29168ccb6ed46c01d...|20200430|
|         

In [7]:
input_data.where('date == 20200430').show()

+--------------------+----------+--------------------+----------+-------------+--------------------+--------+
|            category|event_type|             item_id|item_price|    timestamp|                 uid|    date|
+--------------------+----------+--------------------+----------+-------------+--------------------+--------+
|             Luggage|      view|          Luggage-15|       701|1588194000000|                null|20200430|
|    Everyday-jewelry|       buy|  Everyday-jewelry-8|      2024|1588194060000|                null|20200430|
|             Luggage|       buy|           Luggage-5|      2443|1588194060000|                null|20200430|
|House-repairs-pai...|      view|House-repairs-pai...|      1440|1588194120000|                null|20200430|
|           Cosmetics|       buy|        Cosmetics-10|      4817|1588194120000|                null|20200430|
|House-repairs-pai...|       buy|House-repairs-pai...|      2937|1588194180000|                null|20200430|
|House-rep

In [236]:
input_data = spark \
    .read \
    .format("kafka") \
    .options(**kafka_params) \
    .load()
input_data.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [237]:
input_data.show()

+----+--------------------+----------------+---------+------+--------------------+-------------+
| key|               value|           topic|partition|offset|           timestamp|timestampType|
+----+--------------------+----------------+---------+------+--------------------+-------------+
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182540|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182541|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182542|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182543|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182544|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182545|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab03_input_data|        0|182546|2020-10-26 12:24:...|            0|
|null|[7B 22 65 76 65 6...|lab

In [255]:
input_data.withColumn('offset', col('offset').cast(LongType())).groupby().agg({'offset': 'min'}).collect()

[Row(min(offset)=182540)]

In [204]:
input_data.cache()

DataFrame[category: string, event_type: string, item_id: string, item_price: string, timestamp: string, uid: string, date: string]

In [205]:
input_data.show(3,10)

+----------+----------+----------+----------+----------+----------+--------+
|  category|event_type|   item_id|item_price| timestamp|       uid|    date|
+----------+----------+----------+----------+----------+----------+--------+
|Enterta...|       buy|Enterta...|      2529|1577865...|40b2957...|20200101|
|Everyda...|       buy|Everyda...|      4320|1577865...|      null|20200101|
|   Cameras|      view| Cameras-1|      1856|1577865...|ab0e7dd...|20200101|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 3 rows



In [184]:
input_data.count()

182540

In [186]:
input_data.where('event_type == "view"').count()

117498

In [187]:
input_data.where('event_type == "buy"').count()

65042

In [180]:
spark.read.json("visits/view/date=20200429").count()

947

In [None]:
input_data \
    .filter('event_type == "view"') \
    .write \
    .format("json") \
    .partitionBy('date') \
    .mode("overwrite") \
    .save("visits/view/")

input_data \
    .filter('event_type == "buy"') \
    .write \
    .format("json") \
    .partitionBy('date') \
    .mode("overwrite") \
    .save("visits/buy/")

In [168]:
offset = 'earliest'

In [206]:
path = 'visits'

In [212]:
for event_type in ['view', 'buy']:
    input_data \
        .filter('event_type == "%s"' % event_type) \
        .write \
        .format("json") \
        .partitionBy('date') \
        .mode("overwrite") \
        .save(os.path.join(path, event_type))

In [83]:
input_data.show(3, 10)

+----------+----------+----------+----------+----------+----------+--------+
|  category|event_type|   item_id|item_price| timestamp|       uid|    date|
+----------+----------+----------+----------+----------+----------+--------+
|Enterta...|       buy|Enterta...|      2529|1577865...|40b2957...|20200101|
|Everyda...|       buy|Everyda...|      4320|1577865...|      null|20200101|
|   Cameras|      view| Cameras-1|      1856|1577865...|ab0e7dd...|20200101|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 3 rows



In [80]:
input_data \
    .select() \
    .show()

+-------------------------------------------+
|date_format(to_date(`timestamp`), yyyyMMdd)|
+-------------------------------------------+
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                   20200101|
|                                 

In [139]:
data_view = input_data \
    .filter('event_type == "view"') \
    .repartition(int(cnt.loc['view']), 'date')

data_buy = input_data \
    .filter('event_type == "buy"') \
    .repartition(int(cnt.loc['buy']), 'date')

In [142]:
data_view.show(1, 10)

+--------+----------+----------+----------+----------+----+--------+
|category|event_type|   item_id|item_price| timestamp| uid|    date|
+--------+----------+----------+----------+----------+----+--------+
|Clothing|      view|Clothin...|      1803|1578344...|null|20200107|
+--------+----------+----------+----------+----------+----+--------+
only showing top 1 row



In [87]:
data_view.show(3, 10)

+----------+----------+----------+----------+----------+----------+--------+
|  category|event_type|   item_id|item_price| timestamp|       uid|    date|
+----------+----------+----------+----------+----------+----------+--------+
|   Cameras|      view| Cameras-1|      1856|1577865...|ab0e7dd...|20200101|
|   Luggage|      view| Luggage-7|      4975|1577865...|      null|20200101|
|Mobile-...|      view|Mobile-...|      3981|1577865...|d120632...|20200101|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 3 rows

