In [1]:
import os
import sys
# Environment PySpark reads at start-up: the worker Python interpreter, the
# Spark installation directory and the arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 4 pyspark-shell',
})
spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources (and then its py4j archive) at the front
# of sys.path so the `pyspark` import below resolves against SPARK_HOME.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

from pyspark.sql import SparkSession

# Create the session for this lab, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("nikita.mospan lab4")
    .getOrCreate()
)

In [2]:
# Location of the saved pipeline that is loaded with PipelineModel.load below.
lab4ModelPath = "lab4_model"
# Location of the parquet data that is read into labelToAgeGenderDf below.
labelToGenderAgePath = "lab4_label_to_age_gender"

In [9]:
from pyspark.ml import PipelineModel
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StringType, StructField, LongType
# Run the VisitsTransformer notebook; the lines below rely on it defining
# the name `VisitsTransformer`.
%run 'VisitsTransformer.ipynb'
# Bind the class explicitly as an attribute of the __main__ module.
# NOTE(review): presumably required so the saved pipeline can resolve its
# custom stage when it is loaded — confirm against VisitsTransformer.ipynb.
m = __import__("__main__")
setattr(m, 'VisitsTransformer', VisitsTransformer)

In [4]:
# Load the saved pipeline from lab4ModelPath; processBatch applies it to every micro-batch.
model = PipelineModel.load(lab4ModelPath)

In [5]:
# Lookup table joined on "label" inside processBatch.
labelToAgeGenderDf = spark.read.parquet(labelToGenderAgePath)

In [11]:
# Options for the Kafka sink: broker address and the topic that receives predictions.
writeKafkaParams = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "topic": "nikita.mospan",
}


def processBatch(df, epoch_id):
    """Score one micro-batch and publish the result to Kafka.

    The batch is run through ``model``; its ``prediction`` column is renamed
    to ``label`` and joined with ``labelToAgeGenderDf`` on that column.
    Only ``uid``, ``gender`` and ``age`` are kept, and each row is serialised
    to a JSON string in a single ``value`` column before being appended to
    the topic configured in ``writeKafkaParams``.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Batch identifier passed in by ``foreachBatch``; not used.
    """
    scoredDf = model.transform(df).withColumnRenamed("prediction", "label")
    decodedDf = scoredDf.join(labelToAgeGenderDf, "label").select("uid", "gender", "age")
    jsonDf = decodedDf.selectExpr("to_json(struct(*)) AS value")
    kafkaWriter = jsonDf.write.format("kafka").options(**writeKafkaParams).mode("append")
    kafkaWriter.save()

In [12]:
# Recursively remove the lab4_checkpoint directory from HDFS. The streaming
# query below uses the same path as its checkpointLocation, so it starts
# without any previously saved checkpoint.
! hdfs dfs -rm -r lab4_checkpoint

21/03/13 12:18:11 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/nikita.mospan/lab4_checkpoint' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/nikita.mospan/.Trash/Current/user/nikita.mospan/lab4_checkpoint1615627091904


In [13]:
# Options for the Kafka source: broker address, the input topic to subscribe
# to, and where to start reading when there is no checkpoint.
readKafkaParams = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_nikita.mospan",
    "startingOffsets": "latest"
}

# Schema of the JSON document carried in each Kafka message value.
# Both fields are parsed as plain strings.
kafkaValueSchema = StructType([
    StructField("uid", StringType()),
    StructField("visits", StringType())
])

# Decode the Kafka value as a string, parse it as JSON and flatten the
# resulting struct into top-level `uid` / `visits` columns.
parsedStreamDf = (
    spark.readStream.format("kafka").options(**readKafkaParams).load()
    .select(
        F.from_json(F.col("value").cast("string"), kafkaValueSchema)
        .alias("jsonData")
    )
    .select("jsonData.*")
)

# Start the query and keep its handle so it can be stopped explicitly.
query = (
    parsedStreamDf.writeStream.foreachBatch(processBatch)
    .trigger(processingTime='5 seconds')
    .option("checkpointLocation", "lab4_checkpoint")
    .start()
)

try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only unblocks awaitTermination(); it does not stop
    # the query itself. Stop it here so a re-run of this cell does not start a
    # second query against the same checkpoint location.
    query.stop()

+---+------+-------+--------+-------------+-----------+-----+
|uid|visits|domains|features|rawPrediction|probability|label|
+---+------+-------+--------+-------------+-----------+-----+
+---+------+-------+--------+-------------+-----------+-----+

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|                 uid|              visits|             domains|            features|       rawPrediction|         probability|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|bd7a30e1-a25d-4cb...|[{"url": "http://...|[interfax.ru, ame...|(1000,[1,7,8,9,10...|[4.29197537273163...|[0.21459876863658...|  2.0|
|bd7a6f52-45db-49b...|[{"url": "https:/...|[packagetrackr.co...|(1000,[1,8,9,15,2...|[4.74839157933060...|[0.23741957896653...|  0.0|
|bd7a7fd9-ab06-42f...|[{"url": "http://...|[mk.ru, mk.ru, mk...|   (1000,[87],[3.

KeyboardInterrupt: 

In [None]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()