# Init

In [5]:
import os
import sys
# Environment consumed by the PySpark bootstrap below: interpreter used by the
# workers, Spark installation directory, and spark-submit arguments.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark and py4j sources bundled with the Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run Spark's interactive-shell bootstrap in this namespace; it prints the
# banner and exposes the session as `spark`. The file is read inside a
# context manager so the handle is closed deterministically.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [6]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Resource settings requested for this lab's Spark application.
conf = SparkConf()
conf.set("spark.executor.instances", "6")
conf.set("spark.executor.cores", "3")
conf.set("spark.executor.memory", "6g")
conf.set("spark.driver.cores", "3")
# NOTE: in client mode the driver JVM is already running at this point, so the
# driver memory setting cannot take effect here; pass `--driver-memory` via
# PYSPARK_SUBMIT_ARGS if it is really needed.
conf.set("spark.driver.memory", "6g")

# The shell bootstrap has already created a session. `getOrCreate()` would
# simply return it and silently ignore the executor settings above, so stop the
# existing session first to force a new SparkContext built from `conf`.
# (Any `sc` / `sqlContext` handles created by the bootstrap become stale.)
existing_session = globals().get("spark")
if existing_session is not None:
    existing_session.stop()

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Lab 04 Streaming")
    .getOrCreate()
)

In [None]:
# Print the beginning of the raw training file stored in HDFS (shell command run via IPython `!`).
!hdfs dfs -head /labs/slaba04/gender_age_dataset.txt

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import HashingTF, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Model

In [None]:
# Column layout applied when reading the training file: four string fields.
schema = (
    StructType()
    .add('gender', StringType())
    .add('age', StringType())
    .add('uid', StringType())
    .add('user_json', StringType())
)

In [None]:
# Load the tab-separated training file using the explicit `schema`.
# The former `inferSchema` option was dropped: it is ignored whenever an
# explicit schema is supplied, so it only misled the reader.
train_data = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .option("delimiter", "\\t")
    .load("/labs/slaba04/gender_age_dataset.txt")
)

In [None]:
# Preview two raw rows, one field per line, cutting values at 200 characters.
train_data.show(n=2, truncate=200, vertical=True)

In [None]:
# Schema used to parse the `user_json` column: an object with a `visits`
# array whose items carry a `url` string and a `timestamp` long.
visits_schema = StructType().add(
    "visits",
    ArrayType(
        StructType()
        .add("url", StringType(), True)
        .add("timestamp", LongType(), True)
    ),
)

In [None]:
# Turn every (user, visit) pair into a row holding the visited host, and keep
# only users whose age and gender labels are known.
#
# Fix: the closing parenthesis of `regexp_replace` used to sit before the
# pattern/replacement arguments, so `regexp_replace` received one argument and
# `withColumn` received four (TypeError). The redundant `.alias(...)` calls
# inside `withColumn` were removed as well.
#
# NOTE(review): the "." in "^www." is an unescaped regex wildcard. It is kept
# byte-identical on purpose so training features stay aligned with the
# streaming function, which uses the same pattern; change both together.
clear_data = (
    train_data
    .withColumn('visits', from_json(col('user_json'), schema=visits_schema))
    .withColumn('visit', explode('visits.visits'))
    .withColumn(
        'host',
        regexp_replace(expr('parse_url(visit.url, "HOST")'), "^www.", ""),
    )
    .filter("age != '-'")
    .filter("gender != '-'")
    .drop('visits', 'visit', 'user_json')
)

In [None]:
# Preview two cleaned rows, one field per line, cutting values at 200 characters.
clear_data.show(n=2, truncate=200, vertical=True)

In [None]:
# Collapse the per-visit rows back to one row per user, gathering all of the
# user's hosts into a single `hosts` list.
hosts_per_user = collect_list("host").alias("hosts")
group_data = clear_data.groupBy("gender", "age", "uid").agg(hosts_per_user)

In [None]:
# Preview two grouped rows, one field per line, cutting values at 200 characters.
group_data.show(n=2, truncate=200, vertical=True)

In [None]:
# Hash each user's host list into a 10000-dimensional term-frequency vector.
hashing_TF = HashingTF(
    numFeatures=10000,
    binary=False,
    inputCol="hosts",
    outputCol="rawFeatures",
)

In [None]:
# Fitted label indexer: maps the `age` strings to numeric `ageIndex` values.
indexer_age = (
    StringIndexer(outputCol="ageIndex", inputCol="age")
    .fit(group_data)
)

In [None]:
# Fitted label indexer: maps the `gender` strings to numeric `genderIndex` values.
indexer_gender = (
    StringIndexer(outputCol="genderIndex", inputCol="gender")
    .fit(group_data)
)

In [None]:
# Random forest predicting the indexed age label from the hashed host features.
# Output column names are prefixed with `age_` so they do not collide with the
# gender classifier's outputs in the same pipeline.
rfc_age = RandomForestClassifier(
    labelCol="ageIndex",
    featuresCol="rawFeatures",
    predictionCol="age_index_prediction",
    probabilityCol="age_probability",
    rawPredictionCol="age_index_raw_prediction",
)

In [None]:
# Random forest predicting the indexed gender label from the hashed host features.
# Output column names are prefixed with `gender_` so they do not collide with
# the age classifier's outputs in the same pipeline.
rfc_gender = RandomForestClassifier(
    labelCol="genderIndex",
    featuresCol="rawFeatures",
    predictionCol="gender_index_prediction",
    probabilityCol="gender_probability",
    rawPredictionCol="gender_index_raw_prediction",
)

In [None]:
# Map the numeric predictions back to the original label strings. The
# streaming step selects `PredictedAge` / `PredictedGender` from the saved
# model's output, and without these stages those columns would not exist.
converter_age = IndexToString(
    inputCol="age_index_prediction",
    outputCol="PredictedAge",
    labels=indexer_age.labels,
)
converter_gender = IndexToString(
    inputCol="gender_index_prediction",
    outputCol="PredictedGender",
    labels=indexer_gender.labels,
)

# Full training pipeline: features -> label indexing -> two classifiers ->
# human-readable predictions.
pipeline = Pipeline(stages=[
    hashing_TF,
    indexer_age,
    indexer_gender,
    rfc_age,
    rfc_gender,
    converter_age,
    converter_gender,
])

In [None]:
# Train every pipeline stage on the per-user host lists.
model = pipeline.fit(dataset=group_data)

In [None]:
# A fitted PipelineModel has no `show()` method; preview what it produces by
# applying it to the training data and showing two transformed rows.
model.transform(group_data).show(2, 200, vertical=True)

In [None]:
# Persist the fitted pipeline, replacing any model previously saved at this path.
model_writer = model.write().overwrite()
model_writer.save("/user/andrey.blednykh/labs/lab04_model")

# Kafka

In [None]:
# Options handed to the Kafka streaming reader via `.options(**input_kafka_params)`.
input_kafka_params = {
    # Where to connect and which topic to consume.
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "subscribe": "input_andrey.blednykh",
    # Read position and per-trigger batch cap.
    "maxOffsetsPerTrigger": "5",
    "startingOffsets": "earliest",
}

In [None]:
# Options handed to the Kafka batch writer via `.options(**write_kafka_params)`.
write_kafka_params = {
    # Output topic for the predictions.
    "topic": "andrey.blednykh",
    # Broker to connect to.
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
}

In [None]:
# Streaming DataFrame over the input Kafka topic.
sdf = (
    spark.readStream
    .format("kafka")
    .options(**input_kafka_params)
    .load()
)

In [None]:
# Schema of one incoming Kafka message: the user id plus the visits payload,
# which is kept as a string here and parsed separately with `visit_type`.
event_type = (
    StructType()
    .add("uid", StringType(), True)
    .add("visits", StringType(), True)
)

In [None]:
# Schema of the `visits` payload: an array of {url, timestamp} records.
visit_type = ArrayType(
    StructType()
    .add("url", StringType(), True)
    .add("timestamp", LongType(), True)
)

In [None]:
def foreachBatchFunction(batch_df, batch_id):
    """Score one micro-batch of Kafka events and publish the predictions.

    Each message's `value` is decoded as JSON with `event_type`, its `visits`
    payload is parsed with `visit_type`, and the visited hosts are collected
    per `uid`. The hosts are fed to the module-level `lab_model`, and the
    resulting `uid` / `gender` / `age` rows are serialised to JSON and written
    to the Kafka topic configured in `write_kafka_params`.

    Args:
        batch_df: DataFrame holding the micro-batch read from Kafka.
        batch_id: Identifier of the micro-batch (unused; required by the
            `foreachBatch` callback signature).

    Note:
        `lab_model` is expected to emit `PredictedGender` and `PredictedAge`
        columns. Streamed events carry no `age` / `gender` labels, so no
        label filtering is applied here (filtering on those columns would fail
        because they do not exist in the stream).
    """
    # Kafka delivers bytes: decode, then unpack the event and its visits array.
    parsed_df = (
        batch_df
        .select(col("value").cast("string").alias("value"))
        .select(from_json(col("value"), event_type).alias("data"))
        .select("data.*")
        .select("uid", from_json(col("visits"), visit_type).alias("visits"))
    )

    # One row per visit -> host name -> list of hosts per user. The host
    # normalisation pattern is the same one used when building training data.
    hosts_df = (
        parsed_df
        .withColumn("visit", explode("visits"))
        .withColumn(
            "host",
            regexp_replace(expr("parse_url(visit.url, 'HOST')"), "^www.", ""),
        )
        .drop("visits", "visit")
        .groupBy("uid")
        .agg(collect_list("host").alias("hosts"))
    )

    predict_df = (
        lab_model
        .transform(hosts_df)
        .select("uid", "PredictedGender", "PredictedAge")
        .withColumnRenamed("PredictedAge", "age")
        .withColumnRenamed("PredictedGender", "gender")
    )

    # `write` is a property (not a method); the sink is chosen via `.format`.
    (
        predict_df
        .select(to_json(struct(*predict_df.columns)).alias("value"))
        .write
        .format("kafka")
        .options(**write_kafka_params)
        .mode("append")
        .save()
    )

In [None]:
# Reload the fitted pipeline saved earlier; the streaming callback reads this global.
lab_model = PipelineModel.read().load("/user/andrey.blednykh/labs/lab04_model")

In [None]:
# Start the streaming query: every micro-batch is handed to
# `foreachBatchFunction`. The chain is wrapped in parentheses because the
# previous backslash-continued form was missing a continuation after
# `.option(...)`, which made the cell a SyntaxError.
(
    sdf.writeStream
    .foreachBatch(foreachBatchFunction)
    .option('checkpointLocation', 'streaming/chk/chk_andrey_blednykh')
    .start()
)

# Final

In [None]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query. The label is the description of
    the query's first source when progress information is available;
    otherwise the query's name or id is used. `lastProgress` is `None` until a
    query has reported progress, so it must not be subscripted blindly, or the
    query would never be stopped.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for stream in streams:
        progress = stream.lastProgress or {}
        sources = progress.get("sources") or []
        if sources:
            desc = sources[0]["description"]
        else:
            desc = stream.name or stream.id
        stream.stop()
        print("Stopped {s}".format(s=desc))

In [None]:
# Stop all active streaming queries before shutting Spark down.
kill_all()

In [7]:
# Shut down the Spark session used by this notebook.
spark.stop()