In [14]:
from pyspark.sql.functions import udf

In [1]:
import os
import sys
# Tell PySpark which interpreter, Spark installation and submit arguments to use.
# These must be in the environment before pyspark is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 --executor-memory 2g --executor-cores 1 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Put the bundled py4j archive first and Spark's python package second on the
# import path (same final order as two successive insert(0, ...) calls).
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.ml.feature import HashingTF, StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.param.shared import HasOutputCol, HasInputCol
from pyspark import keyword_only

from pyspark.sql.types import LongType, StringType, StructType, StructField, IntegerType, FloatType, ArrayType, LongType
import pyspark.sql.functions as f

# SparkConf.set returns the conf itself, so the app name can be set inline.
conf = SparkConf().set("spark.app.name", "lab4_5")

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
# Location of the labelled training dataset (read below as tab-separated text).
data_path = "/labs/slaba04/gender_age_dataset.txt"

In [4]:
# All four columns of the training file are read as plain strings;
# `user_json` is parsed into a structured value in a later step.
schema = StructType(fields=[
    StructField(column_name, StringType())
    for column_name in ("gender", "age", "uid", "user_json")
])

In [22]:
# Load the tab-separated training file with the explicit schema; the first line is a header.
df = (
    spark.read
    .schema(schema)
    .options(sep='\t', header=True)
    .csv(data_path)
)

In [23]:
# Preview the first two rows of the raw training data.
df.show(2)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 2 rows



In [24]:
# Shape of the `user_json` payload: {"visits": [{"url": ..., "timestamp": ...}, ...]}
json_schema = StructType([
    StructField(
        "visits",
        ArrayType(StructType([
            StructField("url", StringType(), nullable=True),
            StructField("timestamp", LongType(), nullable=True),
        ])),
    ),
])

In [25]:
# Keep only rows whose gender is a known class ("F"/"M") and build the combined
# "age:gender" label that is later mapped to a numeric target.
# Written as a parenthesised chain: the previous version ended in a dangling
# backslash, which only parsed because a blank line happened to follow it.
df = (
    df
    .filter(f.col("gender").isin(["F", "M"]))
    .withColumn("concat_target", f.concat(f.col("age"), f.lit(":"), f.col("gender")))
)

In [26]:
# Number of rows left after the gender filter.
df.count()

36138

In [27]:
# Build one list of visited host names per user:
#   1. parse the JSON payload,
#   2. explode the visit URLs into one row per visit,
#   3. reduce every URL to its host part,
#   4. gather the hosts back into a list per uid.
clear_urls = (
    df
    .withColumn("url_array", f.from_json(f.col("user_json"), json_schema))
    .withColumn("url", f.explode(f.col("url_array.visits.url")))
    .withColumn("parsed_url", f.expr("parse_url(url, 'HOST')"))
    .groupBy("uid")
    .agg(f.collect_list(f.col("parsed_url")).alias("clear_urls"))
    .select("uid", "clear_urls")
)

In [28]:
# Attach the per-user host lists to the labelled rows (inner join on uid, the default join type).
df = df.join(clear_urls, on="uid", how="inner")

In [29]:
# Numeric class index for every "age:gender" label: female buckets take 0-4,
# male buckets take 5-9, each in ascending age order.
encode = {
    label: index
    for index, label in enumerate([
        "18-24:F", "25-34:F", "35-44:F", "45-54:F", ">=55:F",
        "18-24:M", "25-34:M", "35-44:M", "45-54:M", ">=55:M",
    ])
}

In [30]:
def encode_target(s):
    """Return the numeric class index for an "age:gender" label.

    Looks the label up in the module-level ``encode`` mapping; a label that is
    not a key of that mapping raises ``KeyError``.
    """
    return encode[s]


# Spark UDF wrapper so the lookup can be applied to a DataFrame column.
encode_target_udf = udf(encode_target, returnType=LongType())

In [31]:
# Numeric label column consumed by the classifier (labelCol="target").
df = df.withColumn("target", encode_target_udf(f.col("concat_target")))

In [32]:
# Hash each user's list of host names into a 15000-dimensional term-frequency vector.
hashingTF = HashingTF(inputCol="clear_urls", outputCol="features", numFeatures=15000, binary=False)

# Random forest over the hashed features.
rf = RandomForestClassifier(
    labelCol="target",
    featuresCol=hashingTF.getOutputCol(),
    numTrees=20,
    # Fixed seed so that re-fitting after a kernel restart yields the same forest.
    # NOTE(review): without an explicit seed PySpark falls back to a default that
    # is not guaranteed to be stable across Python processes - confirm for the
    # Spark version in use.
    seed=42,
)

In [33]:
# Two-stage pipeline: feature hashing followed by the random-forest classifier.
pipeline = Pipeline(stages=[hashingTF, rf])

In [34]:
# Fit the HashingTF -> RandomForest pipeline on the labelled DataFrame.
# Despite its name, `rf_model` is the fitted pipeline (both stages), which is why
# it is later applied directly to frames that only carry `uid` and `clear_urls`.
rf_model = pipeline.fit(df)

## Kafka batch

In [35]:
# Options for the one-off (batch) read of the input topic.
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",  # broker address
    "subscribe": "input_alexander.zhukov",                           # topic to read
    "startingOffsets": "earliest",                                   # read from the beginning of the topic
    "failOnDataLoss": "False",                                       # option values are passed as strings
}

In [36]:
# Batch-read the Kafka topic and cache the result, since it is reused by the cells below.
kafka_sdf = (
    spark.read
    .format("kafka")
    .options(**read_kafka_params)
    .load()
    .cache()
)

In [37]:
# Outer Kafka message: `visits` is declared as a string here because it is
# parsed separately, in a second from_json pass, using `visit_schema`.
event_schema = StructType([
    StructField("uid", StringType(), nullable=True),
    StructField("visits", StringType(), nullable=True),
])

# Structure of the `visits` string once parsed: an array of {url, timestamp} records.
visit_schema = ArrayType(StructType([
    StructField("url", StringType(), nullable=True),
    StructField("timestamp", LongType(), nullable=True),
]))

In [38]:
# Decode the Kafka payload in two passes: first the outer event, then the
# `visits` string it carries, and finally keep only uid plus the array of URLs.
clean_df = (
    kafka_sdf
    .select(f.col("value").cast("string").alias("value"))
    .select(f.from_json(f.col("value"), event_schema).alias("event"))
    .select(
        f.col("event.uid").alias("uid"),
        f.from_json(f.col("event.visits"), visit_schema).alias("url_array"),
    )
    .select("uid", f.col("url_array.url").alias("url"))
)

In [39]:
# Reduce every visited URL to its host name, producing the `clear_urls` input
# column expected by the fitted pipeline.
# URLs for which parse_url yields NULL are dropped: the training features were
# built with collect_list, which ignores NULLs, so keeping them here would feed
# the model a term it never saw during training.
clean_df = clean_df.select(
    "uid",
    f.expr(
        "filter(transform(url, url -> parse_url(url, 'HOST')), host -> host is not null)"
    ).alias("clear_urls"),
)

In [40]:
# Score the batch of Kafka events with the fitted pipeline.
prediction = rf_model.transform(clean_df)

In [42]:
decode = {0: "18-24:F",
          1: "25-34:F",
          2: "35-44:F",
          3: "45-54:F",
          4: ">=55:F",
          5: "18-24:M",
          6: "25-34:M",
          7: "35-44:M",
          8: "45-54:M",
          9: ">=55:M"}

In [43]:
def decode_target(s):
    """Return the "age:gender" label for a predicted class index.

    Looks the index up in the module-level ``decode`` mapping; an index that is
    not a key of that mapping raises ``KeyError``.
    """
    return decode[s]


# Spark UDF wrapper so the lookup can be applied to the prediction column.
decode_target_udf = udf(decode_target, returnType=StringType())

In [44]:
# Turn the numeric prediction back into its "age:gender" label, split it on ":"
# (element 0 is the age bucket, element 1 the gender) and keep only the output columns.
result = (
    prediction
    .withColumn("test", f.split(decode_target_udf(f.col("prediction")), ":"))
    .withColumn("gender", f.col("test").getItem(1))
    .withColumn("age", f.col("test").getItem(0))
    .select("uid", "gender", "age")
)

In [46]:
# Sanity-check the decoded batch predictions.
result.show(5)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|bd7a30e1-a25d-4cb...|     M|25-34|
|bd7a6f52-45db-49b...|     M|25-34|
|bd7a7fd9-ab06-42f...|     M|25-34|
|bd7c5d7a-0def-41d...|     M|25-34|
|bd7e54a2-0215-45c...|     M|25-34|
+--------------------+------+-----+
only showing top 5 rows



In [47]:
def kill_all():
    """Stop every active streaming query of the current SparkSession.

    Prints one "Stopped ..." line per query. The label is the description of
    the query's first source when a progress report is available; otherwise it
    falls back to the query name, and then to the query id.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        # lastProgress is None until the query has reported its first progress
        # update; indexing it unconditionally would raise before stop() runs
        # and leave the stream alive.
        progress = query.lastProgress
        sources = progress["sources"] if progress else []
        desc = sources[0]["description"] if sources else (query.name or query.id)
        query.stop()
        print("Stopped {s}".format(s=desc))

## Streaming

In [49]:
# Options for the streaming read: same topic as the batch read earlier in the
# notebook, but starting from the newest offsets.
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",  # broker address
    "subscribe": "input_alexander.zhukov",                           # input topic
    "startingOffsets": "latest",                                     # only consume new messages
    "failOnDataLoss": "False",                                       # option values are passed as strings
}

# Options for the Kafka sink that receives the predictions.
write_kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",  # broker address
    "topic": "alexander.zhukov",                                     # output topic
}

In [50]:
# Same two schemas as in the batch section, repeated here for the streaming section.
# Outer Kafka message: `visits` stays a string and is parsed in a second pass.
event_schema = StructType([
    StructField("uid", StringType(), nullable=True),
    StructField("visits", StringType(), nullable=True),
])

# Parsed form of `visits`: an array of {url, timestamp} records.
visit_schema = ArrayType(StructType([
    StructField("url", StringType(), nullable=True),
    StructField("timestamp", LongType(), nullable=True),
]))

In [51]:
# Open the input topic as an unbounded streaming DataFrame.
sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)

In [52]:
# Decode the streaming Kafka payload in two passes (outer event, then the
# `visits` string it carries) and reduce every visited URL to its host name,
# producing the `clear_urls` input column expected by the fitted pipeline.
# URLs for which parse_url yields NULL are dropped: the training features were
# built with collect_list, which ignores NULLs, so keeping them here would feed
# the model a term it never saw during training.
clean_df = (
    sdf
    .select(f.col("value").cast("string").alias("value"))
    .select(f.from_json(f.col("value"), event_schema).alias("event"))
    .select(
        f.col("event.uid").alias("uid"),
        f.from_json(f.col("event.visits"), visit_schema).alias("url_array"),
    )
    .select("uid", f.col("url_array.url").alias("url"))
    .select(
        "uid",
        f.expr(
            "filter(transform(url, url -> parse_url(url, 'HOST')), host -> host is not null)"
        ).alias("clear_urls"),
    )
)

In [53]:
# Score the streaming events with the fitted pipeline.
prediction = rf_model.transform(clean_df)

In [54]:
# Class index -> "age:gender" label; indices 0-4 are the female age buckets,
# 5-9 the male ones. This repeats the mapping and helper from the batch section.
decode = dict(enumerate([
    "18-24:F", "25-34:F", "35-44:F", "45-54:F", ">=55:F",
    "18-24:M", "25-34:M", "35-44:M", "45-54:M", ">=55:M",
]))


def decode_target(s):
    """Return the "age:gender" label for a predicted class index.

    Looks the index up in the module-level ``decode`` mapping; an index that is
    not a key of that mapping raises ``KeyError``.
    """
    return decode[s]


# Spark UDF wrapper so the lookup can be applied to the prediction column.
decode_target_udf = udf(decode_target, returnType=StringType())

In [55]:
# Turn the numeric prediction back into its "age:gender" label, split it on ":"
# (element 0 is the age bucket, element 1 the gender) and keep only the output columns.
# Note: this rebinds `prediction`, so the cell cannot be re-run on its own output.
prediction = (
    prediction
    .withColumn("test", f.split(decode_target_udf(f.col("prediction")), ":"))
    .withColumn("gender", f.col("test").getItem(1))
    .withColumn("age", f.col("test").getItem(0))
    .select("uid", "gender", "age")
)

In [56]:
# Column expression that serialises all columns of a row into one JSON string.
doc = f.to_json(f.struct(f.col("*")))

In [35]:
# Pack each prediction row into a single JSON `value` column for the Kafka sink.
result = prediction.select(doc.alias("value"))

In [36]:
# Start streaming the JSON predictions to the output topic in append mode.
# The StreamingQuery handle is kept in `query` (it used to be discarded) so the
# stream can be inspected or stopped later, e.g. with query.stop().
query = (
    result.writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("append")
    .start()
)
query

<pyspark.sql.streaming.StreamingQuery at 0x7f8c12fe16a0>

In [None]:
# Shut down the Spark session.
# NOTE(review): presumably meant to be run manually once the streaming job is no
# longer needed - stopping the session should also end any queries still running.
spark.stop()