## Лаба 4. Прогнозирование пола и возрастной категории — Spark Streaming

In [149]:
import os
import sys
import glob

# Point PySpark at the Python interpreter and Spark 2 client used by this notebook.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
# Arguments handed to spark-submit when PySpark launches its JVM.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 8 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Discover the py4j source zip bundled with this Spark install instead of
# hard-coding its version, so the bootstrap keeps working when the client is
# upgraded. An install normally ships a single py4j zip; if several match, the
# lexicographically last one is used. When nothing matches (e.g. SPARK_HOME is
# not present on this machine) fall back to the previously hard-coded path.
py4j_candidates = sorted(
    glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))
)
if py4j_candidates:
    py4j_zip = py4j_candidates[-1]
else:
    py4j_zip = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Put Spark's Python bindings and py4j at the front of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, py4j_zip)

In [150]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this lab, pulling in the Kafka
# connector package for Spark SQL.
# NOTE(review): PYSPARK_SUBMIT_ARGS set earlier in this notebook asks for
# 8 executors and a 2g driver, while this builder asks for local[2] and a
# 512m driver — confirm which settings are actually intended.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Laba_4")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

In [151]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier


from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType, LongType

from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number, explode, expr, split
from pyspark.sql.functions import array, collect_set, lit, from_json, to_json, struct, regexp_replace
from pyspark.mllib.linalg import SparseVector, DenseVector



import json
import re

In [152]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [153]:
# Location of the tab-separated training dump.
path = '/labs/slaba04/gender_age_dataset.txt'

# All four columns are read as nullable strings; user_json is parsed later.
schema = StructType([
    StructField("gender", StringType(), True),
    StructField("age", StringType(), True),
    StructField("uid", StringType(), True),
    StructField("user_json", StringType(), True),
])

# The file has a header row and uses tabs as the field separator.
train_data = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("sep", "\t")
    .option("header", True)
    .load(path)
)

In [154]:
# Peek at the first rows to confirm the file parsed into the four schema columns.
train_data.show(5)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 5 rows



In [155]:
# Row count before filtering on gender.
train_data.count()

41138

In [156]:
# Keep only rows whose gender is not '-' (presumably the marker for
# unlabeled users — confirm against the dataset description).
train_data = train_data.filter(col('gender') != '-')

In [157]:
# Row count after the gender filter.
train_data.count()

36138

In [158]:
# Schema of the user_json payload: {"visits": [{"url": ..., "timestamp": ...}, ...]}.
VisitsType = StructType().add(
    "visits",
    ArrayType(
        StructType()
        .add("url", StringType(), True)
        .add("timestamp", LongType(), True)
    ),
)

In [159]:
# Parse the visit JSON, emit one row per visit, and derive the visited domain.
# NOTE(review): 'www.' is a regular expression — the dot matches any character
# and the pattern is not anchored, so "www" plus the following character is
# removed wherever it occurs in the host. The Kafka data later in this notebook
# is normalised with the same pattern; if it is tightened (e.g. to r'^www\.'),
# change both places together and retrain.
clean_df = (
    train_data
    .withColumn("visits", from_json(col("user_json"), VisitsType))
    .withColumn("visit", explode("visits.visits"))
    .withColumn("host", expr("parse_url(visit.url, 'HOST')"))
    .withColumn('domain', regexp_replace('host', 'www.', ''))
)

In [160]:
# Collapse the per-visit rows into one row per (gender, age, uid) holding the
# set of distinct domains for that user.
clean_df_res = (
    clean_df
    .groupBy("gender", "age", "uid")
    .agg(collect_set("domain").alias("domains"))
)

In [161]:
# Show two rows, untruncated, in vertical (one field per line) layout.
clean_df_res.show(2,False,True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender  | F                                                                                                                                                                           
 age     | 18-24                                                                                                                                                                       
 uid     | 09b1ecd3-b2d2-4c1b-857a-025c0509d9ec                                                                                                                                        
 domains | [tankionline.com]                                                                                                                                                           
-RECORD 1-----------------------------------------------------------------------

In [162]:
# Hash each user's domain set into a 1000-dimensional term-frequency vector.
# The same transformer instance is reused on the Kafka data later on.
hasher_freq = HashingTF(
    inputCol="domains",
    outputCol="domains_vector",
    numFeatures=1000,
    binary=False,
)
df_train_vector = hasher_freq.transform(clean_df_res)

In [169]:
# Normalise the hashed vectors (no `p` is passed, so Normalizer's default norm
# applies). Reused unchanged for the Kafka data later on.
normalizer = Normalizer(
    inputCol='domains_vector',
    outputCol="domains_norm",
)
df_train_norm = normalizer.transform(df_train_vector)

In [170]:
# Encode the string labels as numeric strings: gender F/M -> 0/1 and the five
# age buckets -> 1/3/5/7/9. The predictions are decoded with the inverse
# mapping near the end of the notebook.
df_train_norm = (
    df_train_norm
    .replace(['F', 'M'], ['0','1'], "gender")
    .replace(['18-24', '25-34', '35-44', '45-54', '>=55'], ['1','3','5','7','9'], "age")
)

In [171]:
# Cast the encoded labels to numeric types and keep only the columns needed
# for training (the intermediate domains_vector column is dropped here).
df_train_norm = df_train_norm.select(
    col("gender").cast('float').alias("gender"),
    col("age").cast('int').alias("age"),
    col("uid"),
    col("domains"),
    col("domains_norm"),
)

In [172]:
# Inspect the final training frame: numeric labels plus normalised feature vectors.
df_train_norm.show()

+------+---+--------------------+--------------------+--------------------+
|gender|age|                 uid|             domains|        domains_norm|
+------+---+--------------------+--------------------+--------------------+
|   0.0|  1|09b1ecd3-b2d2-4c1...|   [tankionline.com]|  (1000,[509],[1.0])|
|   0.0|  1|15faf063-5e44-4b6...|[hotels.1001tur.r...|(1000,[43,268,293...|
|   0.0|  1|560142d9-6c9c-439...|[yandex.ru, vk.co...|(1000,[101,218,23...|
|   0.0|  1|6709f443-7ddd-423...|       [muzofon.com]|  (1000,[696],[1.0])|
|   0.0|  1|67e9bd68-ef03-49c...|[yandex.ru, tempf...|(1000,[402,706,77...|
|   0.0|  1|757ff5c2-ecdb-489...|[msn.com, dns-sho...|(1000,[416,459,82...|
|   0.0|  1|c430a9d4-5f48-47c...|[eporner.com, 100...|(1000,[553,646,67...|
|   0.0|  1|d1d59923-51d7-4a1...|[go.mail.ru, shop...|(1000,[667,988],[...|
|   0.0|  1|fca5deb7-77f4-4c4...|[sprashivai.ru, b...|(1000,[115,189],[...|
|   0.0|  3|0521da78-b729-4a0...|[mirknig.com, yon...|(1000,[95,671,835...|
|   0.0|  3|

In [173]:
# Gender classifier: logistic regression on the normalised domain vectors.
# NOTE(review): maxIter and regParam are hard-coded; presumably they come from
# a tuning run that is not part of this notebook.
lr_gendr = LogisticRegression(
    labelCol="gender",
    featuresCol='domains_norm',
    maxIter=29,
    regParam=0.020436539365475917,
)
lr_gendr_model = lr_gendr.fit(df_train_norm)


In [174]:
# Age-bucket classifier: random forest with default hyperparameters, trained on
# the integer age codes (1/3/5/7/9) assigned earlier in the notebook.
rf_age = RandomForestClassifier(
    featuresCol="domains_norm",
    labelCol="age",
)
rf_age_model = rf_age.fit(df_train_norm)

In [193]:
# Kafka connection settings used by every read and write below.
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
# Topic the events to score are read from.
KAFKA_INPUT_TOPIC = 'input_ekaterina.fisenko'
# Topic the predictions are written to.
KAFKA_OUTPUT_TOPIC = 'ekaterina.fisenko'

In [194]:
# Streaming source on the input topic; "latest" means only messages that
# arrive after the query starts are read.
read_kafka_params = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
    "subscribe": KAFKA_INPUT_TOPIC,
    "startingOffsets": "latest",
}
kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)

In [195]:
# Inspect the columns exposed by the Kafka source.
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [196]:
def create_console_sink(df):
    """Return a configured, not-yet-started console writer for *df*.

    The writer is set to the "console" format, triggered every 5 seconds,
    with truncation disabled and ``numRows`` set to 20. The caller is
    responsible for calling ``.start()`` on the returned writer.
    """
    writer = df.writeStream.format("console")
    writer = writer.trigger(processingTime="5 seconds")
    writer = writer.option("truncate", "false")
    return writer.option("numRows", "20")

In [197]:
# Decode the Kafka payload to text and keep the message coordinates, then
# stream the result to the console as a smoke test of the input topic.
parsed_sdf = kafka_sdf.select(
    col("value").cast("string"),
    "topic",
    "partition",
    "offset",
)

sink = create_console_sink(parsed_sdf)
sq = sink.start()

In [217]:
# Check whether the console streaming query is still running. The execution
# counters show this cell ran after the stop() cell below, hence False.
sq.isActive

False

In [216]:
# Stop the console streaming query started above.
sq.stop()

In [199]:
# Batch (non-streaming) read of the input topic from its earliest offsets,
# cached because the frame is used by several later cells.
kafka_read_df = (
    spark.read
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'subscribe': KAFKA_INPUT_TOPIC,
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
    .cache()
)

In [215]:
# Number of messages read from the input topic (this action also populates the cache).
kafka_read_df.count()

8877

In [201]:
# Outer message schema: `visits` is read as a string and parsed in a second
# step with visit_schema.
event_schema = (
    StructType()
    .add('uid', StringType(), True)
    .add('visits', StringType(), True)
)

# Schema of the `visits` string once parsed: an array of {url, timestamp}.
visit_schema = ArrayType(
    StructType()
    .add('url', StringType(), True)
    .add('timestamp', LongType(), True)
)

In [202]:
# Decode each Kafka message: bytes -> JSON text -> event struct, then parse the
# nested `visits` string into an array of visit structs.
# Note: this rebinds clean_df, which previously held the exploded training data.
clean_df = (
    kafka_read_df
    .select(from_json(col('value').cast('string'), event_schema).alias('event'))
    .select(
        col('event.uid'),
        from_json(col('event.visits'), visit_schema).alias('visits'),
    )
)

In [203]:
# Confirm the messages parsed into a uid plus an array of visits.
clean_df.show(5)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
|bd7c5d7a-0def-41d...|[[http://www.24op...|
|bd7e54a2-0215-45c...|[[http://www.dns-...|
+--------------------+--------------------+
only showing top 5 rows



In [204]:
# Apply the same domain extraction used for the training data, then collect
# each user's distinct domains.
# NOTE(review): 'www.' is a regular expression — the dot matches any character
# and the pattern is not anchored. It is kept as-is because the training
# features earlier in this notebook were built with the identical pattern;
# change both places together if it is tightened.
clean_df = (
    clean_df
    .withColumn("visit", explode("visits"))
    .withColumn("host", expr("parse_url(visit.url, 'HOST')"))
    .withColumn('domain', regexp_replace('host', 'www.', ''))
    .groupBy("uid")
    .agg(collect_set("domain").alias("domains"))
)

In [205]:
# Reuse the exact transformers applied to the training data so the inference
# features land in the same hashed, normalised space.
clean_df_vector = hasher_freq.transform(clean_df)
clean_df_norm = normalizer.transform(clean_df_vector)

In [206]:
# Score the same feature frame with each model separately; both outputs carry
# their own `prediction` column, so they are kept in two frames and joined later.
clean_df_norm_1 = lr_gendr_model.transform(clean_df_norm)
clean_df_norm_2 = rf_age_model.transform(clean_df_norm)

In [207]:
# Expose the per-model predictions as temp views named "gender" and "age" so
# they can be joined with SQL in the next cell.
clean_df_norm_1.select("uid", "prediction").createOrReplaceTempView("gender")
clean_df_norm_2.select("uid", "prediction").createOrReplaceTempView("age")

In [208]:
# Join the two prediction views on uid. Predictions are cast to INT and then
# to string so they match the string label codes ('0'/'1', '1'/'3'/...) that
# are decoded back to human-readable labels in a later cell.
out_df = spark.sql("""SELECT g.uid
                , CAST(CAST(g.prediction AS INT) AS String) AS gender
                , CAST(CAST(a.prediction AS INT) AS String) AS age
                FROM gender g
                JOIN age a
                ON g.uid = a.uid""")

In [209]:
# Inspect the joined predictions while they are still numeric label codes.
out_df.show(5)

+--------------------+------+---+
|                 uid|gender|age|
+--------------------+------+---+
|0108d217-e476-493...|     1|  3|
|0192cc54-559c-4c8...|     1|  3|
|019acd5e-be9a-4cd...|     0|  3|
|02e7f830-da57-4d5...|     1|  3|
|1d160259-73d8-451...|     0|  3|
+--------------------+------+---+
only showing top 5 rows



In [210]:
# Decode the label codes back to the original strings; this is the inverse of
# the encoding applied to the training labels.
out_df = (
    out_df
    .replace(['0','1'], ['F', 'M'], "gender")
    .replace(['1','3','5','7','9'], ['18-24', '25-34', '35-44', '45-54', '>=55'], "age")
)

In [211]:
# Inspect the predictions after decoding to human-readable labels.
out_df.show(5)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|0108d217-e476-493...|     M|25-34|
|0192cc54-559c-4c8...|     M|25-34|
|019acd5e-be9a-4cd...|     F|25-34|
|02e7f830-da57-4d5...|     M|25-34|
|1d160259-73d8-451...|     F|25-34|
+--------------------+------+-----+
only showing top 5 rows



In [212]:
# Shape the result for Kafka: an empty-string key and a JSON value holding all
# output columns (uid, gender, age).
pDF = out_df.select(
    lit("").alias('key'),
    to_json(struct(*out_df.columns)).alias('value'),
)

In [213]:
# Check the key/value payload before publishing it.
pDF.show(5)

+---+--------------------+
|key|               value|
+---+--------------------+
|   |{"uid":"0108d217-...|
|   |{"uid":"0192cc54-...|
|   |{"uid":"019acd5e-...|
|   |{"uid":"02e7f830-...|
|   |{"uid":"1d160259-...|
+---+--------------------+
only showing top 5 rows



In [214]:
# Publish the predictions to the output topic as a one-off batch write.
(
    pDF
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .write
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'topic': KAFKA_OUTPUT_TOPIC,
    })
    .save()
)

In [219]:
# Shut down the SparkSession now that the predictions have been written.
spark.stop()