# Spark init

In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.sql.window import Window
import json
import re

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("lab04")
         .getOrCreate())

In [3]:
spark

In [4]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


# Import & preproc data

In [62]:
schema = StructType([
    StructField("gender", StringType(),True), 
    StructField("age", StringType(),True), 
    StructField("uid", StringType(),True), 
    StructField("user_json", StringType(), True)
])

visit_schema = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(),True), 
            StructField("timestamp", LongType(),True)
        ])
    )
               )
])

data = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', schema = schema, sep = '\t', header = True)

In [135]:
data.count()

41138

In [63]:
data = data.withColumn('temp', F.from_json('user_json', schema=visit_schema))
data = data.withColumn("visit", F.explode("temp.visits"))
data = data.withColumn("host", F.expr("parse_url(visit.url, 'HOST')").alias("host"))
data = data.drop(*['temp', 'visit','user_json'])

In [7]:
#проверяем пропущенные значения
from pyspark.sql.functions import col,isnan, when, count
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+------+---+---+------+
|gender|age|uid|  host|
+------+---+---+------+
|     0|  0|  0|141843|
+------+---+---+------+



In [8]:
data.select('age').groupBy('age').count().show()

+-----+-------+
|  age|  count|
+-----+-------+
| >=55| 267941|
|45-54| 797479|
|    -| 516555|
|35-44|1703954|
|25-34|2057045|
|18-24| 486533|
+-----+-------+



In [9]:
data.select('gender').groupBy('gender').count().show()

+------+-------+
|gender|  count|
+------+-------+
|     F|1950980|
|     M|3361972|
|     -| 516555|
+------+-------+



In [64]:
data = data.filter((F.col('host').isNotNull())&(F.col('age')!= '-')&(F.col('age')!= '-'))

In [65]:
data = (data.groupBy("gender", "age", "uid")
        .agg(F.collect_list("host")
             .alias("hosts")))

data.count()

# Model

In [7]:
from pyspark.ml.feature import StringIndexer, HashingTF, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [50]:
stages = []
gender_stringIndexer = StringIndexer(inputCol="gender", outputCol="gender_indexed")
age_stringIndexer = StringIndexer(inputCol="age", outputCol="age_indexed")
gender_labels = gender_stringIndexer.fit(data).labels
age_labels = age_stringIndexer.fit(data).labels
stages += [gender_stringIndexer, age_stringIndexer]

hashingTF = HashingTF(inputCol="hosts", outputCol="features").setNumFeatures(10000)
stages += [hashingTF]

In [51]:
gender_rf = RandomForestClassifier(labelCol = 'gender_indexed', 
                                   predictionCol='gender_prediction', 
                                   probabilityCol='gender_probability',
                                   rawPredictionCol="gender_raw_prediction")

age_rf = RandomForestClassifier(labelCol = 'age_indexed', 
                                predictionCol='age_prediction', 
                                probabilityCol='age_probability',
                                rawPredictionCol="age_raw_prediction")

stages += [gender_rf, age_rf]

In [52]:
# Convert indexed labels back to original labels.
gender_converter = IndexToString(inputCol="gender_prediction", outputCol="predictedGender", labels=gender_labels)
age_converter = IndexToString(inputCol="age_prediction", outputCol="predictedAge", labels=age_labels)

stages += [gender_converter, age_converter]

In [53]:
pipeline = Pipeline(stages = stages)
model = pipeline.fit(data)

In [55]:
model.write().overwrite().save("/user/alexandra.kolesova/lab04_model")

In [56]:
from pyspark.ml import PipelineModel

In [66]:
model_downloaded = PipelineModel.load("/user/alexandra.kolesova/lab04_model")

In [67]:
data_pred= model_downloaded.transform(data)

In [59]:
data_pred.show(5)

+------+-----+--------------------+--------------------+--------------+-----------+--------------------+---------------------+--------------------+-----------------+--------------------+--------------------+--------------+---------------+------------+
|gender|  age|                 uid|               hosts|gender_indexed|age_indexed|            features|gender_raw_prediction|  gender_probability|gender_prediction|  age_raw_prediction|     age_probability|age_prediction|predictedGender|predictedAge|
+------+-----+--------------------+--------------------+--------------+-----------+--------------------+---------------------+--------------------+-----------------+--------------------+--------------------+--------------+---------------+------------+
|     F|18-24|09b1ecd3-b2d2-4c1...|[tankionline.com,...|           1.0|        2.0|(10000,[9509],[3.0])| [10.2228411483972...|[0.51114205741986...|              0.0|[8.73774176617548...|[0.43688708830877...|           0.0|              M|      

In [68]:
evaluator = MulticlassClassificationEvaluator(labelCol="age_indexed", predictionCol="age_prediction")
accuracy_age = evaluator.evaluate(data_pred)

evaluator = MulticlassClassificationEvaluator(labelCol="gender_indexed", predictionCol="gender_prediction")
accuracy_gender = evaluator.evaluate(data_pred)

print("Accuracy Age = %s" % (accuracy_age))
print("Accuracy Gender = %s" % (accuracy_gender))

Accuracy Age = 0.25945436190937554
Accuracy Gender = 0.42633283966404983


# Kafka

## Params

In [4]:
topic = 'alexandra.kolesova'
kafka_bootstrap_servers = 'spark-master-1.newprolab.com:6667'
subscribe = f'input_{topic}'

In [53]:
write_kafka_params = {
   "kafka.bootstrap.servers": kafka_bootstrap_servers,
   "topic": topic
}

read_kafka_params = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": subscribe,
    "startingOffsets": "latest",
    "failOnDataLoss": "False"
}

In [69]:
model = PipelineModel.load("/user/alexandra.kolesova/lab04_model")

## Batch

In [20]:
df = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
  .option("subscribe", subscribe) \
.option("failOnDataLoss", 'False')\
  .load()

In [21]:
df.show(5)

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_alexandra.k...|        0|     0|2022-11-05 08:34:...|            0|
|null|[7B 22 75 69 64 2...|input_alexandra.k...|        0|     1|2022-11-05 08:34:...|            0|
|null|[7B 22 75 69 64 2...|input_alexandra.k...|        0|     2|2022-11-05 08:34:...|            0|
|null|[7B 22 75 69 64 2...|input_alexandra.k...|        0|     3|2022-11-05 08:34:...|            0|
|null|[7B 22 75 69 64 2...|input_alexandra.k...|        0|     4|2022-11-05 08:34:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [22]:
test_data = df.selectExpr("CAST(value AS STRING)")

In [23]:
schema = StructType([
    StructField("uid", StringType(),True), 
    StructField("visits", StringType(), True)
])

visit_schema = ArrayType(
        StructType([
            StructField("url", StringType(),True), 
            StructField("timestamp", LongType(),True)
        ])
    )

In [24]:
test_data = (
    test_data
    .select(F.from_json(F.col('value'), schema).alias('user'))
    .select(
        'user.uid', 
        F.from_json(F.col('user.visits'), visit_schema).alias('visits')
    )
)


In [25]:
test_data = test_data.withColumn("visit", F.explode("visits"))
test_data = test_data.withColumn("host", F.expr("parse_url(visit.url, 'HOST')").alias("host"))
test_data = test_data.drop(*['visits', 'visit'])

In [26]:
test_data = test_data.filter(F.col('host').isNotNull())

In [27]:
test_data = (test_data.groupBy("uid")
             .agg(F.collect_list("host")
             .alias("hosts")))

In [28]:
test_data.show(5)

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|0108d217-e476-493...|[kvartblog.ru, kv...|
|0192cc54-559c-4c8...|[metanol.lv, meta...|
|019acd5e-be9a-4cd...|[www.russianfood....|
|02e7f830-da57-4d5...|[maxpark.com, max...|
|1d160259-73d8-451...|[ua.sinoptik.ua, ...|
+--------------------+--------------------+
only showing top 5 rows



In [76]:
model = PipelineModel.load("/user/alexandra.kolesova/lab04_model")

In [29]:
test_pred = model.transform(test_data)

In [30]:
test_pred = test_pred.selectExpr('uid', 'predictedGender as gender', 'predictedAge as age')

In [31]:
test_pred_json = (
    test_pred.select(F.to_json(F.struct(*test_pred.columns)).alias('value'))
)

In [32]:
test_pred_json.show()

+--------------------+
|               value|
+--------------------+
|{"uid":"0108d217-...|
|{"uid":"0192cc54-...|
|{"uid":"019acd5e-...|
|{"uid":"02e7f830-...|
|{"uid":"1d160259-...|
|{"uid":"1e14a504-...|
|{"uid":"1eb313db-...|
|{"uid":"1eff6e4f-...|
|{"uid":"3e75c432-...|
|{"uid":"47565df3-...|
|{"uid":"4766a8ab-...|
|{"uid":"50637c81-...|
|{"uid":"5a023519-...|
|{"uid":"5a781caa-...|
|{"uid":"5ab3c7b8-...|
|{"uid":"7302e78a-...|
|{"uid":"73081df3-...|
|{"uid":"89fe85cb-...|
|{"uid":"8affa6ce-...|
|{"uid":"b2e4450d-...|
+--------------------+
only showing top 20 rows



## Stream

In [70]:
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [55]:
#test_data = kafka_sdf.selectExpr("CAST(value AS STRING)")
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [71]:
test_data = kafka_sdf.selectExpr("CAST(value AS STRING)")

schema = StructType([
    StructField("uid", StringType(),True), 
    StructField("visits", StringType(), True)
])

visit_schema = ArrayType(
        StructType([
            StructField("url", StringType(),True), 
            StructField("timestamp", LongType(),True)
        ])
    )

test_data = (
    test_data
    .select(F.from_json(F.col('value'), schema).alias('user'))
    .select(
        'user.uid', 
        F.from_json(F.col('user.visits'), visit_schema).alias('visits')
    )
)

In [72]:
test_data = test_data.withColumn("visit", F.explode("visits"))
test_data = test_data.withColumn("host", F.expr("parse_url(visit.url, 'HOST')").alias("host"))
test_data = test_data.drop(*['visits', 'visit'])

test_data = test_data.filter(F.col('host').isNotNull())

test_data = (test_data.groupBy("uid")
             .agg(F.collect_list("host")
             .alias("hosts")))

In [73]:
test_pred = model.transform(test_data)
test_pred = test_pred.selectExpr('uid', 'predictedGender as gender', 'predictedAge as age')

In [74]:
test_pred_json = (
    test_pred.select(F.to_json(F.struct(*test_pred.columns)).alias('value'))
)

In [75]:
(test_pred_json.writeStream.format("kafka")
 .options(**write_kafka_params)
 .option("checkpointLocation", "checkpoints/checkpoint_lab04/3")
 .outputMode("complete").start())

<pyspark.sql.streaming.StreamingQuery at 0x7f2133b28e80>