#### Лаба 4. Прогнозирование пола и возрастной категории — Spark Streaming

In [180]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


###### Данные и библиотеки

In [181]:
# Библиотеки 
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list, regexp_replace, get_json_object, to_json, struct
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

In [3]:
# папка с датасетом на HDFS
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [4]:
# датасет для обучения модели
# Note: проверить таргет на наличие пустых значений
!hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head -n2

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
cat: Unable to write to output stream.


In [5]:
# формируем spark DF
path = '/labs/slaba04/gender_age_dataset.txt'

schema = t.StructType(fields=[
    t.StructField('gender', t.StringType()),
    t.StructField('age', t.StringType()),
    t.StructField('uid', t.StringType()),
    t.StructField('user_json', t.StringType()),
])

train_data = spark.read.csv(path, header=True, schema=schema, sep='\t')

In [6]:
train_data.show(2)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 2 rows



In [7]:
# схема для json с визитами
visits_schema = t.StructType([
    t.StructField('visits', t.ArrayType(
        t.StructType([
            t.StructField('url', t.StringType(), True),
            t.StructField('timestamp', t.LongType(), True)
        ])
    ))
])

##### Формируем фичи

In [9]:
# извлечем из визитов пользователя URL, чтобы применить HashingTF
train_df_parsed = (
    train_data
    .withColumn('visits', from_json(col('user_json'), visits_schema))
    .withColumn('visit', explode('visits.visits').alias('visit'))
    .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
    .drop('visits', 'visit', 'user_json')
)
# используем explode, parse_url, collect_list
train_df_final = (
    train_df_parsed
    .groupBy('gender', 'age', 'uid')
    .agg(collect_list('host')
    .alias('hosts'))
    .cache()
)

train_df_final.printSchema()
train_df_final.show(3)

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- hosts: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------+---+--------------------+--------------------+
|gender|age|                 uid|               hosts|
+------+---+--------------------+--------------------+
|     -|  -|13292e10-60bf-435...|[dateandtime.info...|
|     -|  -|13f91463-8386-44c...|[go.mail.ru, pass...|
|     -|  -|1dec593b-4dc5-41b...|[www.sport-expres...|
+------+---+--------------------+--------------------+
only showing top 3 rows



#####  Pipeline

In [182]:
# import and clean data + parse JSON column + extract URL from visits + Hashing TF + RandomForest 
train_df, test_df = train_df_final.randomSplit([0.8, 0.2], seed=42)
hashing_TF = HashingTF(inputCol='hosts', outputCol='rawFeatures', numFeatures=10000, binary=False)
#----------------------------------------------------------------------------------------------------------------
indexer_age = (StringIndexer(inputCol='age', 
                             outputCol='ageIndex'
                            )
               .fit(train_df_final)
              )

indexer_gender = (StringIndexer(inputCol='gender', 
                                outputCol='genderIndex'
                               )
                  .fit(train_df_final)
                 )
#----------------------------------------------------------------------------------------------------------------
rf_age = RandomForestClassifier(featuresCol = 'rawFeatures', 
                                labelCol = 'ageIndex',
                                predictionCol='age_index_prediction', 
                                rawPredictionCol='age_index_raw_prediction',
                                probabilityCol='age_probability'
                               )

rf_gender = RandomForestClassifier(featuresCol = 'rawFeatures', 
                                   labelCol = 'genderIndex',
                                   predictionCol='gender_index_prediction', 
                                   rawPredictionCol='gender_index_raw_prediction',
                                   probabilityCol='gender_probability'
                                  )
#----------------------------------------------------------------------------------------------------------------
converter_age = IndexToString(inputCol='age_index_prediction', 
                              outputCol='PredictedAge', 
                              labels=indexer_age.labels
                             )

converter_gender = IndexToString(inputCol='gender_index_prediction', 
                                 outputCol='PredictedGender', 
                                 labels=indexer_gender.labels
                                )

Py4JJavaError: An error occurred while calling o7056.fit.
: java.lang.IllegalStateException: SparkContext has been shutdown
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2053)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.countByKey(PairRDDFunctions.scala:369)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1259)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1259)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.countByValue(RDD.scala:1258)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:140)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [12]:
pipeline = (
    Pipeline(
        stages=[hashing_TF, indexer_age, indexer_gender, rf_age, rf_gender, converter_age, converter_gender]
    )
)

model = pipeline.fit(train_df)
predictions = model.transform(test_df)

predictions.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- hosts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- ageIndex: double (nullable = false)
 |-- genderIndex: double (nullable = false)
 |-- age_index_raw_prediction: vector (nullable = true)
 |-- age_probability: vector (nullable = true)
 |-- age_index_prediction: double (nullable = false)
 |-- gender_index_raw_prediction: vector (nullable = true)
 |-- gender_probability: vector (nullable = true)
 |-- gender_index_prediction: double (nullable = false)
 |-- PredictedAge: string (nullable = true)
 |-- PredictedGender: string (nullable = true)



In [13]:
predictions.select('gender', 'age', 'PredictedAge', 'PredictedGender').show(3)

+------+---+------------+---------------+
|gender|age|PredictedAge|PredictedGender|
+------+---+------------+---------------+
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
+------+---+------------+---------------+
only showing top 3 rows



In [14]:
evaluator_age = (MulticlassClassificationEvaluator(
    labelCol='ageIndex', 
    predictionCol='age_index_prediction', 
    metricName='accuracy')
                )

accuracy_age = evaluator_age.evaluate(predictions)

evaluator_gender = (
    MulticlassClassificationEvaluator(
        labelCol='genderIndex', 
        predictionCol='gender_index_prediction', 
        metricName='accuracy')
)
accuracy_gender = evaluator_gender.evaluate(predictions)

print('accuracy for age: ' + str(accuracy_age))
print('accuracy for gender: ' + str(accuracy_gender))

accuracy for age: 0.37466955058880075
accuracy for gender: 0.48125450612833454


In [23]:
MODEL_PATH = 'dmitry.tikhonov/lab04'
model.save(MODEL_PATH)

In [24]:
!hdfs dfs -ls dmitry.tikhonov/lab04

Found 2 items
drwxr-xr-x   - dmitriy.tikhonov dmitriy.tikhonov          0 2022-11-04 20:10 dmitry.tikhonov/lab04/metadata
drwxr-xr-x   - dmitriy.tikhonov dmitriy.tikhonov          0 2022-11-04 20:10 dmitry.tikhonov/lab04/stages


##### Kafka 

In [183]:
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_dmitriy.tikhonov'
KAFKA_OUTPUT_TOPIC = 'dmitriy.tikhonov'

In [184]:
# чтение в статическом режиме

kafka_read_df = (
    spark.read
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
    .cache()
)

In [185]:
print('count',kafka_read_df.count())
kafka_read_df.show(3)

count 35000
+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_dmitriy.tik...|        0|     0|2022-11-04 20:23:...|            0|
|null|[7B 22 75 69 64 2...|input_dmitriy.tik...|        0|     1|2022-11-04 20:23:...|            0|
|null|[7B 22 75 69 64 2...|input_dmitriy.tik...|        0|     2|2022-11-04 20:23:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [186]:
# Парсинг бинарного файла из кафки

event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])


visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)


clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)

clean_df.show(3)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
+--------------------+--------------------+
only showing top 3 rows



In [187]:
# извлечение url
prep_df = (
    clean_df
    .withColumn('visit', explode('visits').alias('visit'))
    .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
    .drop('visits', 'visit')
    .groupBy('uid')
    .agg(collect_list('host').alias('hosts'))
)

prep_df.printSchema()
prep_df.show(3)

root
 |-- uid: string (nullable = true)
 |-- hosts: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|0108d217-e476-493...|[kvartblog.ru, kv...|
|0192cc54-559c-4c8...|[metanol.lv, meta...|
|019acd5e-be9a-4cd...|[www.russianfood....|
+--------------------+--------------------+
only showing top 3 rows



In [188]:
# применение модели, сохранение предсказаний в predictions_df  
inf_model = PipelineModel.load(MODEL_PATH)

predictions_df = (
    inf_model.transform(prep_df)
    .select('uid', 'PredictedGender', 'PredictedAge')
    .withColumnRenamed('PredictedAge', 'age')
    .withColumnRenamed('PredictedGender', 'gender')
)

predictions_df.show(3)
# Оборачивание предсказания обратно в json

kafka_out_df = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value')).limit(5)
)
kafka_out_df.show(3)
# Запись в выходной топик

(
    kafka_out_df
    .write
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('topic', KAFKA_OUTPUT_TOPIC)
    .save()
)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|0108d217-e476-493...|     M|25-34|
|0192cc54-559c-4c8...|     M|25-34|
|019acd5e-be9a-4cd...|     M|25-34|
+--------------------+------+-----+
only showing top 3 rows

+--------------------+
|               value|
+--------------------+
|{"uid":"0108d217-...|
|{"uid":"0192cc54-...|
|{"uid":"019acd5e-...|
+--------------------+
only showing top 3 rows



In [209]:
# чтение стрима
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': KAFKA_INPUT_TOPIC,
    'startingOffsets': 'latest'
}

write_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'topic': KAFKA_OUTPUT_TOPIC
}

In [210]:
eventType = StructType([
    StructField('uid', StringType(), True),
    StructField('visits', StringType(), True)
])

visitType = ArrayType(
    StructType([
        StructField('url', StringType(), True),
        StructField('timestamp', LongType(), True)  
    ])
)

In [216]:
inf_model = PipelineModel.load(MODEL_PATH)
def write_in_batch(batch_df, batch_id):
    clean_df = (
    batch_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select('event.uid',f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    ))
    
    prep_df = (
    clean_df
    .withColumn('visit', explode('visits').alias('visit'))
    .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
    .drop('visits', 'visit')
    .groupBy('uid')
    .agg(collect_list('host').alias('hosts')))
    
    predictions_df = (
    inf_model.transform(prep_df)
    .select('uid', 'PredictedGender', 'PredictedAge')
    .withColumnRenamed('PredictedAge', 'age')
    .withColumnRenamed('PredictedGender', 'gender'))
        
    kafka_df = (
        predictions_df
        .select(to_json(struct(*predictions_df.columns)).alias('value'))
    )
       
    kafka_df\
     .write\
     .format('kafka')\
     .options(**write_kafka_params)\
     .mode('append')\
     .save()

In [217]:
def writestream_batch(df):
    return df.writeStream\
            .foreachBatch(write_in_batch)\
            .option("checkpointLocation", "streaming/chk/chk_kafka_dmitriy.tikhonov_lab04")

In [218]:
kafka_test_df = (spark
    .readStream
    .format('kafka')
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
)

In [223]:
wb = writestream_batch(kafka_test_df)
wbs = wb.start()

In [225]:
wbs.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [226]:
sc.stop()