## Лаба 4. Прогнозирование пола и возрастной категории — Spark Streaming

1. Вы готовите свою модель
2. Настраиваете чтение данных из одного топика и запись в другой топик.
3. Запускаете свой скрипт в стриминговым режиме
4. Идет подключение к топику на чтение данных.
5. На странице лабы нажимаете кнопку "Проверить".
6. Вам в топик текут данные.
7. Модель обрабатывает входные данные.
8. Ваш скрипт сохраняет данные в другой топик.
9. Чекер на странице лабы выдает результат проверки.

In [15]:
import os
import sys
os.environ['PYSPARK_PYTHON']='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [16]:
spark

In [63]:
import datetime
import numpy as np
import pandas as pd

import re
import urllib
from urllib.parse import urlparse

from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as f

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize  

from pyspark.ml.feature import CountVectorizer

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

In [18]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

@f.udf(ArrayType(StringType()))
def get_hostname(array):
    sites = []
    for a in array:
        name = urlparse(a[0]).hostname
        name_parsed = re.findall(r'\w+', name)
        sites.extend(name_parsed)
    return sites

@f.udf(ArrayType(StringType()))
def get_path(array):
    paths = []
    for a in array:
        strings = re.findall(r'\w+', urlparse(a[0]).path)

        remove_4digits_str = filter(lambda string: len(string)>4, strings)
        strings = list(remove_4digits_str)
    
        remove_numeric_digits = filter(lambda s: re.sub(r'[0-9\.]+', '', s), strings)
        strings = list(remove_numeric_digits)
        
        strings = [s for s in strings if not s in stop_words]
        
        strings = [lemmatizer.lemmatize(s) for s in strings]

        paths.extend(strings)
    return paths

@f.udf(ArrayType(StringType()))
def parse_timestamp(array):
    hours = []
    for a in array:
        timestamp = int(a[1])/1000
        value = datetime.datetime.fromtimestamp(timestamp)
        hours.append(str(value.hour))
    
    return hours

@f.udf
def get_target(gender, age):
    target = gender + '_' + age
    return target

In [19]:
data_schema = StructType(fields=[
    StructField('gender', StringType()),
    StructField('age', StringType()),
    StructField('uid', StringType()),
    StructField('user_json', StringType())
])

data = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)

parsed = data.select('uid', get_json_object(col('user_json'), '$.visits').alias('logs'))

schema = ArrayType(StructType(
                  [StructField("url", StringType()), 
                   StructField("timestamp", StringType())]))

parsed = parsed.withColumn('parsed', from_json(parsed.logs, schema))

parsed = parsed.withColumn('sites', get_hostname(parsed.parsed))
parsed = parsed.withColumn('paths', get_path(parsed.parsed))
parsed = parsed.withColumn('hours', parse_timestamp(parsed.parsed))

In [20]:
parsed.take(1)

[Row(uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', logs='[{"url":"http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun","timestamp":1419688144068},{"url":"http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story","timestamp":1426666298001},{"url":"http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html","timestamp":1426666298000},{"url":"http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story","timestamp":1426661722001},{"url":"http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html","timestamp":1426661722000}]', parsed=[Row(url='http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun', timestamp='1419688144068'), Row(url='http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-o

In [21]:
cv_sites = CountVectorizer(inputCol="sites", outputCol="sites_count", vocabSize=5000, minDF=1.0)
cv_paths = CountVectorizer(inputCol="paths", outputCol="paths_count", vocabSize=10000, minDF=1.0)
cv_hours = CountVectorizer(inputCol="hours", outputCol="hours_count", vocabSize=24, minDF=1.0)

model_sites = cv_sites.fit(parsed)
model_paths = cv_paths.fit(parsed)
model_hours = cv_hours.fit(parsed)

cv_train = model_sites.transform(parsed)
cv_train = model_paths.transform(cv_train)
cv_train = model_hours.transform(cv_train)

assembler = VectorAssembler(inputCols=['sites_count', 'hours_count'], outputCol='features')
train_sample = assembler.transform(cv_train)

train_sample = train_sample.select('uid', 'features').join(data.select('uid', 'gender', 'age'), 
                                                           on='uid', how='left')

train_sample = train_sample.withColumn('target', get_target(train_sample.gender, train_sample.age))
indexer = StringIndexer(inputCol="target", outputCol="target_encoded")
train_sample = indexer.fit(train_sample).transform(train_sample)
train_sample.cache()
train_sample.show(5)

+--------------------+--------------------+------+----+------+--------------+
|                 uid|            features|gender| age|target|target_encoded|
+--------------------+--------------------+------+----+------+--------------+
|0108d217-e476-493...|(5024,[0,5011],[3...|     -|   -|   -_-|           3.0|
|0192cc54-559c-4c8...|(5024,[0,2,106,36...|     -|   -|   -_-|           3.0|
|019acd5e-be9a-4cd...|(5024,[0,1,2,9,46...|     -|   -|   -_-|           3.0|
|02e7f830-da57-4d5...|(5024,[0,1,2,16,1...|     -|   -|   -_-|           3.0|
|0392f398-ea7e-4a1...|(5024,[0,1,2,3,4,...|     F|>=55|F_>=55|           9.0|
+--------------------+--------------------+------+----+------+--------------+
only showing top 5 rows



In [22]:
train_sample.count()

41138

In [51]:
lr = RandomForestClassifier(labelCol='target_encoded', featuresCol='features', numTrees=200)
model = lr.fit(train_sample)

### Read batch

In [53]:
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667" ,
    "subscribe": "input_andrey.karavaev",
    "startingOffsets": "earliest",
    "maxOffsetsPerTrigger": "5000",
    "failOnDataLoss": "False"
}

kafka_sdf = spark \
  .read \
  .format("kafka") \
  .options(**read_kafka_params) \
  .load()

In [54]:
kafka_sdf.take(1)

[Row(key=None, value=bytearray(b'{"uid": "bd7a30e1-a25d-4cbf-a03f-61748cbe540e", "visits": "[{\\"url\\": \\"http://www.interfax.ru/business/414668\\", \\"timestamp\\": 1419775945781}, {\\"url\\": \\"http://amerikan-gruzovik.ru/zapchasti-dlya-amerikanskikh-gruzovikov.html\\", \\"timestamp\\": 1419679865088}, {\\"url\\": \\"http://amerikan-gruzovik.ru/\\", \\"timestamp\\": 1419679853405}, {\\"url\\": \\"http://amerikan-gruzovik.ru/b-u-zapchasti-dlya-amerikanskikh-gruzovikov.html\\", \\"timestamp\\": 1419679847246}, {\\"url\\": \\"http://amerikan-gruzovik.ru/\\", \\"timestamp\\": 1419679822992}, {\\"url\\": \\"http://amerikan-gruzovik.ru/zapchasti-mack-novye-i-b-u-razborka-amerikanskikh-gruzovikov.html\\", \\"timestamp\\": 1419679749670}, {\\"url\\": \\"http://amerikan-gruzovik.ru/zapchasti-kenworth-novye-i-b-u-razborka-amerikanskikh-gruzovikov.html\\", \\"timestamp\\": 1419679738132}, {\\"url\\": \\"http://amerikan-gruzovik.ru/zapchasti-peterbilt-razborka-gruzovikov.html\\", \\"timestamp

## Batch processing

In [55]:
deserialized = kafka_sdf.select(col("value").cast("string").alias("value"))

parsed_test = deserialized.select(get_json_object(col("value"), "$.uid").alias("uid"),
                                  get_json_object(col("value"), "$.visits").alias("logs"))

schema = ArrayType(StructType([StructField("url", StringType()), 
                               StructField("timestamp", StringType())]))

parsed_test = parsed_test.withColumn('parsed', from_json(parsed_test.logs, schema))

parsed_test = parsed_test.withColumn('sites', get_hostname(parsed_test.parsed))
parsed_test = parsed_test.withColumn('paths', get_path(parsed_test.parsed))
parsed_test = parsed_test.withColumn('hours', parse_timestamp(parsed_test.parsed))

cv_test = model_sites.transform(parsed_test)
cv_test = model_paths.transform(cv_test)
cv_test = model_hours.transform(cv_test)

assembler = VectorAssembler(inputCols=['sites_count', 'hours_count'], outputCol='features')
test_sample = assembler.transform(cv_test)
test_sample = test_sample.select('uid', 'features')

predict = model.transform(test_sample)
decode_target = train_sample.select('gender', 'age', 'target_encoded').drop_duplicates()
predict = predict.select('uid', 'prediction').join(decode_target, predict.prediction==decode_target.target_encoded, 
                                            how='left')
predict = predict.select('uid', 'gender', 'age')
predict = predict.select(to_json(struct(*predict.columns)).alias("value"))

In [56]:
write_kafka_params = {
"kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667" ,
"topic": "andrey.karavaev"
    }

predict.write \
        .format("kafka") \
        .options(**write_kafka_params) \
        .mode("append") \
        .save()

## Stream processing

In [58]:
def foreach_batch_function(kafka_sdf, epoch_id):
    write_kafka_params = {
   "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667" ,
   "topic": "andrey.karavaev"
        }
    
    deserialized = kafka_sdf.select(col("value").cast("string").alias("value"))

    parsed_test = deserialized.select(get_json_object(col("value"), "$.uid").alias("uid"),
                                      get_json_object(col("value"), "$.visits").alias("logs"))

    schema = ArrayType(StructType([StructField("url", StringType()), 
                                   StructField("timestamp", StringType())]))

    parsed_test = parsed_test.withColumn('parsed', from_json(parsed_test.logs, schema))

    parsed_test = parsed_test.withColumn('sites', get_hostname(parsed_test.parsed))
    parsed_test = parsed_test.withColumn('paths', get_path(parsed_test.parsed))
    parsed_test = parsed_test.withColumn('hours', parse_timestamp(parsed_test.parsed))

    cv_test = model_sites.transform(parsed_test)
    cv_test = model_paths.transform(cv_test)
    cv_test = model_hours.transform(cv_test)

    assembler = VectorAssembler(inputCols=['sites_count', 'hours_count'], outputCol='features')
    test_sample = assembler.transform(cv_test)
    test_sample = test_sample.select('uid', 'features')

    predict = model.transform(test_sample)
    decode_target = train_sample.select('gender', 'age', 'target_encoded').drop_duplicates()
    predict = predict.select('uid', 'prediction').join(decode_target, predict.prediction==decode_target.target_encoded, 
                                                how='left')
    predict = predict.select('uid', 'gender', 'age')
    predict = predict.select(to_json(struct(*predict.columns)).alias("value"))

    predict.write \
            .format("kafka") \
            .options(**write_kafka_params) \
            .mode("append") \
            .save()

In [59]:
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667" ,
    "subscribe": "input_andrey.karavaev",
    "startingOffsets": "latest",
    "failOnDataLoss": "False"
}

kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [60]:
def create_sink(df):
    return df.writeStream \
             .foreachBatch(foreach_batch_function) \
             .option("checkpointLocation", "streaming/chk/chk_kafka_karavaev_andrei_lab04") \
            # .outputMode('append') \

sink = create_sink(kafka_sdf)

sink.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fe87c421e48>

In [27]:
def kill_all():
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        desc = s.lastProgress["sources"][0]["description"]
        s.stop()
        print("Stopped {s}".format(s=desc))

In [62]:
kill_all()

Stopped KafkaV2[Subscribe[input_andrey.karavaev]]


In [None]:
spark.stop()