In [1]:
import os
import sys

# Location of the Spark 2 client installation used by this notebook.
spark_home = '/usr/hdp/current/spark2-client'

# Environment consumed by PySpark when the JVM gateway is launched:
# the Spark install, the Python interpreter for workers, and the submit arguments.
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 --executor-cores 1 --executor-memory 5g --driver-memory 2g pyspark-shell',
})

# Each entry is pushed to the front of sys.path, so the last one listed
# (the bundled py4j archive) ends up first, followed by Spark's python dir.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, HiveContext

# Build (or reuse) a Hive-enabled SparkSession. SparkSession is the Spark 2.x
# entry point; HiveContext is deprecated and kept only for backward compatibility.
conf = SparkConf().setAppName("lab04")
spark = (SparkSession.builder
         .config(conf=conf)
         .enableHiveSupport()
         .getOrCreate())
sc = spark.sparkContext

# Show the application id of the running Spark application.
sc.applicationId

'application_1667306389915_1653'

In [3]:
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

### Работа с обучающими данными

In [4]:
# Every column of the tab-separated training file is read as a plain string.
# NOTE: `event_schema` is rebound later in the Kafka section to a different schema.
event_schema = T.StructType([
    T.StructField(column_name, T.StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])

# Read the file with an explicit schema; the first line is a header row.
train_data_raw = (spark.read
                  .schema(event_schema)
                  .option('header', True)
                  .option('sep', '\t')
                  .csv('/labs/slaba04/gender_age_dataset.txt'))
train_data_raw.show(5, 70)

+------+-----+------------------------------------+----------------------------------------------------------------------+
|gender|  age|                                 uid|                                                             user_json|
+------+-----+------------------------------------+----------------------------------------------------------------------+
|     F|18-24|d50192e5-c44e-4ae8-ae7a-7cfe67c8b777|{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-...|
|     M|25-34|d502331d-621e-4721-ada2-5d30b2c3801f|{"visits": [{"url": "http://sweetrading.ru/?p=900", "timestamp": 14...|
|     F|25-34|d50237ea-747e-48a2-ba46-d08e71dddfdb|{"visits": [{"url": "http://ru.oriflame.com/products/product?code=3...|
|     F|25-34|d502f29f-d57a-46bf-8703-1cb5f8dcdf03|{"visits": [{"url": "http://translate-tattoo.ru/font-selection/?has...|
|     M| >=55|d503c3b2-a0c2-4f47-bb27-065058c73008|{"visits": [{"url": "https://mail.rambler.ru/#/folder/", "timestamp...|
+------+-----+--

In [5]:
# Schema of the JSON stored in `user_json`: {"visits": [{"url": ..., "timestamp": ...}, ...]}.
# NOTE: `visit_schema` is rebound later in the Kafka section to a bare array schema.
visit_schema = T.StructType().add(
    'visits',
    T.ArrayType(
        T.StructType()
        .add('url', T.StringType())
        .add('timestamp', T.LongType())
    ),
)

# Peek at one raw JSON payload to check it against the schema above.
train_data_raw.select(F.col("user_json")).take(1)

[Row(user_json='{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}')]

In [13]:
# Keep rows whose age value is longer than one character and whose gender is M or F.
has_valid_labels = (F.length("age") > 1) & F.col("gender").isin(["M", "F"])

# Parse the JSON payload and keep only the list of visited URLs per user.
train_data = (
    train_data_raw
    .where(has_valid_labels)
    .withColumn('visits_json', F.from_json(F.col('user_json'), visit_schema))
    .withColumn('url', F.col('visits_json.visits.url'))
    .drop('visits_json', 'user_json')
)
train_data.show(5, 70)

+------+-----+------------------------------------+----------------------------------------------------------------------+
|gender|  age|                                 uid|                                                                   url|
+------+-----+------------------------------------+----------------------------------------------------------------------+
|     F|18-24|d50192e5-c44e-4ae8-ae7a-7cfe67c8b777|[http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid...|
|     M|25-34|d502331d-621e-4721-ada2-5d30b2c3801f|[http://sweetrading.ru/?p=900, http://sweetrading.ru/?p=884, http:/...|
|     F|25-34|d50237ea-747e-48a2-ba46-d08e71dddfdb|[http://ru.oriflame.com/products/product?code=30569, http://ru.orif...|
|     F|25-34|d502f29f-d57a-46bf-8703-1cb5f8dcdf03|[http://translate-tattoo.ru/font-selection/?hash=1199c573a5f4da47ed...|
|     M| >=55|d503c3b2-a0c2-4f47-bb27-065058c73008|[https://mail.rambler.ru/#/folder/, http://news.rambler.ru/29728405...|
+------+-----+--

In [23]:
# Label indexers: map the string labels to numeric category columns.
indexGender = StringIndexer(inputCol='gender', outputCol='gender_cat')
indexAge = StringIndexer(inputCol='age', outputCol='age_cat')

# Fit both indexers on the training data; the fitted models expose `.labels`,
# which are used later to map predictions back to strings.
indexModelGender, indexModelAge = (
    indexer.fit(train_data) for indexer in (indexGender, indexAge)
)

# Training frame with both numeric label columns attached.
df_visits = indexModelAge.transform(indexModelGender.transform(train_data))

In [25]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class ParseURLTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Turn an array column of URLs into an array column of lower-cased hosts.

    The input column is exploded to one row per URL, each URL is reduced to
    its host with Spark SQL ``parse_url(..., "HOST")`` and lower-cased, and the
    rows are then regrouped by every remaining column, collecting the hosts
    into a list stored in the output column. The input column is dropped.

    Consequences of this explode/groupBy approach:
    - rows whose input array is null or empty yield no output row
      (``explode`` emits nothing for them);
    - rows that are identical in all remaining columns are merged into one.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer, setting only the params that were given."""
        super(ParseURLTransformer, self).__init__()
        for value, setter in ((inputCol, self.setInputCol),
                              (outputCol, self.setOutputCol)):
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        """Return ``dataset`` with the URL array replaced by a host list.

        :param dataset: DataFrame containing the configured input column.
        :return: DataFrame grouped by all other columns plus the output column.
        """
        source_col = self.getInputCol()
        target_col = self.getOutputCol()

        # One row per URL, reduced to its lower-cased host in a scratch column.
        host_expr = F.lower(F.expr('parse_url(url_inter, "HOST")'))
        exploded = (
            dataset
            .withColumn('url_inter', F.explode(source_col))
            .withColumn('url_inter', host_expr)
            .drop(source_col)
        )

        # Regroup by every original column except the input one.
        key_columns = dataset.drop(source_col).columns
        hosts = F.collect_list('url_inter').alias(target_col)
        return exploded.groupBy(key_columns).agg(hosts)
    
# Stage 1: URL list -> list of lower-cased hosts.
parseURL = ParseURLTransformer(inputCol='url', outputCol='url_parsed')

# Stage 2: host list -> term-frequency vector with 100000 hash buckets.
hashingTF = HashingTF(
    inputCol="url_parsed",
    outputCol="url_freq",
    numFeatures=100000,
    binary=False,
)

# Stages 3-4: one random forest per target. Each gets its own output column
# names so both models can write into the same DataFrame.
forestG = RandomForestClassifier(
    labelCol='gender_cat',
    featuresCol='url_freq',
    rawPredictionCol='rawPredictionG',
    probabilityCol='probabilityG',
    predictionCol='predictionG',
)
forestA = RandomForestClassifier(
    labelCol='age_cat',
    featuresCol='url_freq',
    rawPredictionCol='rawPredictionA',
    probabilityCol='probabilityA',
    predictionCol='predictionA',
)

# Stages 5-6: numeric predictions back to the original string labels,
# using the label order learned by the fitted indexers.
strindG = IndexToString(
    inputCol='predictionG',
    outputCol='gender_str',
    labels=indexModelGender.labels,
)
strindA = IndexToString(
    inputCol='predictionA',
    outputCol='age_str',
    labels=indexModelAge.labels,
)

pipeline = Pipeline(stages=[parseURL, hashingTF, forestG, forestA, strindG, strindA])

In [14]:
%%time
pipeline_model = pipeline.fit(train_data)

CPU times: user 124 ms, sys: 58.9 ms, total: 183 ms
Wall time: 13min 20s


In [20]:
# pipeline_model.write().overwrite().save("tmp/lab04/pipeline_model")
# pipeline_model = PipelineModel.load("tmp/lab04/pipeline_model")

### Работа с Kafka Stream

In [21]:
def kill_all(session=None):
    """Stop every active Structured Streaming query and report each one.

    :param session: object exposing ``streams.active`` (a SparkSession).
        When omitted, the current session is obtained via
        ``SparkSession.builder.getOrCreate()``.
    :return: None. One line is printed per stopped query.
    """
    if session is None:
        session = SparkSession.builder.getOrCreate()

    for s in session.streams.active:
        # The source description is best-effort: `lastProgress` may be None or
        # lack the expected keys. Catch `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            desc = s.lastProgress["sources"][0]["description"]
        except Exception:
            desc = ""
            print(f"{s} data not available")
        s.stop()
        print(f"Stopped {desc} at {s}")

In [22]:
# Kafka connection settings: broker address and the topics to read from / write to.
KAFKA_BOOTSTRAP_SERVER = 'spark-master-1.newprolab.com:6667'
KAFKA_OUTPUT_TOPIC = 'anton.gladkiy'
KAFKA_INPUT_TOPIC = 'input_anton.gladkiy'

# Schema of one Kafka message value: `visits` arrives as a JSON-encoded string.
# NOTE: this rebinds `event_schema`, which earlier described the training CSV.
event_schema = (
    T.StructType()
    .add('uid', T.StringType(), True)
    .add('visits', T.StringType(), True)
)

# Schema of the decoded `visits` string: a bare array of {url, timestamp}.
# NOTE: this rebinds `visit_schema`, which earlier wrapped the array in a struct.
visit_schema = T.ArrayType(
    T.StructType()
    .add('url', T.StringType(), True)
    .add('timestamp', T.LongType(), True)
)

In [28]:
! hdfs dfs -rm -R /user/anton.gladkiy/tmp/lab04/checkpointLocation

kafka_read_df = (spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)

clean_df = (kafka_read_df
    .select(F.col('value').cast('string').alias('value'))
    .select(F.from_json(F.col('value'), event_schema).alias('event'))
    .select('evenT.uid', F.from_json(F.col('evenT.visits'), visit_schema).alias('visits'))
    .withColumn('url', F.col('visits.url'))
    .drop('visits')
)

predictions_df = pipeline_model.transform(clean_df) \
.select('uid', F.col('gender_str').alias('gender'), F.col('age_str').alias('age'))

kafka_out_df = predictions_df.select(F.to_json(F.struct(*predictions_df.columns)).alias('value'))

kafka_write_stream = (
    kafka_out_df
    .writeStream
    .format("kafka")
    .outputMode("complete")
    .option("checkpointLocation", "tmp/lab04/checkpointLocation")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVER)
    .option("topic", KAFKA_OUTPUT_TOPIC)
)
kafka_write_stream.start()

22/11/06 23:51:14 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/anton.gladkiy/tmp/lab04/checkpointLocation' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/anton.gladkiy/.Trash/Current/user/anton.gladkiy/tmp/lab04/checkpointLocation1667767874505


<pyspark.sql.streaming.StreamingQuery at 0x7f0179271e10>

In [None]:
# ! /usr/hdp/current/kafka-broker/bin/kafka-console-consumer.sh --bootstrap-server spark-master-1:6667 --topic anton.gladkiy

In [30]:
# Stop all active streaming queries, then display the list of queries
# that are still active.
kill_all()
spark.streams.active

Stopped KafkaV2[Subscribe[input_anton.gladkiy]] at <pyspark.sql.streaming.StreamingQuery object at 0x7f01792545f8>


[]

In [27]:
# Shut down the SparkContext once the work in this notebook is finished.
sc.stop()