In [1]:
import os
import sys
# Environment for the PySpark driver. All three variables are set before
# pyspark/shell.py is executed at the end of this cell.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 16 --executor-memory 4g --executor-cores 8 --driver-memory 4g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the bundled pyspark sources and the py4j archive at the front of sys.path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run the PySpark shell bootstrap in this namespace. The script is read inside
# a `with` block so the file handle is closed instead of being left to the
# garbage collector.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
# Partition count passed to .repartition() for the DataFrames built below.
PARTITIONS = 256
import pyspark.sql.functions as f

In [3]:
from pyspark.sql.types import ArrayType, StructType, StructField, DataType, StringType, LongType, TimestampType, BooleanType, IntegerType
# Columns of the tab-separated input file, in file order. Every column is
# declared as a nullable string.
schema = StructType(
    fields=[
        StructField(column_name, StringType(), True)
        for column_name in ("gender", "age", "uid", "user_json")
    ]
)

In [4]:
# String-to-code mapping for the label columns: the two gender values first,
# then the five age buckets. Codes stay strings here.
replace_dict = {
    "F": "1",
    "M": "0",
    ">=55": "0",
    "45-54": "1",
    "35-44": "2",
    "25-34": "3",
    "18-24": "4",
}

In [5]:
from pyspark.ml import Transformer
from pyspark import keyword_only
from pyspark.ml.param.shared import  HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from urllib.parse import urlparse

class ExtractURLs(Transformer, HasInputCol, HasOutputCol):
    """Transformer that turns a JSON string of visits into an array of hosts.

    The input column is parsed as ``{"visits": [{"url": ..., "timestamp": ...}]}``
    and the output column receives the ``netloc`` of every visited URL, in
    the order the visits appear.

    The ``user_json`` column and the working columns created during the
    transformation (``user_json_``, ``visits``) are removed from the result.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer, setting only the columns that were given."""
        super(ExtractURLs, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the host-array column appended.

        Rows whose JSON yields no ``visits`` array get an empty array rather
        than failing the job.
        """
        def visited_hosts(visits):
            # `visits` is None when the parsed struct (or its field) is null;
            # iterating it would raise TypeError inside the UDF.
            if visits is None:
                return []
            # Null array entries are skipped. `or ""` keeps urlparse on its
            # str code path when a visit has no url, so netloc is a str.
            return [
                urlparse(visit.url or "").netloc
                for visit in visits
                if visit is not None
            ]

        arr_extract = udf(visited_hosts, ArrayType(StringType()))

        schema_json = StructType(fields=[
                         StructField("visits", ArrayType(
                                                 StructType(fields=[
                                                     StructField("url", StringType(), True),
                                                     StructField("timestamp", StringType(), True)
                                                 ])), False)])

        return (
            dataset
            .withColumn("user_json_", f.from_json(self.getInputCol(), schema_json))
            .withColumn("visits", f.col("user_json_").visits)
            .withColumn(self.getOutputCol(), arr_extract(f.col("visits")))
            # "user_json_" is the parsed working column; it is dropped as well
            # so it no longer leaks into the output.
            .drop("visits", "user_json", "user_json_")
        )

In [6]:
# Load the labelled data set: drop rows whose gender is '-', map the gender
# and age labels to their codes and cast them to int, then repartition and
# cache the result.
raw_items = (
    spark.read.csv("/labs/slaba04", sep="\t", header=True, schema=schema)
    .filter("gender != '-'")
    # Restrict the replacement to the two label columns; without `subset` it
    # is applied to every string column, including uid and user_json.
    .na.replace(replace_dict, subset=["gender", "age"])
    .withColumn("age", f.col("age").cast("int"))
    .withColumn("gender", f.col("gender").cast("int"))
    .repartition(PARTITIONS)
    .cache()
)

In [7]:
from pyspark.ml.feature import HashingTF, CountVectorizer, IDF, Normalizer
from pyspark.ml.classification import NaiveBayes

from pyspark.ml import Pipeline

# Feature stages, each reading the previous stage's output column:
# visited hosts -> term counts -> IDF weighting -> normalised vector.
eurls = ExtractURLs(inputCol="user_json")
tf = CountVectorizer(
    inputCol=eurls.getOutputCol(),
    minTF=2,
    maxDF=0.5,
)
idf = IDF(inputCol=tf.getOutputCol())
nrml = Normalizer(inputCol=idf.getOutputCol())

# Two Naive Bayes classifiers over the same feature column. The raw-prediction
# and probability columns get distinct "skip_N" names so the two models do not
# write to the same output columns.
gender_model = NaiveBayes(
    featuresCol=nrml.getOutputCol(),
    labelCol="gender",
    predictionCol="gender_p",
    rawPredictionCol="skip_1",
    probabilityCol="skip_2",
)
age_model = NaiveBayes(
    featuresCol=nrml.getOutputCol(),
    labelCol="age",
    predictionCol="age_p",
    rawPredictionCol="skip_3",
    probabilityCol="skip_4",
)

pipeline = Pipeline(stages=[eurls, tf, idf, nrml, age_model, gender_model])

# Fit on the full labelled data set.
model = pipeline.fit(raw_items)

In [161]:
# Score the labelled data and keep the labels next to the predictions.
df = (
    model.transform(raw_items)
    .select("uid", "gender", "age", "gender_p", "age_p")
)

# Проверка модели

In [9]:
# Hold-out check: sample with fraction 0.9 from each gender class (fixed seed)
# for training and fit a fresh copy of the pipeline on that sample.
train = (
    raw_items
    .sampleBy("gender", fractions={0: 0.9, 1: 0.9}, seed=5757)
    .repartition(PARTITIONS)
)
test_model = pipeline.fit(train)

# Rows whose uid does not appear in `train` form the test set.
test = raw_items.join(train, on="uid", how="leftanti")
predictions = test_model.transform(test)

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
# Accuracy of the gender predictions on the held-out rows.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol="gender",
    predictionCol="gender_p",
)
gender_acc = evaluator.evaluate(predictions)

In [13]:
# Accuracy of the age-bucket predictions on the held-out rows.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol="age",
    predictionCol="age_p",
)
age_acc = evaluator.evaluate(predictions)

In [14]:
# Print both accuracies on separate lines, then show whether their product
# exceeds 0.25.
print(gender_acc, age_acc, sep="\n")
gender_acc * age_acc > 0.25

0.6358333333333334
0.4288888888888889


True

# Создание стрима

In [276]:
import json

# Inverse of the label encoding: predicted class (a float) -> original label.
inv_gender_dict = {1.0: "F", 0.0: "M"}
inv_age_dict = {0.0:">=55", 1.0:"45-54", 2.0:"35-44", 3.0:"25-34", 4.0:"18-24"}


def _pack_prediction(uid, age_code, gender_code):
    """Serialise one prediction as a JSON object string.

    Args:
        uid: user identifier; emitted as JSON ``null`` when it is None.
        age_code: predicted age class, a key of ``inv_age_dict``.
        gender_code: predicted gender class, a key of ``inv_gender_dict``.

    Returns:
        A string such as ``{"uid":"u1", "age":"18-24", "gender":"F"}``.

    Raises:
        KeyError: if either code is not a known class.
    """
    record = {
        "uid": uid,
        "age": inv_age_dict[age_code],
        "gender": inv_gender_dict[gender_code],
    }
    # json.dumps escapes quotes/backslashes in uid, which the previous
    # string template did not. The separators and ensure_ascii=False keep the
    # output byte-identical to that template for ordinary uids.
    return json.dumps(record, separators=(", ", ":"), ensure_ascii=False)


pack2json = udf(_pack_prediction, StringType())

In [277]:
# Kafka source options: broker address, the input topic to subscribe to, and
# the "latest" starting offsets.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_igor.dmitriev",
    "startingOffsets": "latest"
}

kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

# Both message fields are read as strings; `visits` is later spliced as-is
# into the {"visits": ...} wrapper that the model's input column expects.
schema_json = StructType(fields=[
    StructField("uid", StringType(), False),
    StructField("visits", StringType(), False),
])

# Parse the message value, then project straight to the two columns needed
# downstream: uid and the re-wrapped user_json.
kafka_sdf = (
    kafka_sdf
    .select(f.from_json(f.col("value").cast("string"), schema_json).alias("json_raw"))
    .select(
        f.col("json_raw").uid.alias("uid"),
        f.format_string('{"visits": %s}', f.col("json_raw").visits).alias("user_json"),
    )
)

# Score the stream and keep only the packed JSON string as the "value" column.
batch_df = (
    model.transform(kafka_sdf)
    .select(pack2json(f.col("uid"), f.col("age_p"), f.col("gender_p")).alias("value"))
)

In [300]:
# Kafka sink options: same broker, output topic for the predictions.
write_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "topic": "igor.dmitriev",
}

# Start the streaming query in append mode with a relative checkpoint path.
sink = (
    batch_df.writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("append")
    .start()
)

In [309]:
# Check whether the streaming query started above is still active.
sink.isActive

True

In [311]:
# Stop all active streaming queries.
# NOTE(review): kill_all is defined in a later cell of this file (In [98]),
# so a top-to-bottom run would raise NameError here. Move the definition
# above this call.
kill_all()

Stopped KafkaV2[Subscribe[input_igor.dmitriev]]


In [98]:
def kill_all():
    """Stop every active streaming query on the current Spark session.

    Prints one ``Stopped <description>`` line per query. The description is
    the first source's description from the query's last progress report.
    If the query has not reported any progress yet, its name (or, failing
    that, its id) is printed instead.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        # lastProgress is None until the first progress update. Indexing it
        # unconditionally raised TypeError before stop() was reached, which
        # left the query running.
        progress = query.lastProgress or {}
        sources = progress.get("sources") or []
        if sources:
            desc = sources[0]["description"]
        else:
            desc = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))

In [87]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()