#### Лаба 4. Прогнозирование пола и возрастной категории — Spark Streaming

In [154]:
spark.stop()

In [155]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 6 --executor-memory 12g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [156]:
spark

###### Данные и библиотеки

In [157]:
# Библиотеки 
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier

In [158]:
df = spark.read.format("csv").option("header","true").option("sep", "\t").load("/labs/slaba04/gender_age_dataset.txt")
# df.select('user_json').show(1,False,False)

In [159]:
# датасет для обучения модели
# Note: проверить таргет на наличие пустых значений
# !hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head -n2

In [160]:
# формируем spark DF
path = '/labs/slaba04/gender_age_dataset.txt'

schema = t.StructType(fields=[
    t.StructField('gender', t.StringType()),
    t.StructField('age', t.StringType()),
    t.StructField('uid', t.StringType()),
    t.StructField('user_json', t.StringType()),
])

train_data = spark.read.csv(path, header=True, schema=schema, sep='\t')

In [161]:
train_data.show(1)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 1 row



In [162]:
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructType
from pyspark.sql.types import LongType
from pyspark.sql.functions import explode

In [163]:
from pyspark.sql.functions import col, explode
from pyspark.sql.types import TimestampType
from pyspark.sql.types import DateType
import pyspark.sql.functions as F

In [164]:
from pyspark.sql.functions import *

##### Формируем фичи

In [165]:
train_data.select('gender','age','uid', train_data["user_json"]).show(1,False)

+------+-----+------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|gender|age  |uid                                 |user_json                                                                                                                                                                                         

In [166]:
# схема для json с визитами
visits_schema = t.StructType([
    t.StructField('visits', t.ArrayType(
        t.StructType([
            t.StructField('url', t.StringType(), True),
            t.StructField('timestamp', t.LongType(), True)
        ])
    ))
])

In [167]:
train_data2=train_data.withColumn("user_json",from_json(col("user_json"),visits_schema)['visits'])
train_data2.show(1,False,False)

+------+-----+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|gender|age  |uid                                 |user_json                                                                                                                                                                                                                                                                                                                   

In [168]:
train_data2.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)



In [169]:
train_data3 = train_data2.withColumn("user_json", explode("user_json")).select("*", col("user_json")["url"].alias("url"), col("user_json")["timestamp"].alias("timestamp"))
train_data4 = train_data3.select('gender','age','uid','url','timestamp')
train_data4.show(20)

+------+-----+--------------------+--------------------+-------------+
|gender|  age|                 uid|                 url|    timestamp|
+------+-----+--------------------+--------------------+-------------+
|     F|18-24|d50192e5-c44e-4ae...|http://zebra-zoya...|1419688144068|
|     F|18-24|d50192e5-c44e-4ae...|http://news.yande...|1426666298001|
|     F|18-24|d50192e5-c44e-4ae...|http://www.sotovi...|1426666298000|
|     F|18-24|d50192e5-c44e-4ae...|http://news.yande...|1426661722001|
|     F|18-24|d50192e5-c44e-4ae...|http://www.sotovi...|1426661722000|
|     M|25-34|d502331d-621e-472...|http://sweetradin...|1419717886224|
|     M|25-34|d502331d-621e-472...|http://sweetradin...|1419717884437|
|     M|25-34|d502331d-621e-472...|http://sweetradin...|1419717816375|
|     M|25-34|d502331d-621e-472...|http://101.ru/?an...|1419717804934|
|     M|25-34|d502331d-621e-472...|http://sweetradin...|1419714194423|
|     M|25-34|d502331d-621e-472...|http://sweetradin...|1419713998481|
|     

In [170]:
train_data55 = train_data4.withColumn("host", expr("regexp_extract(parse_url(url, 'HOST'),'(.*)')"))\
.withColumn("target", concat(col("gender"), col("age")))\
.select('gender','age','uid','host','target').where("host!='null'").where("host!='https'").where("host!='http'").distinct()

train_data55.where("uid ='03378baf-0b69-4f6c-b06a-eb539d9679a6'").show(30,truncate=False)

+------+-----+------------------------------------+------------------------+------+
|gender|age  |uid                                 |host                    |target|
+------+-----+------------------------------------+------------------------+------+
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|pddmaster.ru            |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|tgg.my1.ru              |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|www.cleaning-city.ru    |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|ledashop.ru             |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|alldbases.com           |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|motormaran.ru           |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|yandex.ru               |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|rusfishing.ru           |M35-44|
|M     |35-44|03378baf-0b69-4f6c-b06a-eb539d9679a6|scootermag.ru           |

In [171]:
train_data66 = train_data55.where("target!='--'").groupBy('gender','age',"uid",'target')\
.agg(collect_list("host").alias('host_list'))


train_data66.where("uid ='03378baf-0b69-4f6c-b06a-eb539d9679a6'").show(1,False,False)

+------+-----+------------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|gender|age  |uid                                 |target|host_list                                                                

#####  Pipeline

In [172]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import CountVectorizer

In [173]:
ht = HashingTF(inputCol="host_list", outputCol="features", numFeatures=10000)
train_data7 = ht.transform(train_data66)
# train_data7.show(1,truncate=False)

In [174]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

label_stringIdx = StringIndexer(inputCol = "target", outputCol = "target_")
pipeline = Pipeline(stages=[label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(train_data7)
dataset = pipelineFit.transform(train_data7)
# dataset.show(30)

In [175]:
gender_age_dic = dataset.select('gender','age',col('target_').alias('prediction')).distinct()
# gender_age_dic.show()

In [176]:
train = dataset.sampleBy("target_", fractions={0: 0.096, 1: 0.1207, 2: 0.151, 3: 0.185, 4: 0.265, 5:0.311, 6: 0.365, 7: 0.375, 8: 0.865, 9: 1}, seed=5757)
# train = dataset.sampleBy("target_", fractions={0: 0.4, 1: 0.5, 2: 0.7, 3: 0.6, 4: 1, 5:0.311, 6: 0.365, 7: 0.375, 8: 0.865, 9: 1}, seed=5757)

In [177]:
train2 = train.select('uid','features','target_')
# train2 = train2.cache()

In [178]:
tr, ts = dataset.randomSplit(weights=[0.7,0.3])

In [None]:
train2.cache()

In [179]:
train2.where("target_==0").count()

841

In [181]:
train2.where("target_==1").count()

793

In [182]:
train2.where("target_==2").count()

794

In [183]:
train2.where("target_==3").count()

785

In [184]:
train2.where("target_==4").count()

747

In [185]:
train2.where("target_==5").count()

794

In [186]:
train2.where("target_==6").count()

764

In [187]:
train2.where("target_==7").count()

791

In [188]:
train2.where("target_==8").count()

775

In [189]:
train2.where("target_==9").count()

779

In [190]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [191]:
# train2 = train.select('uid','features','target_')
train2.show()

+--------------------+--------------------+-------+
|                 uid|            features|target_|
+--------------------+--------------------+-------+
|07591afc-f28e-41d...|(10000,[893,3581,...|    4.0|
|8ccf503e-2b77-46f...|(10000,[3427],[1.0])|    4.0|
|f9b5782f-4be2-4cc...|(10000,[771,4361,...|    4.0|
|416477cd-c500-495...|(10000,[43,93,196...|    1.0|
|260efd0c-5fb5-4ca...|(10000,[38,154,45...|    3.0|
|4b21a7d3-99cc-40e...|(10000,[973,1790,...|    3.0|
|a3d6f72f-cde4-4eb...|(10000,[9,93,439,...|    3.0|
|ff0255ad-9d0d-416...|(10000,[3055,9357...|    3.0|
|d1d60f28-9f6a-467...|(10000,[267,1001,...|    5.0|
|d23982af-56e6-4b2...|(10000,[23,31,35,...|    5.0|
|eb9eb062-2a3e-4ad...|(10000,[3433,4811...|    5.0|
|f738e48b-f177-4c5...|(10000,[91,450,54...|    5.0|
|fc9e3e88-71f9-440...|(10000,[9,93,865,...|    5.0|
|6d3ec8cb-a6e7-42e...|(10000,[6830],[1.0])|    7.0|
|72183331-6898-49f...|(10000,[392,1790,...|    7.0|
|09531aa4-91d2-4ff...|(10000,[8459,9506...|    0.0|
|19377e29-54

In [None]:
rf = RandomForestClassifier(
    
      featuresCol='features'
    , labelCol='target_'
    , maxDepth=15
    , maxBins=32
    , minInstancesPerNode=1
    , minInfoGain=0.0
    , cacheNodeIds=False
    , checkpointInterval=10
    , impurity='entropy'
    , numTrees=500
    , featureSubsetStrategy='auto'
    , seed=None
    , subsamplingRate=1.0

)

model = rf.fit(train2)

In [122]:
pred = model.transform(dataset)

In [123]:
pred.select('uid','target','target_','prediction').show(20)

In [124]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'target_', predictionCol = 'prediction', metricName = 'accuracy')
mlpacc = evaluator.evaluate(pred)
mlpacc

0.2577774246096388

##### Kafka 

In [145]:
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_petr.manannikov'
KAFKA_OUTPUT_TOPIC = 'petr.manannikov'

In [270]:
from pyspark.sql.types import StringType, IntegerType

In [248]:
# clean_df.count()
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import ArrayType

In [142]:
# чтение стрима

kafka_sdf = (
    spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)


Py4JJavaError: An error occurred while calling o1784.load.
: java.lang.NullPointerException
	at org.apache.spark.sql.kafka010.KafkaMicroBatchReader.<init>(KafkaMicroBatchReader.scala:72)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.createMicroBatchReader(KafkaSourceProvider.scala:143)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.createMicroBatchReader(KafkaSourceProvider.scala:44)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:182)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [250]:
kafka_sdf

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [251]:
event_schema = StructType([
    StructField('uid', StringType(), True),
    StructField('visits', StringType(), True),
])


visit_schema = ArrayType(
    StructType([
        StructField('url', StringType(), True),
        StructField('timestamp', LongType(), True)
    ])
)


clean_df = (
    kafka_sdf
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)

In [1855]:
# Парсинг бинарного файла из кафки

event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])

visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)
clean_df = (
    kafka_sdf
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select('event.uid',f.from_json(f.col('event.visits'), visit_schema).alias('visits'))
)

clean_df.show(3)

AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nkafka'

In [1641]:
vis_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)

In [129]:
clean_df2 = clean_df.withColumn("visits", explode("visits")).select("*", col("visits")["url"].alias("url"))
clean_df3 = clean_df2.select('uid','url')
clean_df3.show(20)

+--------------------+--------------------+
|                 uid|                 url|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|http://www.interf...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://amerikan-g...|
|bd7a30e1-a25d-4cb...|http://tv.jampo.t...|
|bd7a30e1-a25d-4cb...|http://www.sovsek...|
|bd7a30e1-a25d-4cb...|http://mar

In [130]:
clean_df4 = clean_df3.withColumn("host", expr("regexp_extract(parse_url(url, 'HOST'),'(.*)')"))\
.select('uid','host').distinct()  #.where("host!='null'")

clean_df4.show(10,truncate=False)

+------------------------------------+------------------------+
|uid                                 |host                    |
+------------------------------------+------------------------+
|bd7a30e1-a25d-4cbf-a03f-61748cbe540e|www.eer.ru              |
|bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd|films.imhonet.ru        |
|bd9778f1-e2dc-4630-b449-1d7265cb5e41|zx-spectrum.narod.ru    |
|bda1d504-87c3-4cf7-8ea8-325f60288918|trinixy.ru              |
|bda8be2f-c4ff-4be5-b41c-b02fb59a69bc|www.rusbonds.ru         |
|bda8be2f-c4ff-4be5-b41c-b02fb59a69bc|www.npp-sadovod.ru      |
|bdae9068-92ed-434a-a0ed-f2afb183a02f|www.tonnel.ru           |
|bdb3f69d-61fb-41db-92d5-19f579c29921|cache.betweendigital.com|
|bdbb4a6a-05a3-4d4a-9ae4-c507477d63fa|www.p-lib.ru            |
|bdc34323-c6a4-4bc1-8b5c-a0e17ee0ffd3|ogo.micropromo.ru       |
+------------------------------------+------------------------+
only showing top 10 rows



In [132]:
clean_df5 = clean_df4.groupBy("uid")\
.agg(collect_list("host").alias('host_list'))

# clean_df5.show(5,False,False)

In [134]:
ht = HashingTF(inputCol="host_list", outputCol="features", numFeatures=10000)
clean_df6 = ht.transform(clean_df5)
# clean_df6.show(1,truncate=False)

In [135]:
pr = model.transform(clean_df6)

In [286]:
# pr.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 uid|           host_list|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|0192cc54-559c-4c8...|[vestiprim.ru, n5...|(10000,[120,1058,...|[51.8969246743789...|[0.10379384934875...|       1.0|
|019acd5e-be9a-4cd...|[zhenskoe-mnenie....|(10000,[2155,3273...|[49.7621020049179...|[0.09952420400983...|       1.0|
|1e14a504-276e-448...|[www.gismeteo.ru,...|(10000,[2304,3016...|[55.3047357805689...|[0.11060947156113...|       7.0|
|1eb313db-34ff-4bf...|     [zamok.gidm.ru]|(10000,[7792],[1.0])|[52.2175670241649...|[0.10443513404832...|       1.0|
|47565df3-13e3-460...|[www.grosmet.ru, ...|(10000,[266,3166,...|[51.1658327815553...|[0.10233166556311...|       1.0|
|4766a8ab-e9b6-4e0...|[www.st54.ru, www...|(10000,[979,1

In [282]:
# gender_age_dic.show()

+------+-----+----------+
|gender|  age|prediction|
+------+-----+----------+
|     M|45-54|       6.0|
|     M| >=55|       9.0|
|     F|25-34|       1.0|
|     M|35-44|       2.0|
|     F| >=55|       8.0|
|     M|18-24|       7.0|
|     F|18-24|       4.0|
|     F|45-54|       5.0|
|     F|35-44|       3.0|
|     M|25-34|       0.0|
+------+-----+----------+



In [147]:
predictions_df =pr.join(f.broadcast(gender_age_dic), on='prediction', how="inner").select('uid','gender','age')
# transform.show()
# predictions_df.show()

In [148]:
kafka_out_df = predictions_df.select(F.to_json(F.struct(*predictions_df.columns)).alias('value'))

kafka_write_stream = (
    kafka_out_df
    .writeStream
    .format("kafka")
    .outputMode("complete")
    .option("checkpointLocation", "tmp/lab04/checkpointLocation")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVER)
    .option("topic", KAFKA_OUTPUT_TOPIC)
)
kafka_write_stream.start()

AnalysisException: "'writeStream' can be called only on streaming Dataset/DataFrame;"