In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g --executor-cores 3 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import *
import numpy as np

In [3]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.sql.functions import col, desc, pandas_udf, PandasUDFType, udf, regexp_replace, when, asc, lit, broadcast
from pyspark.sql.types import StructType, IntegerType, StructField, DateType, StringType, TimestampType, FloatType, ArrayType, LongType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql.functions import struct, to_json
from pyspark.sql.functions import shuffle, array, lit
from pyspark.sql.functions import col, explode
from pyspark.sql.functions import lower
from pyspark.sql.functions import struct, to_json
from pyspark.ml import Pipeline, PipelineModel

In [4]:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

In [5]:
schema = StructType([
    StructField("gender", StringType()),
    StructField("age", StringType()),
    StructField("uid", StringType()),
    StructField("user_json", StringType())
])

In [6]:
#Она расположена на hdfs: /labs/slaba04/.

#Поле gender принимает значения F (женщина) и M (мужчина).

#Поле age принимает значения диапазона возраста: 18-24, 25-34, 35-44, 45-54, >=55

#Поле uid принимает значения уникального ID пользователя (cookies): d50192e5-c44e-4ae8-ae7a-7cfe67c8b777.

#Поле user_json имеет внутри json со следующей схемой данных: {"visits": [{"url": "url1", "timestamp": "timestamp1"}, {"url": "url2", "timestamp": "timestamp2"}]}. В нем содержатся непосредственно логи посещения пользователем страниц вместе с временной меткой посещения.

In [7]:
df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header=True, schema=schema,sep='\t')

In [8]:
df.show(1)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 1 row



In [9]:
schema_visit = t.StructType([
    t.StructField('visits', t.ArrayType(
        t.StructType([
        t.StructField('url', StringType(), True),
        t.StructField('timestamp', LongType(), True)
                      ])
    ))
])

In [10]:
df1=(
    df
    .select('gender','age','uid', f.col('user_json').cast('string').alias('value'))
    .select('gender','age','uid', from_json(f.col('value'), schema_visit).alias('visits'))
    
)
df1.show(2)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|              visits|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|[[[http://zebra-z...|
|     M|25-34|d502331d-621e-472...|[[[http://sweetra...|
+------+-----+--------------------+--------------------+
only showing top 2 rows



In [11]:
df1.take(1)

[Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', visits=Row(visits=[Row(url='http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun', timestamp=1419688144068), Row(url='http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', timestamp=1426666298001), Row(url='http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', timestamp=1426666298000), Row(url='http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', timestamp=1426661722001), Row(url='http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', timestamp=1426661722000)]))]

In [12]:
df2=df1.select('gender','age','uid','visits.visits')
df2.take(1)

[Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', visits=[Row(url='http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun', timestamp=1419688144068), Row(url='http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', timestamp=1426666298001), Row(url='http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', timestamp=1426666298000), Row(url='http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', timestamp=1426661722001), Row(url='http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', timestamp=1426661722000)])]

In [13]:
df2.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- visits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)



In [14]:
df3=df2.select('gender','age','uid', explode(df2.visits.url).alias('url'))


In [15]:
df3.show(1)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|                 url|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|http://zebra-zoya...|
+------+-----+--------------------+--------------------+
only showing top 1 row



In [16]:
df4 = df3.withColumn('site_name', f.regexp_extract(df3.url, r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2)).drop('url').distinct()

In [17]:
df4.show(3)

+------+-----+--------------------+------------+
|gender|  age|                 uid|   site_name|
+------+-----+--------------------+------------+
|     M|35-44|33dc7928-4226-4d0...|panicnews.ru|
|     M|25-34|33ebe1d9-0ef3-486...|  aburmu4.tv|
|     F|25-34|33ee5b03-5e7d-478...| ofigenno.cc|
+------+-----+--------------------+------------+
only showing top 3 rows



In [18]:
df5 = df4.groupBy("gender","age","uid").\
                    agg(f.collect_list("site_name").alias("site_name")).cache()

In [19]:
df5 = df5.filter(df.gender != '-')

In [20]:
indexGender = StringIndexer(inputCol='gender', outputCol='gender_i')
indexAge = StringIndexer(inputCol='age', outputCol='age_i')
indexModelGender = indexGender.fit(df5)
indexModelAge = indexAge.fit(df5)
df_i = indexModelGender.transform(df5)
df_i = indexModelAge.transform(df_i)

In [21]:
df_i.where('gender="M"').show(1)

+------+-----+--------------------+-------------------+--------+-----+
|gender|  age|                 uid|          site_name|gender_i|age_i|
+------+-----+--------------------+-------------------+--------+-----+
|     M|18-24|0735ae64-024e-445...|[km.ru, it-fecs.ru]|     0.0|  2.0|
+------+-----+--------------------+-------------------+--------+-----+
only showing top 1 row



In [22]:
hashingTF = HashingTF(numFeatures=100000, binary=False, inputCol="site_name", outputCol="site_name_h")
forestG = RandomForestClassifier(featuresCol='site_name_h', labelCol='gender_i', predictionCol='predictionG',
                                 probabilityCol='probabilityG', rawPredictionCol='rawPredictionG')
forestA = RandomForestClassifier(featuresCol='site_name_h', labelCol='age_i', predictionCol='predictionA',
                                 probabilityCol='probabilityA', rawPredictionCol='rawPredictionA')
strindG = IndexToString(inputCol='predictionG', outputCol='gender_p', labels=indexModelGender.labels)
strindA = IndexToString(inputCol='predictionA', outputCol='age_p', labels=indexModelAge.labels)

pipeline = Pipeline(stages=[
   
    hashingTF,
    forestG,
    forestA,
    strindG,
    strindA
])

In [23]:
train = df_i.sampleBy('gender_i', fractions={0: 0.8, 1: 0.8}, seed=42).cache()
val = df_i.join(train, on=['uid'], how='leftanti').cache()

In [24]:
pipiline_model=pipeline.fit(train)

In [25]:
valid=pipiline_model.transform(val)

In [26]:
valid.show(2)
valid.printSchema()

+--------------------+------+----+--------------------+--------+-----+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-----------+--------+-----+
|                 uid|gender| age|           site_name|gender_i|age_i|         site_name_h|      rawPredictionG|        probabilityG|predictionG|      rawPredictionA|        probabilityA|predictionA|gender_p|age_p|
+--------------------+------+----+--------------------+--------+-----+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-----------+--------+-----+
|098a0e00-8597-475...|     F|>=55|[jkeks.ru, youtub...|     1.0|  4.0|(100000,[56021,72...|[10.2714519598183...|[0.51357259799091...|        0.0|[8.61539862739998...|[0.43076993136999...|        0.0|       M|25-34|
|0a595fa1-bae0-41d...|     M|>=55|[go.mail.ru, avit...|     0.0|  4.0|(100000,[4925,288...|[10.3927615743713...|[0.51963807871856...|       

In [27]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="predictionG", labelCol="gender_i", metricName='areaUnderROC')

evaluator.evaluate(valid)

0.5151666096001996

In [28]:
def kill_all():
    streams = SparkSession.builder.getOrCreate().streams.active
    if streams:
        for s in streams:
            desc = s.lastProgress["sources"][0]["description"]
            s.stop()
            print("Stopped {s}".format(s=desc))

In [29]:
KAFKA_BOOTSTRAP_SERVERS='spark-master-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC='input_yuriy.gulynin'
KAFKA_OUTPUT_TOPIC='yuriy.gulynin'  

In [39]:
kafka_read_df=(
    spark.readStream
        .format('kafka')
        .option('kafka.bootstrap.servers',KAFKA_BOOTSTRAP_SERVERS)
        .option('subscribe', KAFKA_INPUT_TOPIC)
        .option('startingOffsets', 'earliest')
        .option('failOnDataLoss', 'False')
        .load()        
)

In [40]:
event_schema=t.StructType([   
        t.StructField("uid", t.StringType(), True),
        t.StructField("visits", t.StringType(), True)
])

In [41]:
visit_schema =  t.ArrayType(
    t.StructType([  
        t.StructField("url", StringType(), True),
        t.StructField("timestamp", LongType(), True)
    ])
)

In [42]:
! hdfs dfs -rm -R /user/yuriy.gulynin/tmp/lab04/checkpointLocation

kafka_read_df = (spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVERS)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)

clean_df = (kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select('evenT.uid', f.from_json(f.col('evenT.visits'), visit_schema).alias('visits'))
    .withColumn('url', f.col('visits.url'))
    .drop('visits')
)

clean_df2=clean_df.select('uid', explode(clean_df.url).alias('url'))
clean_df2=clean_df2.withColumn('site_name', f.regexp_extract('url', r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2)).distinct().drop("url")
clean_df2 = clean_df2.groupBy("uid").\
                    agg(f.collect_list("site_name").alias("site_name"))



predictions_df = pipiline_model.transform(clean_df2) \
.select('uid', f.col('gender_p').alias('gender'), f.col('age_p').alias('age'))

kafka_out_df = predictions_df.select(to_json(struct(*predictions_df.columns)).alias('value'))

kafka_write_stream = (
    kafka_out_df
    .writeStream
    .format("kafka")
    .outputMode("complete")
    .option("checkpointLocation", "tmp/lab04/checkpointLocation")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("topic", KAFKA_OUTPUT_TOPIC)
)
kafka_write_stream.start()


22/11/08 13:03:55 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/yuriy.gulynin/tmp/lab04/checkpointLocation' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/yuriy.gulynin/.Trash/Current/user/yuriy.gulynin/tmp/lab04/checkpointLocation


<pyspark.sql.streaming.StreamingQuery at 0x7f5cac069240>

In [43]:
kill_all()

Stopped KafkaV2[Subscribe[input_yuriy.gulynin]]
Stopped KafkaV2[Subscribe[input_yuriy.gulynin]]


In [44]:
spark.stop()