In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


**Load Data**

In [2]:
data = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)
print('Data size: {}'. format(data.count()))

Data size: 41138


**Extract data from json**

In [3]:
json_data = spark.read.json(data.rdd.map(lambda r: r.user_json))

parsed_json = json_data.select('visits.url', 'visits.timestamp')

Feature Engineering

In [4]:
parsed_json.printSchema()

root
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- timestamp: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [5]:
parsed_json.show(2)

+--------------------+--------------------+
|                 url|           timestamp|
+--------------------+--------------------+
|[http://zebra-zoy...|[1419688144068, 1...|
|[http://sweetradi...|[1419717886224, 1...|
+--------------------+--------------------+
only showing top 2 rows



**Use only one feature**

In [7]:
from pyspark.sql.functions import col, size
parsed_json = parsed_json.withColumn("sites_cnt", size("timestamp"))

**Concat Dataframes**

In [9]:
from pyspark.sql.functions import monotonically_increasing_id
data = data.withColumn("id", monotonically_increasing_id())
parsed_json = parsed_json.withColumn("id", monotonically_increasing_id())
df = data.join(parsed_json, "id", "outer").drop("id")
# print(df.count())


In [12]:
df = df.select('uid', 'gender', 'age', 'sites_cnt'
              )

In [13]:
df = df.filter(df.age != '-')

In [16]:
import pyspark.sql.functions as F
df = df.withColumn('target', 
                    F.concat(F.col('gender'),F.lit('_'), F.col('age')))

In [19]:
df = df.fillna(0, subset=['sites_cnt'
                         ])

**One model**

In [20]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=['sites_cnt'
              ],
    outputCol="features")

df_all = assembler.transform(df)
df_all = df_all.select(['uid', 'target', 'features'])

In [21]:
df_all = df_all.withColumn('target', F.when(col('target') == 'M_25-34', 1).\
                  when(col('target') == 'M_35-44', 2).\
                       when(col('target') == 'M_18-24', 3).\
                            when(col('target') == 'M_45-54', 4).\
                                 when(col('target') == 'M_>=55', 5).\
                   when(col('target') == 'F_25-34', 6).\
                  when(col('target') == 'F_35-44', 7).\
                       when(col('target') == 'F_18-24', 8).\
                            when(col('target') == 'F_45-54', 9).\
                                 when(col('target') == 'F_>=55', 10))

In [22]:
train = df_all.sampleBy("target", fractions={1: 0.8, 2: 0.8, 3: 0.8, 4: 0.8, 5: 0.8,
                                         6: 0.8, 7: 0.8, 8: 0.8, 9: 0.8, 10: 0.8}, seed=5757)
val = df_all.join(train, on=['uid'], how="leftanti")

In [23]:
train.show(2)

+--------------------+------+--------+
|                 uid|target|features|
+--------------------+------+--------+
|d5288ed8-8253-45d...|     1|  [19.0]|
|d52ab244-0a9c-434...|     1|  [20.0]|
+--------------------+------+--------+
only showing top 2 rows



In [625]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol="target", featuresCol="features", numTrees=500, 
                            maxDepth=2, seed=2021)


In [626]:
%%time
rf_model = rf.fit(train)

CPU times: user 292 ms, sys: 149 ms, total: 441 ms
Wall time: 2min 21s


In [627]:
predictions_model = rf_model.transform(val)

In [628]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_model)
print(accuracy)
# print("Test Error = %g" % (1.0 - accuracy))

0.23748251748251747


In [608]:
rf_model.save('rf_model')

In [None]:
0.23552447552447553  == 0.248 LB
0.24125874125874125

**To submit file**

In [29]:
import pyspark.sql.functions as f
predicted_labels = predictions_model.withColumn('prediction', F.when(col('prediction') == 1, 'M_25-34').\
                  when(col('prediction') == 2, 'M_35-44').\
                       when(col('prediction') == 3, 'M_18-24').\
                            when(col('prediction') == 4, 'M_45-54').\
                                 when(col('prediction') == 5, 'M_>=55').\
                   when(col('prediction') == 6, 'F_25-34').\
                  when(col('prediction') == 7, 'F_35-44').\
                       when(col('prediction') == 8, 'F_18-24').\
                            when(col('prediction') == 9, 'F_45-54').\
                                 when(col('prediction') == 10, 'F_>=55')).select('uid', 'prediction')

split_col = f.split(predicted_labels['prediction'], '_')
predicted_labels = predicted_labels.withColumn('gender', split_col.getItem(0))
predicted_labels = predicted_labels.withColumn('age', split_col.getItem(1)).select('uid', 'gender', 'age')

In [108]:
!hdfs dfs -ls streaming/chk/chk_kafka

Found 4 items
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-19 16:27 streaming/chk/chk_kafka/commits
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         45 2021-03-19 16:27 streaming/chk/chk_kafka/metadata
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-23 09:35 streaming/chk/chk_kafka/offsets
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-19 16:27 streaming/chk/chk_kafka/sources


In [112]:
!hdfs dfs -ls streaming/chk/chk_kafka/offsets

Found 2 items
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        438 2021-03-19 16:27 streaming/chk/chk_kafka/offsets/0
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        441 2021-03-23 09:35 streaming/chk/chk_kafka/offsets/1


### Read from Kafka

Read only one topic

In [613]:
kafka_params = {
#     "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
#     "kafka.bootstrap.servers": "spark-node-1.newprolab.com:6667",
    "kafka.bootstrap.servers": "spark-de-master-1.newprolab.com:6667",
    "subscribe": "input_dmitry.ulogov",

    "startingOffsets": "earliest",

#         "startingOffsets": """ { "test_topic0": { "0": 6 } } """,
#     "endingOffsets": """ { "test_topic0": { "0": 9 } }  """
    
    "checkpointLocation": "streaming/chk/chk_kafka_ulogov_d_lab_04"
}


# sdf = spark.read.format("kafka").option("failOnDataLoss", 'False').options(**kafka_params).load()
sdf = spark.read.format("kafka").options(**kafka_params).load()
sdf.printSchema()
sdf = sdf.select(             
#     col("key").cast("string"), \
                              col("value").cast("string"),\
#                               col("topic"), \
#                               col("partition"), \
#                               col("offset"), \
#                              col("timestamp"), \
#                               col("timestampType")
) 

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [614]:
sdf.printSchema()

root
 |-- value: string (nullable = true)



In [615]:
sdf.count()

15000

In [575]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType
json_schema = StructType(
    [
    StructField('uid', StringType()),
    StructField('visits', StringType())])

json_schema_vis = ArrayType(StructType([
                                StructField('url', StringType()),
                                StructField('timestamp', LongType())
                ]))

In [576]:
sdf_parsed = sdf.withColumn('weblog', F.from_json(col('value').cast('string'), json_schema))\
                        .select('weblog.*')\
                        .select('uid', F.from_json(col('visits'), json_schema_vis).alias('visits'))

In [577]:
sdf_parsed.printSchema()

root
 |-- uid: string (nullable = true)
 |-- visits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)



In [578]:
sdf_parsed.show(2)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
+--------------------+--------------------+
only showing top 2 rows



In [579]:
sdf_parsed = sdf_parsed.select('uid','visits.url', 'visits.timestamp')

In [580]:
from pyspark.sql.functions import col, size
sdf_parsed = sdf_parsed.withColumn("sites_cnt", size("timestamp"))

In [581]:
sdf_parsed.count()

10000

In [582]:
sdf_parsed = sdf_parsed.fillna(0, subset=['sites_cnt', 
#                           'days_passed', 'month_between', 'sites_per_day', 'year_equal_flg'
                         ])

In [583]:
sdf_parsed = assembler.transform(sdf_parsed)
sdf_parsed = sdf_parsed.select(['uid', 'features'])

In [255]:
sdf_parsed.show(2)

+--------------------+--------+
|                 uid|features|
+--------------------+--------+
|bd7a30e1-a25d-4cb...|[2000.0]|
|bd7a6f52-45db-49b...|[1284.0]|
+--------------------+--------+
only showing top 2 rows



In [584]:
pred_kafka = rf_model.transform(sdf_parsed)

In [585]:
pred_kafka.show(2)

+--------------------+--------+--------------------+--------------------+----------+
|                 uid|features|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|bd7a30e1-a25d-4cb...|[2000.0]|[0.0,121.27558947...|[0.0,0.2425511789...|       1.0|
|bd7a6f52-45db-49b...|[1284.0]|[0.0,121.27558947...|[0.0,0.2425511789...|       1.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 2 rows



In [586]:
from pyspark.sql.types import FloatType, IntegerType

pred_kafka = pred_kafka.withColumn("prediction", pred_kafka["prediction"].cast(FloatType()))
pred_kafka = pred_kafka.withColumn("prediction", pred_kafka["prediction"].cast(IntegerType()))

In [587]:
pred_kafka.printSchema()

root
 |-- uid: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: integer (nullable = true)



In [588]:
import pyspark.sql.functions as f
pred_kafka = pred_kafka.withColumn('prediction', F.when(col('prediction') == 1, 'M_25-34').\
                  when(col('prediction') == 2, 'M_35-44').\
                       when(col('prediction') == 3, 'M_18-24').\
                            when(col('prediction') == 4, 'M_45-54').\
                                 when(col('prediction') == 5, 'M_>=55').\
                   when(col('prediction') == 6, 'F_25-34').\
                  when(col('prediction') == 7, 'F_35-44').\
                       when(col('prediction') == 8, 'F_18-24').\
                            when(col('prediction') == 9, 'F_45-54').\
                                 when(col('prediction') == 10, 'F_>=55')).select('uid', 'prediction')

split_col = f.split(pred_kafka['prediction'], '_')
predicted_labels = pred_kafka.withColumn('gender', split_col.getItem(0))
predicted_labels = predicted_labels.withColumn('age', split_col.getItem(1)).select('uid', 'gender', 'age')

In [299]:
predicted_labels.show(2)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|bd7a30e1-a25d-4cb...|     M|25-34|
|bd7a6f52-45db-49b...|     M|25-34|
+--------------------+------+-----+
only showing top 2 rows



In [589]:
from pyspark.sql.functions import *
finish = predicted_labels.select(to_json(struct("uid","gender","age"))\
                                        .alias("value"))

In [590]:
finish.show(10,False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|{"uid":"bd7a30e1-a25d-4cbf-a03f-61748cbe540e","gender":"M","age":"25-34"}|
|{"uid":"bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd","gender":"M","age":"25-34"}|
|{"uid":"bd7a7fd9-ab06-42f5-bf0f-1cbb0463004c","gender":"M","age":"25-34"}|
|{"uid":"bd7c5d7a-0def-41d1-895f-fdb96c56c2d4","gender":"M","age":"25-34"}|
|{"uid":"bd7e54a2-0215-45cb-a869-9efebf250e38","gender":"M","age":"25-34"}|
|{"uid":"bd7e9797-4cdb-46e1-a540-f3ea010605ad","gender":"M","age":"25-34"}|
|{"uid":"bd7e9ec7-fb67-45eb-8ad3-209d01d15ae6","gender":"M","age":"25-34"}|
|{"uid":"bd8056df-cc25-4b63-bc12-a46f888baa49","gender":"M","age":"25-34"}|
|{"uid":"bd818690-73d2-445d-be5d-5c8f748dbb19","gender":"M","age":"25-34"}|
|{"uid":"bd81e006-f059-4cdd-b716-3467c78d1312","gender":"M","age":"25-34"}|
+-----------

In [591]:
finish.printSchema()

root
 |-- value: string (nullable = true)



In [592]:
finish.count()

10000

In [593]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-de-master-1.newprolab.com:6667',
   "topic": "dmitry.ulogov"
}
# finish.writeStream.format("kafka").options(**write_kafka_params)\
#     .option("checkpointLocation", "streaming/chk/chk_kafka_ulogov_d_lab_04")\
#     .outputMode("append").start()

finish.write.format("kafka").options(**write_kafka_params).save()
#     .outputMode("append").start()

**Посмотреть данные в кафке**

In [616]:
KAFKA_BOOTSTRAP_SERVER = 'spark-de-master-1.newprolab.com:6667'
# KAFKA_BOOTSTRAP_SERVER = 'spark-master-1.newprolab.com:6667'
INPUT_KAFKA_TOPIC = 'input_dmitry.ulogov'
OUTPUT_KAFKA_TOPIC = 'dmitry.ulogov'

In [617]:
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': OUTPUT_KAFKA_TOPIC,
    'startingOffsets': 'earliest'
}
kafka_sdf = (
    spark
    .read
    .format('kafka')
    .options(**read_kafka_params)
    .load()
    .cache()
)
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [618]:
kafka_sdf.select('value').show(2, vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | [7B 22 75 69 64 22 3A 22 62 64 37 61 33 30 65 31 2D 61 32 35 64 2D 34 63 62 66 2D 61 30 33 66 2D 36 31 37 34 38 63 62 65 35 34 30 65 22 2C 22 67 65 6E 64 65 72 22 3A 22 4D 22 2C 22 61 67 65 22 3A 22 32 35 2D 33 34 22 7D] 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | [7B 22 75 69 64 22 3A 22 62 64 37 61 36 66 35 32 2D 34 35 64 62 2D 34 39 62 66 2D 39 30 66 32 2D 61 33 62 30 37 61 39 62 37 62 63 64 22 2C 22 67 65 6E 64 65 72 22 3A 22 4D 22 2C 22 61 67 65 22 3A 22 32 35 2D 33 34 22 7D] 
only showing top 2 rows



In [619]:
print('count',kafka_sdf.count())
kafka_sdf.show(3)

count 10000
+----+--------------------+-------------+---------+------+--------------------+-------------+
| key|               value|        topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|dmitry.ulogov|        0| 10000|2021-03-23 14:30:...|            0|
|null|[7B 22 75 69 64 2...|dmitry.ulogov|        0| 10001|2021-03-23 14:30:...|            0|
|null|[7B 22 75 69 64 2...|dmitry.ulogov|        0| 10002|2021-03-23 14:30:...|            0|
+----+--------------------+-------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [620]:
kafka_sdf.select( col("value").cast("string")).show(2, False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|{"uid":"bd7a30e1-a25d-4cbf-a03f-61748cbe540e","gender":"M","age":"25-34"}|
|{"uid":"bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd","gender":"M","age":"25-34"}|
+-------------------------------------------------------------------------+
only showing top 2 rows



## Streaming

In [69]:
def kill_all():
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        desc = s.lastProgress["sources"][0]["description"]
        s.stop()
        print("Stopped {s}".format(s=desc))

In [413]:
def create_console_sink(df):
    return df \
            .writeStream \
            .format("console") \
            .trigger(processingTime="5 seconds") \
            .option("truncate", "false") \
            .option("numRows", "20")\
#             .option("checkpointLocation", "streaming/chk/chk_kafka_ulogov_d_lab_04")

In [414]:
!hdfs dfs -ls streaming/chk/

Found 2 items
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-19 16:27 streaming/chk/chk_kafka
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-23 12:28 streaming/chk/chk_kafka_ulogov_d_lab_04


In [595]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-de-master-1.newprolab.com:6667',
    "subscribe": "input_dmitry.ulogov",
    "startingOffsets": "latest"
}

kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()


parsed_sdf = kafka_sdf.select(
#     col("key").cast("string"), \
                              col("value").cast("string"),\
#                               col("topic"), \
#                               col("partition"), \
#                               col("offset"), \
#                              col("timestamp"), \
#                               col("timestampType")
)                            


parsed_sdf.printSchema()

sink = create_console_sink(parsed_sdf)

sq = sink.start()

root
 |-- value: string (nullable = true)



In [596]:
sdf_parsed = parsed_sdf.withColumn('weblog', F.from_json(col('value').cast('string'), json_schema))\
                        .select('weblog.*')\
                        .select('uid', F.from_json(col('visits'), json_schema_vis).alias('visits'))

In [597]:
sdf_parsed.printSchema()

root
 |-- uid: string (nullable = true)
 |-- visits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)



In [598]:
sdf_parsed = sdf_parsed.select('uid','visits.url', 'visits.timestamp')

In [599]:
from pyspark.sql.functions import col, size
sdf_parsed = sdf_parsed.withColumn("sites_cnt", size("timestamp"))

In [600]:
sdf_parsed = sdf_parsed.fillna(0, subset=['sites_cnt', 
#                           'days_passed', 'month_between', 'sites_per_day', 'year_equal_flg'
                         ])

In [601]:
sdf_parsed = assembler.transform(sdf_parsed)
sdf_parsed = sdf_parsed.select(['uid', 'features'])

In [602]:
pred_kafka = rf_model.transform(sdf_parsed)

In [603]:
from pyspark.sql.types import FloatType, IntegerType

pred_kafka = pred_kafka.withColumn("prediction", pred_kafka["prediction"].cast(FloatType()))
pred_kafka = pred_kafka.withColumn("prediction", pred_kafka["prediction"].cast(IntegerType()))

In [604]:
import pyspark.sql.functions as f
pred_kafka = pred_kafka.withColumn('prediction', F.when(col('prediction') == 1, 'M_25-34').\
                  when(col('prediction') == 2, 'M_35-44').\
                       when(col('prediction') == 3, 'M_18-24').\
                            when(col('prediction') == 4, 'M_45-54').\
                                 when(col('prediction') == 5, 'M_>=55').\
                   when(col('prediction') == 6, 'F_25-34').\
                  when(col('prediction') == 7, 'F_35-44').\
                       when(col('prediction') == 8, 'F_18-24').\
                            when(col('prediction') == 9, 'F_45-54').\
                                 when(col('prediction') == 10, 'F_>=55')).select('uid', 'prediction')

split_col = f.split(pred_kafka['prediction'], '_')
predicted_labels = pred_kafka.withColumn('gender', split_col.getItem(0))
predicted_labels = predicted_labels.withColumn('age', split_col.getItem(1)).select('uid', 'gender', 'age')

In [605]:
from pyspark.sql.functions import *
finish = predicted_labels.select(to_json(struct("uid","gender","age"))\
                                        .alias("column"))

Запись в Kafka

In [567]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "dmitry.ulogov"
}
finish.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append").start()

<pyspark.sql.streaming.StreamingQuery at 0x7f5f603287f0>