In [1]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


In [2]:
!hdfs dfs -head /labs/slaba04/gender_age_dataset.txt

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
M	25-34	d502331d-621e-4721-ada2-5d30b2c3801f	{"visits": [{"url": "http://sweetrading.ru/?p=900", "timestamp": 1419717886224}, {"url": "http://sweetrading.ru/?p=884", "timestamp": 1419717884437}, {"url": "http://sweetrading.ru

In [3]:
import os
import sys
# Environment read by PySpark at start-up: the Python interpreter path, the
# Spark installation directory, and the spark-submit arguments (3 executors).
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled Python sources so that `import pyspark` resolves.
# The py4j archive ends up first on sys.path, followed by the python directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Set the Spark application name, then obtain (or reuse) a session built
# from that configuration.
conf = SparkConf().set("spark.app.name", "alexey gurov lab4")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [180]:
import pyspark.sql.functions as f
from pyspark.sql.functions import udf, to_json, struct

import numpy as np

from pyspark.ml.feature import HashingTF
import re
import numpy as np
from numpy import NaN

from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

from pyspark.sql.types import StructType, StructField, StringType, LongType, \
                              IntegerType, DateType, FloatType, ArrayType, DoubleType

from pyspark.ml.classification import GBTClassifier, RandomForestClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [6]:
spark

### Определяем необходимые в дальнейшем функции

In [327]:
@udf(returnType=IntegerType())
def get_len(x):
    """Return the number of elements in an array column value.

    A SQL NULL reaches a Python UDF as ``None``; ``len(None)`` would raise and
    fail the whole task, so NULL input is mapped to NULL output instead.
    """
    if x is None:
        return None
    return len(x)


@udf(returnType=FloatType())
def get_mean(x):
    """Return the arithmetic mean of an array column value as a float.

    A SQL NULL reaches a Python UDF as ``None``; it is mapped to NULL output
    instead of raising inside the UDF.

    NOTE(review): the declared return type is FloatType (32-bit), so the mean
    of large values such as the ~1.4e12 timestamps in this dataset is stored
    with reduced precision -- confirm this is acceptable for the feature.
    """
    if x is None:
        return None
    return float(np.mean(x))


@udf(returnType=StringType())
def get_gender(x):
    """Return the first space-separated token of a ``'<gender> <age>'`` label.

    NULL input (``None``) yields NULL instead of raising ``AttributeError``,
    which happens when a prediction code has no matching label.
    """
    if x is None:
        return None
    return x.split(' ')[0]


@udf(returnType=StringType())
def get_age(x):
    """Return the second space-separated token of a ``'<gender> <age>'`` label.

    Returns NULL when the input is NULL (``None``) or contains no space,
    instead of raising ``AttributeError`` / ``IndexError`` inside the UDF.
    """
    if x is None:
        return None
    parts = x.split(' ')
    if len(parts) < 2:
        return None
    return parts[1]


def to_kafka_format(df):
    """Serialise each row's ``uid``, ``gender`` and ``age`` into a JSON string.

    Returns a DataFrame with a single column named ``value`` holding that
    JSON document.
    """
    payload = struct('uid', 'gender', 'age')
    return df.select(to_json(payload).alias('value'))


def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one ``Stopped ...`` line per query, using the description of the
    query's first source when it is available.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        # lastProgress is None until the query has reported its first
        # progress update; subscripting it would raise before s.stop() ran and
        # leave the query alive. Fall back to the query's name or id instead.
        progress = s.lastProgress
        if progress and progress.get("sources"):
            desc = progress["sources"][0]["description"]
        else:
            desc = s.name or s.id
        s.stop()
        print("Stopped {s}".format(s=desc))
        
        
        
def transform_data(model, df):
    """Turn raw JSON messages into JSON-encoded gender/age predictions.

    Parameters
    ----------
    model
        Fitted model whose ``transform`` is applied to a DataFrame with the
        columns ``uid``, ``sites_cnt`` and ``mean_timestamp`` and whose output
        contains a numeric ``prediction`` column.
    df
        DataFrame with a ``value`` column holding one JSON document per row.
        The document is read with the fields ``uid`` and ``visits``; ``visits``
        is read as a string and parsed a second time as a JSON array of
        ``{"url", "timestamp"}`` objects.

    Returns
    -------
    DataFrame
        A single string column ``value`` with ``uid``, ``gender`` and ``age``
        serialised as JSON (see ``to_kafka_format``).
    """
    message_schema = StructType([
        StructField('uid', StringType()),
        StructField('visits', StringType())
    ])

    visits_schema = ArrayType(StructType([
        StructField('url', StringType()),
        StructField('timestamp', LongType())
    ]))

    parsed = (
        df.withColumn('weblog', f.from_json(f.col('value').cast('string'), message_schema))
          .select('weblog.*')
          .select('uid', f.from_json(f.col('visits'), visits_schema).alias('visits'))
    )

    # from_json yields NULL for input it cannot parse. A NULL `visits` would
    # make the feature UDFs raise and terminate the streaming query, so such
    # messages are dropped here rather than taking the whole stream down.
    parsed = parsed.filter(f.col('visits').isNotNull())

    # Same two features the model was trained on: number of visits and the
    # mean of the visit timestamps.
    features = (
        parsed.select('uid', 'visits.timestamp')
              .withColumn('sites_cnt', get_len('timestamp'))
              .withColumn('mean_timestamp', get_mean('timestamp'))
              .select('uid', 'sites_cnt', 'mean_timestamp')
    )
    predictions = model.transform(features)

    # Numeric class code -> '<gender> <age>' label; position i holds code i + 1.
    label_names = (
        'M 25-34', 'M 35-44', 'M 18-24', 'M 45-54', 'M >=55',
        'F 25-34', 'F 35-44', 'F 18-24', 'F 45-54', 'F >=55',
    )
    label = None
    for code, name in enumerate(label_names, start=1):
        matches = f.col('prediction') == code
        label = f.when(matches, name) if label is None else label.when(matches, name)

    # Native split keeps the work in the JVM and yields NULL (instead of
    # raising) when no label matched the prediction code.
    label_parts = f.split(label, ' ')
    predictions = predictions.select(
        'uid',
        label_parts.getItem(0).alias('gender'),
        label_parts.getItem(1).alias('age'),
    )

    return to_kafka_format(predictions)
        
        
def create_console_sink(predictions):
    """Build, without starting, a Kafka writer for a streaming DataFrame.

    Despite its name, this function configures a Kafka sink, not a console
    sink. The console-only options ``truncate`` and ``numRows`` that were
    previously passed have no meaning for the Kafka sink and were removed.

    Parameters
    ----------
    predictions
        Streaming DataFrame to publish.
        NOTE(review): the Kafka sink is expected to read the payload from a
        column named ``value`` (as produced by ``to_kafka_format``) -- confirm
        against the Spark Kafka integration guide.

    Returns
    -------
    The configured stream writer in append mode; call ``.start()`` on it to
    launch the query. Relies on the notebook-level constants
    ``KAFKA_BOOTSTRAP_SERVER`` and ``OUTPUT_KAFKA_TOPIC`` being defined at
    call time.
    """
    write_kafka_params = {
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
        # Same value as the previously hard-coded "alexey.gurov".
        "topic": OUTPUT_KAFKA_TOPIC,
    }

    return (
        predictions.writeStream
        .format("kafka")
        .options(**write_kafka_params)
        .option("checkpointLocation", "streaming/chk/chk_alexey_gurov")
        .outputMode("append")
    )

### Считываем данные

In [147]:
# Column layout of the tab-separated training file; every column is read as
# a string, and `user_json` is parsed separately below.
schema = StructType([
    StructField('gender', StringType()),
    StructField('age', StringType()),
    StructField('uid', StringType()),
    StructField('user_json', StringType()),
])

# The schema is supplied explicitly, so the previous `inferSchema=true`
# option was redundant and has been dropped.
train_data = (
    spark.read.format("csv")
    .schema(schema)
    .option("header", "true")
    .option("delimiter", "\\t")
    .load("/labs/slaba04/gender_age_dataset.txt")
)

# Shape of the `user_json` document: {"visits": [{"url": ..., "timestamp": ...}, ...]}
visits_schema = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType()),
            StructField("timestamp", LongType())
        ])
    ))
])

# Parse the JSON once, then expose the visited URLs and their timestamps as
# two parallel array columns.
train_data = (
    train_data
    .withColumn('visits_from_json', f.from_json(f.col('user_json'), schema=visits_schema))
    .withColumn('visits', f.col('visits_from_json.visits.url'))
    .withColumn('timestamps', f.col('visits_from_json.visits.timestamp'))
)

In [148]:
# Drop the raw JSON and the intermediate parsed struct; keep only the labels,
# the user id and the two extracted array columns.
train_data = train_data.select('gender', 'age', 'uid', 'visits', 'timestamps')

In [32]:
train_data.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
train_data.show(5)

+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|    visits_from_json|              visits|          timestamps|
+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[[[http://zebra-z...|[http://zebra-zoy...|[1419688144068, 1...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[[[http://sweetra...|[http://sweetradi...|[1419717886224, 1...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[[[http://ru.orif...|[http://ru.orifla...|[1418840296062, 1...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[[[http://transla...|[http://translate...|[1418217864467, 1...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[[[https://mail.r...|[https://mail.ram...|[1427272415001, 1...|
+------+-----+--------------------+-----

In [48]:
train_data.groupBy("age").count().show()

+-----+-----+
|  age|count|
+-----+-----+
| >=55| 1679|
|45-54| 4744|
|    -| 5000|
|35-44| 9360|
|25-34|15457|
|18-24| 4898|
+-----+-----+



### Добавляем признаки для обучения модели, преобразуем имеющиеся в удобный вид

In [149]:
train_data = train_data.withColumn('sites_cnt', get_len('visits')) # Number of visits

In [150]:
# Feature: mean of each user's visit timestamps (stored as a 32-bit float by get_mean).
train_data = train_data.withColumn('mean_timestamp', get_mean('timestamps'))

In [152]:
# Combine gender and age into one '<gender> <age>' label and discard the rows
# whose age is the placeholder '-'.
train_data = (
    train_data
    .withColumn('target', f.concat(f.col('gender'), f.lit(' '), f.col('age')))
    .filter(f.col('age') != '-')
)

In [153]:
# Encode the '<gender> <age>' label as an integer class code 1..10.
# Labels not listed here become NULL.
train_data = train_data.withColumn(
    'target',
    f.when(f.col('target') == 'M 25-34', 1)
     .when(f.col('target') == 'M 35-44', 2)
     .when(f.col('target') == 'M 18-24', 3)
     .when(f.col('target') == 'M 45-54', 4)
     .when(f.col('target') == 'M >=55', 5)
     .when(f.col('target') == 'F 25-34', 6)
     .when(f.col('target') == 'F 35-44', 7)
     .when(f.col('target') == 'F 18-24', 8)
     .when(f.col('target') == 'F 45-54', 9)
     .when(f.col('target') == 'F >=55', 10)
)

In [154]:
# Modelling table: user id, the two features and the class label. It is cached
# because it is reused for both the train sample and the test anti-join.
X = train_data.select('uid', 'sites_cnt', 'mean_timestamp', 'target').cache()

In [155]:
X.show(5, vertical=False, truncate=False)

+------------------------------------+---------+--------------+------+
|uid                                 |sites_cnt|mean_timestamp|target|
+------------------------------------+---------+--------------+------+
|d50192e5-c44e-4ae8-ae7a-7cfe67c8b777|5        |1.4252688E12  |8     |
|d502331d-621e-4721-ada2-5d30b2c3801f|102      |1.41802865E12 |1     |
|d50237ea-747e-48a2-ba46-d08e71dddfdb|44       |1.4259028E12  |6     |
|d502f29f-d57a-46bf-8703-1cb5f8dcdf03|14       |1.41775733E12 |6     |
|d503c3b2-a0c2-4f47-bb27-065058c73008|212      |1.4269055E12  |5     |
+------------------------------------+---------+--------------+------+
only showing top 5 rows



### Готовим обучающую и тестовую выборки

In [156]:
# Stratified sample: take a 0.9 fraction of every class (codes 1..10),
# with a fixed seed for reproducibility.
X_train = X.sampleBy('target', fractions={label: 0.9 for label in range(1, 11)}, seed=42)

In [157]:
# Hold-out set: every row of X whose uid was not drawn into X_train.
# NOTE(review): this assumes uid is unique per row -- confirm.
X_test = X.join(X_train, on='uid', how="leftanti")

### Обучаем модель

In [158]:
feature_list = ['sites_cnt', 'mean_timestamp']

In [159]:
feature_assembler = VectorAssembler(inputCols=feature_list, outputCol='features')

In [160]:
# Random forest over the assembled feature vector: 500 trees, depth at most 4,
# fixed seed. The label column is the integer class code in `target`.
rfc = RandomForestClassifier(featuresCol='features', labelCol='target', seed=42, numTrees=500, maxDepth=4)

In [161]:
model_pipeline = Pipeline(stages=[feature_assembler, rfc])

In [162]:
model = model_pipeline.fit(X_train)

In [163]:
predictions = model.transform(X_test)

In [164]:
predictions.select('prediction').show(truncate=False)

+----------+
|prediction|
+----------+
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
|1.0       |
+----------+
only showing top 20 rows



In [166]:
# Accuracy of the predicted class against the true `target` on the hold-out set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="target",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

In [167]:
accuracy

0.24564362001124226

### Преобразуем предсказания к формату исходных данных

In [168]:
predictions = predictions.withColumn('prediction', f.when(f.col('prediction') == 1, 'M 25-34').\
                                                     when(f.col('prediction') == 2, 'M 35-44').\
                                                     when(f.col('prediction') == 3, 'M 18-24').\
                                                     when(f.col('prediction') == 4, 'M 45-54').\
                                                     when(f.col('prediction') == 5, 'M >=55').\
                                                     when(f.col('prediction') == 6, 'F 25-34').\
                                                     when(f.col('prediction') == 7, 'F 35-44').\
                                                     when(f.col('prediction') == 8, 'F 18-24').\
                                                     when(f.col('prediction') == 9, 'F 45-54').\
                                                     when(f.col('prediction') == 10, 'F >=55'))

In [169]:
# Split the decoded '<gender> <age>' label into its two parts.
predictions = (
    predictions
    .withColumn('gender', get_gender('prediction'))
    .withColumn('age', get_age('prediction'))
)

In [171]:
# Keep only the columns that match the layout of the source data.
predictions = predictions.select('uid', 'gender', 'age')

In [172]:
predictions.show()

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|d50237ea-747e-48a...|     M|25-34|
|d51cba2e-f666-46d...|     M|25-34|
|d532b175-ef8c-4ea...|     M|25-34|
|d53d9796-c6a1-4dc...|     M|25-34|
|d53ebbae-768e-470...|     M|25-34|
|d576d60e-8d3f-4dc...|     M|25-34|
|d5a8347b-a7cb-469...|     M|25-34|
|d5ab0ae2-ea3c-4a4...|     M|25-34|
|d5cedebe-3151-478...|     M|25-34|
|d5e4aa1e-d5ec-4b1...|     M|25-34|
|d616f6fe-0470-4e7...|     M|25-34|
|d61796ab-0ce6-489...|     M|25-34|
|d635f738-e3cf-424...|     M|25-34|
|d63c4fd5-95df-462...|     M|25-34|
|d6425577-1a7b-4a6...|     M|25-34|
|d64b2a8f-d310-458...|     M|25-34|
|d651f69f-baa1-4fa...|     M|25-34|
|d65f88e9-cddb-4f4...|     M|25-34|
|bf00c5e3-1637-41c...|     M|25-34|
|bf2388ab-9a95-4d3...|     M|25-34|
+--------------------+------+-----+
only showing top 20 rows



In [274]:
!hdfs dfs -ls /labs/

Found 9 items
drwxr-xr-x   - hdfs hdfs          0 2020-09-05 20:37 /labs/laba01
drwxr-xr-x   - hdfs hdfs          0 2020-09-11 18:26 /labs/laba02
drwxr-xr-x   - hdfs hdfs          0 2020-09-05 20:51 /labs/laba03
drwxr-xr-x   - hdfs hdfs          0 2020-09-30 20:12 /labs/laba04
drwxr-xr-x   - hdfs hdfs          0 2020-10-18 21:32 /labs/laba07
drwxr-xr-x   - hdfs hdfs          0 2020-10-18 21:33 /labs/laba08
drwxr-xr-x   - hdfs hdfs          0 2021-02-27 21:58 /labs/slaba02
drwxr-xr-x   - hdfs hdfs          0 2021-02-27 22:12 /labs/slaba03
drwxr-xr-x   - hdfs hdfs          0 2021-02-27 22:13 /labs/slaba04


### Работа с Kafka

In [312]:
# Kafka connection settings: broker address, the topic messages are read from,
# and the topic name intended for the output.
KAFKA_BOOTSTRAP_SERVER = 'spark-de-node-1.newprolab.com:6667'
INPUT_KAFKA_TOPIC = 'input_alexey.gurov'
OUTPUT_KAFKA_TOPIC = 'alexey.gurov'

In [313]:
# Options for the Kafka source: broker address, the input topic to subscribe
# to, and 'latest' as the starting offset.
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'latest'
}

In [314]:
# Streaming DataFrame over the input topic; nothing is read until a query
# built on top of it is started.
kafka_sdf = (
    spark.readStream
    .format('kafka')
    .options(**read_kafka_params)
    .load()
)

In [315]:
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [316]:
# Keep only the message payload, cast from binary to string.
parsed_sdf = kafka_sdf.select(f.col("value").cast("string"))

In [328]:
# Streaming predictions; note this rebinds `predictions`, which previously held
# the batch predictions on the hold-out set.
predictions = transform_data(model, parsed_sdf)

In [329]:
# Publish the model's predictions. Passing `kafka_sdf` here, as before, wrote
# the raw input messages to the output topic instead of the predictions.
sink = create_console_sink(predictions)

In [330]:
sq = sink.start()

In [331]:
sq.isActive

True

In [333]:
sq.status

{'message': 'Writing offsets to log',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [334]:
sq.lastProgress

{'id': '167a70b7-88c8-407a-91e3-cd33650fd6cb',
 'runId': '4bd977ea-deb2-4d0f-8720-8080d72a27fd',
 'name': None,
 'timestamp': '2021-03-24T19:36:26.949Z',
 'batchId': 14,
 'numInputRows': 182,
 'inputRowsPerSecond': 202.44716351501668,
 'processedRowsPerSecond': 251.0344827586207,
 'durationMs': {'addBatch': 90,
  'getBatch': 0,
  'getEndOffset': 0,
  'queryPlanning': 8,
  'setOffsetRange': 2,
  'triggerExecution': 725,
  'walCommit': 139},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[input_alexey.gurov]]',
   'startOffset': {'input_alexey.gurov': {'0': 21285}},
   'endOffset': {'input_alexey.gurov': {'0': 21467}},
   'numInputRows': 182,
   'inputRowsPerSecond': 202.44716351501668,
   'processedRowsPerSecond': 251.0344827586207}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider@d5395d3'}}

In [336]:
kill_all()

In [337]:
spark.stop()