In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [66]:
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list,regexp_replace, get_json_object, to_json, struct
import pyspark.sql.types as t
import pyspark.sql.functions as f

from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType, TimestampType, LongType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression ,DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

In [3]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [6]:
spark_df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header=True, inferSchema=True, sep='\t')

In [7]:
spark_df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [8]:
VisitsType = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(), True),
            StructField("timestamp", LongType(), True) 
            ])
        ))
    ])

In [9]:
spark_df_flattened = spark_df \
    .withColumn("visits", from_json(col("user_json"), VisitsType)) \
    .withColumn("visit", explode("visits.visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit","user_json")

In [10]:
df_final = spark_df_flattened \
    .groupBy("gender", "age", "uid") \
    .agg(collect_list("host") \
    .alias("hosts"))

In [21]:
X_train, X_test = df_final.randomSplit([0.8, 0.2], seed=7575)

In [13]:
hashing_TF = HashingTF(inputCol="hosts", outputCol="rawFeatures", numFeatures=10000, binary=False)
indexer_age = StringIndexer(inputCol="age", outputCol="ageIndex").fit(df_final)
indexer_gender = StringIndexer(inputCol="gender", outputCol="genderIndex").fit(df_final)

In [14]:
rf_age = RandomForestClassifier(featuresCol = 'rawFeatures', labelCol = 'ageIndex', 
                           predictionCol="age_index_prediction", rawPredictionCol="age_index_raw_prediction",
                           probabilityCol = "age_probability")

In [15]:
rf_gender = RandomForestClassifier(featuresCol = 'rawFeatures', labelCol = 'genderIndex',
                              predictionCol="gender_index_prediction", rawPredictionCol="gender_index_raw_prediction",
                              probabilityCol = "gender_probability")

In [16]:
converter_age = IndexToString(inputCol="age_index_prediction", outputCol="PredictedAge", labels=indexer_age.labels)

In [17]:
converter_gender = IndexToString(inputCol="gender_index_prediction", outputCol="PredictedGender", labels=indexer_gender.labels)

In [18]:
pipeline = Pipeline(stages=[hashing_TF, indexer_age, indexer_gender, rf_age, rf_gender, 
                            converter_age, converter_gender])

In [19]:
model = pipeline.fit(X_train)

In [22]:
predictions = model.transform(X_test)

In [23]:
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="ageIndex", predictionCol="age_index_prediction", metricName="accuracy")
accuracy_age = evaluator_age.evaluate(predictions)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="genderIndex", predictionCol="gender_index_prediction", metricName="accuracy")
accuracy_gender = evaluator_gender.evaluate(predictions)

print("Accuracy for age: " + str(accuracy_age))
print("Accuracy for gender: " + str(accuracy_gender))

Accuracy for age: 0.37657569452943335
Accuracy for gender: 0.48941378044303024


In [24]:
model.write().overwrite().save("/user/dmitriy.chausov/lab04_model")

In [25]:
!hdfs dfs -du /user/dmitriy.chausov/lab04_model

395    1185    /user/dmitriy.chausov/lab04_model/metadata
89464  268392  /user/dmitriy.chausov/lab04_model/stages


# 2. 

In [61]:
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_dmitriy.chausov'
KAFKA_OUTPUT_TOPIC = 'dmitriy.chausov'

In [110]:
# чтение в статическом режиме

kafka_read_df = (
    spark.read
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
    .cache()
)

In [111]:
# Парсинг бинарного файла из кафки

event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])


visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)


clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)



clean_df.show(3)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
+--------------------+--------------------+
only showing top 3 rows



In [114]:
inf_model = PipelineModel.load("/user/dmitriy.chausov/lab04_model")

In [115]:
prep_df = clean_df \
    .withColumn("visit", explode("visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(collect_list("host").alias("hosts"))

predictions_df = inf_model.transform(prep_df) \
    .select("uid", "PredictedGender", "PredictedAge") \
    .withColumnRenamed("PredictedAge","age") \
    .withColumnRenamed("PredictedGender","gender")

In [116]:
# извлечение url

# применение модели, сохранение предсказаний в predictions_df  

# Оборачивание предсказания обратно в json

kafka_out_df = (
    predictions_df.select(f.to_json(f.struct(*predictions_df.columns)).alias('value')).limit(5)
)

# Запись в выходной топикa

(
    kafka_out_df
    .write
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('topic', KAFKA_OUTPUT_TOPIC)
    .save()
)

In [117]:
kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)

In [120]:
# предсказание и запись

kafka_write_stream = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
    .write
    .format("kafka")
#     .outputMode("append")
    .option("checkpointLocation", "checkpoints/checkpoints_lab04")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVER)
    .option("topic", KAFKA_OUTPUT_TOPIC)
)

In [97]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_dmitriy.chausov",
    "startingOffsets": "latest"
}

kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [98]:
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [99]:
eventType  = StructType([
    StructField("uid", StringType(), True),
    StructField("visits", StringType(), True),
    ])

In [100]:
visitType = ArrayType(
    StructType([
        StructField("url", StringType(), True),
        StructField("timestamp", LongType(), True) 
    ])
)

In [101]:
clean_df = kafka_sdf \
    .select(col("value").cast("string").alias("value")) \
    .select(from_json(col("value"), eventType).alias("data")) \
    .select("data.*") \
    .select("uid", from_json(col("visits"), visitType).alias("visits"))

In [102]:
prep_df = clean_df \
    .withColumn("visit", explode("visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(collect_list("host").alias("hosts"))

In [103]:
inf_model = PipelineModel.load("/user/dmitriy.chausov/lab04_model")

In [104]:
predictions_df = inf_model.transform(prep_df) \
    .select("uid", "PredictedGender", "PredictedAge") \
    .withColumnRenamed("PredictedAge","age") \
    .withColumnRenamed("PredictedGender","gender")

In [105]:
kafka_out_df = predictions_df \
    .select(to_json(struct(*predictions_df.columns)).alias("value"))

In [109]:
    kafka_out_df.write\
    .format('kafka')\
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)\
    .option('topic', KAFKA_OUTPUT_TOPIC)\
    .save()

AnalysisException: "'write' can not be called on streaming Dataset/DataFrame;"

In [107]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "dmitriy.chausov"
}

kafka_out_df.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "checkpoints/checkpoints_lab04")\
    .outputMode("append").start()

# kafka_out_df.write \
#     .format("kafka") \
#     .options(**write_kafka_params)\
#     .save()

AnalysisException: 'Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;\nProject [structstojson(named_struct(uid, uid#1893, gender, gender#2073, age, age#2069), Some(Europe/Moscow)) AS value#2077]\n+- Project [uid#1893, PredictedGender#2054 AS gender#2073, age#2069]\n   +- Project [uid#1893, PredictedGender#2054, PredictedAge#2043 AS age#2069]\n      +- Project [uid#1893, PredictedGender#2054, PredictedAge#2043]\n         +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, age_index_prediction#1994, gender_index_raw_prediction#2007, gender_probability#2015, gender_index_prediction#2024, PredictedAge#2043, if (isnull(cast(gender_index_prediction#2024 as double))) null else UDF(cast(gender_index_prediction#2024 as double)) AS PredictedGender#2054]\n            +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, age_index_prediction#1994, gender_index_raw_prediction#2007, gender_probability#2015, gender_index_prediction#2024, if (isnull(cast(age_index_prediction#1994 as double))) null else UDF(cast(age_index_prediction#1994 as double)) AS PredictedAge#2043]\n               +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, age_index_prediction#1994, gender_index_raw_prediction#2007, gender_probability#2015, UDF(gender_index_raw_prediction#2007) AS gender_index_prediction#2024]\n                  +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, age_index_prediction#1994, gender_index_raw_prediction#2007, UDF(gender_index_raw_prediction#2007) AS gender_probability#2015]\n                     +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, age_index_prediction#1994, UDF(rawFeatures#1973) AS gender_index_raw_prediction#2007]\n                        +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, age_probability#1988, UDF(age_index_raw_prediction#1983) AS age_index_prediction#1994]\n                           +- Project [uid#1893, hosts#1918, rawFeatures#1973, age_index_raw_prediction#1983, UDF(age_index_raw_prediction#1983) AS age_probability#1988]\n                              +- Project [uid#1893, hosts#1918, rawFeatures#1973, UDF(rawFeatures#1973) AS age_index_raw_prediction#1983]\n                                 +- Project [uid#1893, hosts#1918, UDF(hosts#1918) AS rawFeatures#1973]\n                                    +- Aggregate [uid#1893], [uid#1893, collect_list(host#1907, 0, 0) AS hosts#1918]\n                                       +- Project [uid#1893, host#1907]\n                                          +- Project [uid#1893, visits#1897, visit#1902, parse_url(visit#1902.url, HOST) AS host#1907]\n                                             +- Project [uid#1893, visits#1897, visit#1902]\n                                                +- Generate explode(visits#1897), false, [visit#1902]\n                                                   +- Project [uid#1893, jsontostructs(ArrayType(StructType(StructField(url,StringType,true), StructField(timestamp,LongType,true)),true), visits#1894, Some(Europe/Moscow)) AS visits#1897]\n                                                      +- Project [data#1891.uid AS uid#1893, data#1891.visits AS visits#1894]\n                                                         +- Project [jsontostructs(StructField(uid,StringType,true), StructField(visits,StringType,true), value#1889, Some(Europe/Moscow)) AS data#1891]\n                                                            +- Project [cast(value#1876 as string) AS value#1889]\n                                                               +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@4a3d0651, kafka, Map(startingOffsets -> latest, subscribe -> input_dmitriy.chausov, kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667), [key#1875, value#1876, topic#1877, partition#1878, offset#1879L, timestamp#1880, timestampType#1881], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@18f9179c,kafka,List(),None,List(),None,Map(startingOffsets -> latest, subscribe -> input_dmitriy.chausov, kafka.bootstrap.servers -> spark-master-1.newprolab.com:6667),None), kafka, [key#1868, value#1869, topic#1870, partition#1871, offset#1872L, timestamp#1873, timestampType#1874]\n'