In [25]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [26]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master("local[2]") \
                    .appName("spark-course") \
                    .config("spark.driver.memory", "512m") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
                    .getOrCreate()

spark

In [27]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, StringIndexer,IndexToString, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
!hdfs dfs -ls "/labs/slaba04/"

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [5]:
schema = StructType(fields=[
    StructField('gender', StringType()),
    StructField('age', StringType()),
    StructField('uid', StringType()),
    StructField('user_json', StringType()),
])

df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header=True, schema=schema, sep='\t')
df.show()

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|
|     F|25-34|d5090ddf-5648-487...|{"visits": [{"url...|
|     F|25-34|d50bcef8-16ff-4e8...|{"visits": [{"url...|
|     F|18-24|d50e23dc-0cbd-488...|{"visits": [{"url...|
|     F|45-54|d50fdabb-4208-441...|{"visits": [{"url...|
|     F|18-24|d511b480-23a6-482...|{"visits": [{"url...|
|     F|25-34|d51294ed-1b95-4e4...|{"visits": [{"url...|
|     F|25-34|d512e295-6a85-491...|{"visits": [{"url...|
|     M|25-34|d51441ea-9dda-454...|{"visits": [{"url...|
|     F|25-34|d51822d4-105b-457...|{"visits": [{"url...|
|     F|35-44|d5183db2-c8e5-413

In [6]:
df = df.withColumn("visits", F.from_json(F.col("user_json"), StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(), True),
            StructField("timestamp", LongType(), True) 
            ])
        ))
    ]))) \
    .withColumn("visit", F.explode("visits.visits")) \
    .withColumn("host", F.expr("parse_url(visit.url, 'HOST')")) \
    .drop("user_json", "visits", "visit")

In [8]:
df = df[(df.age != "-") & (df.gender != "-")]
df = df.groupBy("gender", "age", "uid").agg(F.collect_list("host").alias('hosts')).filter(F.size('hosts') != 0)
df.show()

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|               hosts|
+------+-----+--------------------+--------------------+
|     F|18-24|09b1ecd3-b2d2-4c1...|[tankionline.com,...|
|     F|18-24|15faf063-5e44-4b6...|[allods.mail.ru, ...|
|     F|18-24|560142d9-6c9c-439...|[http, http, http...|
|     F|18-24|6709f443-7ddd-423...|[muzofon.com, muz...|
|     F|18-24|67e9bd68-ef03-49c...|[tempfile.ru, tem...|
|     F|18-24|757ff5c2-ecdb-489...|[www.yves-rocher....|
|     F|18-24|c430a9d4-5f48-47c...|[www.eporner.com,...|
|     F|18-24|d1d59923-51d7-4a1...|[shop.lenovo.com,...|
|     F|18-24|fca5deb7-77f4-4c4...|[b.jaymedianetwor...|
|     F|25-34|0521da78-b729-4a0...|[mirknig.com, mir...|
|     F|25-34|09023c5f-d98f-47f...|[www.pc.img-studi...|
|     F|25-34|205ed1e2-1504-47c...|[ubr.ua, cm.g.dou...|
|     F|25-34|3c295020-fe8e-483...|[go.mail.ru, andr...|
|     F|25-34|4359c398-4a7b-4d4...|[join1.kluberudit...|
|     F|25-34|492f6cb0-d878-4b9

In [9]:
CV = CountVectorizer(inputCol = "hosts", outputCol = "vector_hosts")

In [10]:
age_indexer = StringIndexer(inputCol = "age", outputCol = "age_indexer").fit(df)
gender_indexer = StringIndexer(inputCol = "gender", outputCol = "gender_indexer").fit(df)
age_reverse = IndexToString(inputCol = "age_prediction", outputCol = "age_pred", labels = age_indexer.labels)
gender_reverse = IndexToString(inputCol = "gender_prediction", outputCol = "gender_pred", labels = gender_indexer.labels)

In [11]:
logreg_age = LogisticRegression(featuresCol = 'vector_hosts', labelCol = 'age_indexer', probabilityCol = 'age_prob',
                                predictionCol = "age_prediction", rawPredictionCol = 'age_prediction_raw',
                                maxIter = 10, regParam = 0.3, elasticNetParam = 0.8, family = "multinomial")
logreg_gender = LogisticRegression(featuresCol = 'vector_hosts', labelCol = 'gender_indexer', probabilityCol = 'gender_prob',
                                   predictionCol = "gender_prediction", rawPredictionCol = 'gender_prediction_raw',
                                   maxIter = 10, regParam = 0.3, elasticNetParam = 0.8, family = "multinomial")

In [12]:
pp = Pipeline(stages = [CV, age_indexer, gender_indexer, logreg_age, logreg_gender, age_reverse, gender_reverse])

In [13]:
X_train, X_test = df.randomSplit([0.8, 0.2], seed = 42)

In [14]:
mdl = pp.fit(X_train)

In [15]:
predictions = mdl.transform(X_test)

In [16]:
evaluator_age = MulticlassClassificationEvaluator(
    labelCol = "age_indexer", predictionCol = "age_prediction", metricName = "accuracy").evaluate(predictions)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol = "gender_indexer", predictionCol = "gender_prediction", metricName = "accuracy").evaluate(predictions)

In [17]:
mdl.write().overwrite().save("/user/pavel.kolodkin/mdl_lab4")

In [None]:
# запускаем проверка и ждем пару минут пока формируются данные для теста

In [28]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_pavel.kolodkin",
    "startingOffsets": "earliest",
    "failOnDataLoss":"False"
}
kafka_sdf = spark.read.format("kafka").options(**read_kafka_params).load()
kafka_sdf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     0|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     1|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     2|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     3|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     4|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     5|2022-11-06 14:05:...|            0|
|null|[7B 22 75 69 64 2...|input_pavel.kolodkin|        0|     6|2022-11-06 14:05:...|     

In [29]:
kafka_sdf = kafka_sdf.select(F.from_json(F.col('value').cast('string'), 
                                                        StructType([
                                                        StructField('uid', StringType(), True),
                                                        StructField('visits', StringType(), True),]
                                                        )).alias('event'))


kafka_sdf = kafka_sdf.select('event.uid', F.from_json(F.col('event.visits'),
                                                      ArrayType(StructType([
                                                          StructField('url', StringType(), True),
                                                          StructField('timestamp', LongType(), True)])
                                                            )).alias('visits'))

kafka_sdf = kafka_sdf.withColumn("visit", F.explode("visits")) \
    .withColumn("host", F.expr("parse_url(visit.url, 'HOST')")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(F.collect_list("host").alias("hosts"))

kafka_sdf.show()

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|0108d217-e476-493...|[kvartblog.ru, kv...|
|0192cc54-559c-4c8...|[metanol.lv, meta...|
|019acd5e-be9a-4cd...|[www.russianfood....|
|02e7f830-da57-4d5...|[maxpark.com, new...|
|1d160259-73d8-451...|[ua.sinoptik.ua, ...|
|1e14a504-276e-448...|[www.rusfishing.r...|
|1eb313db-34ff-4bf...|[zamok.gidm.ru, z...|
|1eff6e4f-3b8a-447...|[mirtesen.ru, mir...|
|3e75c432-cb78-488...|[www.gotovim.ru, ...|
|47565df3-13e3-460...|[psl.by, psl.by, ...|
|4766a8ab-e9b6-4e0...|[www.dns-shop.ru,...|
|50637c81-fffa-4ee...|[www.adme.ru, www...|
|5a023519-f28e-4eb...|[www.forum.zoo.kz...|
|5a781caa-6131-4d9...|[www.evino.ru, ww...|
|5ab3c7b8-c550-493...|[www.dns-shop.ru,...|
|7302e78a-ec04-47e...|[www.proftester.r...|
|73081df3-8f41-435...|[www.rowenta.ru, ...|
|89fe85cb-ea4c-4be...|[loveplanet.ru, l...|
|8affa6ce-24c7-4ed...|[piter-mania.ru, ...|
|b2e4450d-c582-441...|[www.yapla

In [30]:
preds = mdl.transform(kafka_sdf) \
    .select("uid", "age_pred", "gender_pred") \
    .withColumnRenamed("age_pred","age") \
    .withColumnRenamed("gender_pred","gender")
preds.show()

+--------------------+-----+------+
|                 uid|  age|gender|
+--------------------+-----+------+
|0108d217-e476-493...|25-34|     M|
|0192cc54-559c-4c8...|25-34|     M|
|019acd5e-be9a-4cd...|25-34|     M|
|02e7f830-da57-4d5...|25-34|     M|
|1d160259-73d8-451...|25-34|     M|
|1e14a504-276e-448...|25-34|     M|
|1eb313db-34ff-4bf...|25-34|     M|
|1eff6e4f-3b8a-447...|25-34|     M|
|3e75c432-cb78-488...|25-34|     M|
|47565df3-13e3-460...|25-34|     M|
|4766a8ab-e9b6-4e0...|25-34|     M|
|50637c81-fffa-4ee...|25-34|     M|
|5a023519-f28e-4eb...|25-34|     M|
|5a781caa-6131-4d9...|25-34|     M|
|5ab3c7b8-c550-493...|25-34|     M|
|7302e78a-ec04-47e...|25-34|     M|
|73081df3-8f41-435...|25-34|     M|
|89fe85cb-ea4c-4be...|25-34|     M|
|8affa6ce-24c7-4ed...|25-34|     M|
|b2e4450d-c582-441...|25-34|     M|
+--------------------+-----+------+
only showing top 20 rows



In [31]:
preds.count()

5000

In [32]:
kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'spark-master-1.newprolab.com:6667')
    .option('subscribe', "maxim.borchashvili")
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)

In [33]:
preds = preds.select(F.to_json(F.struct(*preds.columns)).alias('value'))

kafka_write_stream = (
    preds
    .write
    .format("kafka")
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .option("kafka.bootstrap.servers", 'spark-master-1.newprolab.com:6667')
    .option("topic", "pavel.kolodkin")
)

In [34]:
kafka_write_stream.save("/user/pavel.kolodkin/test")

In [36]:
!hdfs dfs -ls "/user/pavel.kolodkin/"

Found 6 items
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-06 20:07 /user/pavel.kolodkin/.sparkStaging
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-03 09:48 /user/pavel.kolodkin/lab03.csv
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-03 07:44 /user/pavel.kolodkin/lab03111.csv
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-06 19:20 /user/pavel.kolodkin/lab04_model
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-08 08:51 /user/pavel.kolodkin/mdl_lab4
drwxr-xr-x   - pavel.kolodkin pavel.kolodkin          0 2022-11-03 07:54 /user/pavel.kolodkin/user


In [27]:
spark.stop()