In [1]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# which resources to request when the shell JVM is launched.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Put Spark's bundled Python sources and its py4j archive at the front of the
# import path. Inserting at index 0 in this order leaves the py4j zip first.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook. The Kafka SQL connector is
# requested through spark.jars.packages so the "kafka" format is available.
# NOTE(review): master "local[2]" and the 512m driver memory set here differ
# from the executor/driver settings in PYSPARK_SUBMIT_ARGS - confirm which
# configuration is intended.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("spark-course")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

In [3]:
# List the lab directory to confirm the input dataset is present.
!hdfs dfs -ls "/labs/slaba04/"

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [105]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, StringIndexer,IndexToString, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [35]:
path = "/labs/slaba04/gender_age_dataset.txt"

# Every column is read as a plain string; `user_json` holds the raw JSON
# document that is parsed in a later cell.
schema = StructType(fields=[
    StructField(column_name, StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])

# Tab-separated file whose first line is treated as a header.
df = spark.read.csv(path, schema=schema, sep='\t', header=True)
df.show(1)
print(df)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 1 row

DataFrame[gender: string, age: string, uid: string, user_json: string]


In [36]:
# Shape of the JSON document stored in `user_json`:
# {"visits": [{"url": ..., "timestamp": ...}, ...]}
visit_record = StructType([
    StructField("url", StringType(), True),
    StructField("timestamp", LongType(), True),
])
user_json_schema = StructType([
    StructField("visits", ArrayType(visit_record)),
])

# Parse the JSON, emit one row per visit, keep only the URL's host and drop
# the intermediate columns.
# NOTE: `df` is rebound here, so this cell cannot be re-run without first
# re-running the cell that loads the dataset (`user_json` is dropped below).
df = (
    df
    .withColumn("visits", f.from_json(f.col("user_json"), user_json_schema))
    .withColumn("visit", f.explode("visits.visits"))
    .withColumn("host", f.expr("parse_url(visit.url, 'HOST')"))
    .drop("user_json", "visits", "visit")
)

In [37]:
# Preview the exploded frame: one row per (user, visited host).
df.show()

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|                host|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|       zebra-zoya.ru|
|     F|18-24|d50192e5-c44e-4ae...|      news.yandex.ru|
|     F|18-24|d50192e5-c44e-4ae...|      www.sotovik.ru|
|     F|18-24|d50192e5-c44e-4ae...|      news.yandex.ru|
|     F|18-24|d50192e5-c44e-4ae...|      www.sotovik.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|              101.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472...|      sweetrading.ru|
|     M|25-34|d502331d-621e-472

In [38]:
# Inspect rows whose gender is neither 'F' nor 'M' (i.e. unlabelled users).
gender_col = f.col("gender")
df.filter((gender_col != 'F') & (gender_col != 'M')).show()

+------+---+--------------------+--------------------+
|gender|age|                 uid|                host|
+------+---+--------------------+--------------------+
|     -|  -|bd7a30e1-a25d-4cb...|     www.interfax.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  -|bd7a30e1-a25d-4cb...|amerikan-gruzovik.ru|
|     -|  

In [42]:
# Keep only rows that carry both labels ("-" marks a missing age / gender).
labelled_visits = df.filter((f.col("age") != "-") & (f.col("gender") != "-"))

# Collapse to one row per user with the list of visited hosts, discarding
# users whose list ends up empty.
df = (
    labelled_visits
    .groupBy("gender", "age", "uid")
    .agg(f.collect_list("host").alias('hosts'))
    .filter(f.size('hosts') != 0)
)

In [43]:
# Preview the per-user frame: labels, uid and the collected `hosts` array.
df.show()

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|               hosts|
+------+-----+--------------------+--------------------+
|     F|18-24|09b1ecd3-b2d2-4c1...|[tankionline.com,...|
|     F|18-24|15faf063-5e44-4b6...|[allods.mail.ru, ...|
|     F|18-24|560142d9-6c9c-439...|[http, http, http...|
|     F|18-24|6709f443-7ddd-423...|[muzofon.com, muz...|
|     F|18-24|67e9bd68-ef03-49c...|[tempfile.ru, tem...|
|     F|18-24|757ff5c2-ecdb-489...|[www.yves-rocher....|
|     F|18-24|c430a9d4-5f48-47c...|[www.eporner.com,...|
|     F|18-24|d1d59923-51d7-4a1...|[shop.lenovo.com,...|
|     F|18-24|fca5deb7-77f4-4c4...|[b.jaymedianetwor...|
|     F|25-34|0521da78-b729-4a0...|[mirknig.com, mir...|
|     F|25-34|09023c5f-d98f-47f...|[www.pc.img-studi...|
|     F|25-34|205ed1e2-1504-47c...|[ubr.ua, cm.g.dou...|
|     F|25-34|3c295020-fe8e-483...|[go.mail.ru, andr...|
|     F|25-34|4359c398-4a7b-4d4...|[join1.kluberudit...|
|     F|25-34|492f6cb0-d878-4b9

In [107]:
# Feature stage: turns the `hosts` array into a term-count vector stored in
# `vector_hosts`, which both classifiers below read as their features column.
CV = CountVectorizer(inputCol="hosts", outputCol="vector_hosts")

In [108]:
# Fit the label encoders on the full labelled frame so that the label ->
# index mapping is fixed before the train/test split.
age_indexer = StringIndexer(
    inputCol="age",
    outputCol="age_indexer",
).fit(df)
gender_indexer = StringIndexer(
    inputCol="gender",
    outputCol="gender_indexer",
).fit(df)

# Inverse mappings: turn the numeric predictions back into the original
# label strings using the labels learned by the fitted indexers.
age_reverse = IndexToString(
    inputCol="age_prediction",
    outputCol="age_pred",
    labels=age_indexer.labels,
)
gender_reverse = IndexToString(
    inputCol="gender_prediction",
    outputCol="gender_pred",
    labels=gender_indexer.labels,
)

In [109]:
# Settings shared by both classifiers: same feature vector and the same
# optimisation / regularisation hyper-parameters.
logreg_common = dict(
    featuresCol='vector_hosts',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    family="multinomial",
)

# One multinomial logistic regression per target; each writes to its own
# prediction / probability / raw-prediction columns so that both can live in
# the same pipeline without column clashes.
logreg_age = LogisticRegression(
    labelCol='age_indexer',
    probabilityCol='age_prob',
    predictionCol="age_prediction",
    rawPredictionCol='age_prediction_raw',
    **logreg_common
)
logreg_gender = LogisticRegression(
    labelCol='gender_indexer',
    probabilityCol='gender_prob',
    predictionCol="gender_prediction",
    rawPredictionCol='gender_prediction_raw',
    **logreg_common
)

In [110]:
# Stage order matters: features and label indices must exist before the
# classifiers run, and the reverse mappers need the classifiers' predictions.
pipeline_stages = [
    CV,
    age_indexer,
    gender_indexer,
    logreg_age,
    logreg_gender,
    age_reverse,
    gender_reverse,
]
pp = Pipeline(stages=pipeline_stages)

In [111]:
# 80/20 train/test split with a fixed seed.
X_train, X_test = df.randomSplit([0.8, 0.2], seed=42)

In [112]:
# Fit the whole pipeline (vectorizer + both classifiers) on the training split.
mdl = pp.fit(X_train)

In [113]:
# Score the held-out split; adds the prediction columns of both classifiers.
predictions = mdl.transform(X_test)

In [114]:
# Accuracy of each classifier on the held-out predictions.
# The evaluator objects and the resulting metrics are kept under separate
# names: previously the metrics were bound to `evaluator_*` while the prints
# referenced `accuracy_*`, which is undefined on a fresh kernel.
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="age_indexer", predictionCol="age_prediction", metricName="accuracy")
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="gender_indexer", predictionCol="gender_prediction", metricName="accuracy")

accuracy_age = evaluator_age.evaluate(predictions)
accuracy_gender = evaluator_gender.evaluate(predictions)

print(accuracy_age)
print(accuracy_gender)

0.32029743872211514
0.577251445882677


In [103]:
# Persist the fitted pipeline model, replacing anything already saved at this
# path.
# NOTE(review): this cell's recorded execution count is lower than that of the
# cell that fits `mdl`, so the saved artefact may come from an earlier fit -
# re-run after fitting to be sure.
mdl.write().overwrite().save("/user/maxim.borchashvili/mdl_lab4")

In [177]:
# Options for a one-off (batch) read of the whole input topic, starting from
# the earliest available offset.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_maxim.borchashvili",
    "startingOffsets": "earliest",
    "failOnDataLoss": "False",
}

kafka_reader = spark.read.format("kafka").options(**read_kafka_params)
kafka_sdf = kafka_reader.load()

In [178]:
# Preview the raw Kafka records (binary `value` plus topic/offset metadata).
kafka_sdf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     0|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     1|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     2|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     3|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     4|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     5|2022-11-06 19:26:...|            0|
|null|[7B 22 75 69 64 2...|input_maxim.borch...|        0|     6|2022-11-06 19:26:...|     

In [179]:
# The type classes are referenced by their bare names, which are in scope via
# `from pyspark.sql.types import *`; the `t.` alias used before is never
# imported in this notebook and raised NameError on a fresh kernel.

# Outer message: `visits` is first read as a string and parsed separately.
event_schema = StructType([
    StructField('uid', StringType(), True),
    StructField('visits', StringType(), True),
])

# Inner payload: an array of {url, timestamp} records.
visits_schema = ArrayType(StructType([
    StructField('url', StringType(), True),
    StructField('timestamp', LongType(), True),
]))

# Step 1: decode the binary Kafka value and parse the outer JSON document.
kafka_sdf = kafka_sdf.select(
    f.from_json(f.col('value').cast('string'), event_schema).alias('event')
)

# Step 2: parse the `visits` string into an array of visit structs.
kafka_sdf = kafka_sdf.select(
    'event.uid',
    f.from_json(f.col('event.visits'), visits_schema).alias('visits'),
)

# Step 3: one row per visit -> host of the URL -> list of hosts per user,
# i.e. the same `hosts` column the pipeline was trained on.
kafka_sdf = kafka_sdf.withColumn("visit", f.explode("visits")) \
    .withColumn("host", f.expr("parse_url(visit.url, 'HOST')")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(f.collect_list("host").alias("hosts"))

kafka_sdf.show(3)

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|0108d217-e476-493...|[kvartblog.ru, kv...|
|0192cc54-559c-4c8...|[metanol.lv, meta...|
|019acd5e-be9a-4cd...|[www.russianfood....|
+--------------------+--------------------+
only showing top 3 rows



In [180]:
# Score the Kafka users and expose the decoded label strings under the plain
# column names `age` and `gender`.
scored = mdl.transform(kafka_sdf)
preds = scored.select(
    "uid",
    f.col("age_pred").alias("age"),
    f.col("gender_pred").alias("gender"),
)

In [181]:
# Spot-check a few predictions.
preds.show(3)

+--------------------+-----+------+
|                 uid|  age|gender|
+--------------------+-----+------+
|0108d217-e476-493...|25-34|     M|
|0192cc54-559c-4c8...|25-34|     M|
|019acd5e-be9a-4cd...|25-34|     M|
+--------------------+-----+------+
only showing top 3 rows



In [182]:
# Number of users that received a prediction.
preds.count()

5000

In [183]:
# Streaming reader over the output topic, starting from the earliest offset.
# NOTE(review): `kafka_stream` is not referenced by any later cell visible
# here - confirm whether it is still needed.
kafka_stream_options = {
    'kafka.bootstrap.servers': 'spark-master-1.newprolab.com:6667',
    'subscribe': "maxim.borchashvili",
    'startingOffsets': 'earliest',
    'failOnDataLoss': 'False',
}
kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .options(**kafka_stream_options)
    .load()
)

In [184]:
# Serialise every prediction row into a single JSON string in a column named
# `value`, which is the column the Kafka sink publishes as the message payload.
# The result gets its own name instead of rebinding `preds`, so re-running
# this cell does not JSON-encode an already serialised `value` column again.
preds_json = preds.select(f.to_json(f.struct(*preds.columns)).alias('value'))

# Despite its name this is a batch DataFrameWriter (`.write`, not
# `.writeStream`). `checkpointLocation` is an option of streaming queries and
# plays no part in a batch write, so it is not set here.
kafka_write_stream = (
    preds_json
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", 'spark-master-1.newprolab.com:6667')
    .option("topic", "maxim.borchashvili")
)

In [185]:
# Publish the predictions. For the Kafka sink the destination is given by the
# `topic` option set on the writer, so no filesystem path is passed to save().
kafka_write_stream.save()