In [3]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [4]:
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list,regexp_replace, get_json_object, to_json, struct
import pyspark.sql.types as t
import pyspark.sql.functions as f

from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType, TimestampType, LongType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression ,DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

In [3]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [4]:
!hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head -n2

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
cat: Unable to write to output stream.


In [5]:
schema = t.StructType(fields = [
    t.StructField("gender", t.StringType()),
    t.StructField("age", t.StringType()),
    t.StructField("uid", t.StringType()),
    t.StructField("user_json", t.StringType())
])

In [6]:
spark_df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header=True, schema=schema, sep='\t')

In [7]:
spark_df.show(7)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|
|     F|25-34|d5090ddf-5648-487...|{"visits": [{"url...|
|     F|25-34|d50bcef8-16ff-4e8...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 7 rows



In [8]:
VisitsType = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(), True),
            StructField("timestamp", LongType(), True) 
            ])
        ))
    ])

In [9]:
spark_df_flattened = spark_df \
    .withColumn("visits", from_json(col("user_json"), VisitsType)) \
    .withColumn("visit", explode("visits.visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit","user_json")

In [10]:
spark_df_flattened.show(3, vertical = True, truncate = False)

-RECORD 0--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | zebra-zoya.ru                        
-RECORD 1--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | news.yandex.ru                       
-RECORD 2--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | www.sotovik.ru                       
only showing top 3 rows



In [11]:
df_final = spark_df_flattened \
    .groupBy("gender", "age", "uid") \
    .agg(collect_list("host") \
    .alias("hosts"))

In [12]:
df_final.show(3, vertical = True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender | -                                                                                                                                                                                                                                                                                                                                                                                                              
 age    | -                                                                                                                                                                         

In [13]:
df_final.filter(df_final.gender == "F").show(7)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|               hosts|
+------+-----+--------------------+--------------------+
|     F|18-24|09b1ecd3-b2d2-4c1...|[tankionline.com,...|
|     F|18-24|15faf063-5e44-4b6...|[allods.mail.ru, ...|
|     F|18-24|560142d9-6c9c-439...|[http, http, http...|
|     F|18-24|6709f443-7ddd-423...|[muzofon.com, muz...|
|     F|18-24|67e9bd68-ef03-49c...|[tempfile.ru, tem...|
|     F|18-24|757ff5c2-ecdb-489...|[www.yves-rocher....|
|     F|18-24|c430a9d4-5f48-47c...|[www.eporner.com,...|
+------+-----+--------------------+--------------------+
only showing top 7 rows



In [14]:
df_final.select("gender").distinct().collect()

[Row(gender='F'), Row(gender='M'), Row(gender='-')]

In [15]:
df_final.select("age").distinct().collect()

[Row(age='>=55'),
 Row(age='45-54'),
 Row(age='-'),
 Row(age='35-44'),
 Row(age='25-34'),
 Row(age='18-24')]

In [16]:
df_final = df_final.filter( (df_final.gender != '-') & (df_final.age != '-') )

In [17]:
df_final.show(7)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|               hosts|
+------+-----+--------------------+--------------------+
|     F|18-24|09b1ecd3-b2d2-4c1...|[tankionline.com,...|
|     F|18-24|15faf063-5e44-4b6...|[allods.mail.ru, ...|
|     F|18-24|560142d9-6c9c-439...|[http, http, http...|
|     F|18-24|6709f443-7ddd-423...|[muzofon.com, muz...|
|     F|18-24|67e9bd68-ef03-49c...|[tempfile.ru, tem...|
|     F|18-24|757ff5c2-ecdb-489...|[www.yves-rocher....|
|     F|18-24|c430a9d4-5f48-47c...|[www.eporner.com,...|
+------+-----+--------------------+--------------------+
only showing top 7 rows



In [18]:
df_final.select("gender").distinct().collect()

[Row(gender='F'), Row(gender='M')]

In [19]:
df_final.select("age").distinct().collect()

[Row(age='>=55'),
 Row(age='45-54'),
 Row(age='35-44'),
 Row(age='25-34'),
 Row(age='18-24')]

In [20]:
hosts_len = len(df_final.select("hosts").distinct().collect())
hosts_len

34178

In [21]:
X_train, X_test = df_final.randomSplit([0.8, 0.2], seed=7575)

In [22]:
hashing_TF = HashingTF(inputCol="hosts", outputCol="rawFeatures", numFeatures=10000, binary=False)
indexer_age = StringIndexer(inputCol="age", outputCol="ageIndex").fit(df_final)
indexer_gender = StringIndexer(inputCol="gender", outputCol="genderIndex").fit(df_final)

In [23]:
rf_age = RandomForestClassifier(featuresCol = 'rawFeatures', labelCol = 'ageIndex', 
                           predictionCol="age_index_prediction", rawPredictionCol="age_index_raw_prediction",
                           probabilityCol = "age_probability")

In [24]:
rf_gender = RandomForestClassifier(featuresCol = 'rawFeatures', labelCol = 'genderIndex',
                              predictionCol="gender_index_prediction", rawPredictionCol="gender_index_raw_prediction",
                              probabilityCol = "gender_probability")

In [25]:
converter_age = IndexToString(inputCol="age_index_prediction", outputCol="PredictedAge", labels=indexer_age.labels)

In [26]:
converter_gender = IndexToString(inputCol="gender_index_prediction", 
                                 outputCol="PredictedGender", labels=indexer_gender.labels)

In [27]:
pipeline = Pipeline(stages=[hashing_TF, indexer_age, indexer_gender, rf_age, rf_gender, 
                            converter_age, converter_gender])

In [28]:
model = pipeline.fit(X_train)

In [29]:
predictions = model.transform(X_test)

In [30]:
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="ageIndex", predictionCol="age_index_prediction", metricName="accuracy")
accuracy_age = evaluator_age.evaluate(predictions)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="genderIndex", predictionCol="gender_index_prediction", metricName="accuracy")
accuracy_gender = evaluator_gender.evaluate(predictions)

print("Accuracy for age: " + str(accuracy_age))
print("Accuracy for gender: " + str(accuracy_gender))

Accuracy for age: 0.4328420467185762
Accuracy for gender: 0.5483870967741935


In [31]:
model.write().overwrite().save("/user/ivan.bychenkov/lab04_model")

In [32]:
!hdfs dfs -du /user/ivan.bychenkov/lab04_model

395    1185    /user/ivan.bychenkov/lab04_model/metadata
85066  255198  /user/ivan.bychenkov/lab04_model/stages


In [5]:
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_ivan.bychenkov' # тест данные
KAFKA_OUTPUT_TOPIC = 'ivan.bychenkov' # записываем в аутпут

In [6]:
kafka_read_df = (
    spark.read
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
    .cache()
)

In [7]:
kafka_read_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [8]:
event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])


visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)

In [9]:
clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)


clean_df.show(7)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
|bd7c5d7a-0def-41d...|[[http://www.24op...|
|bd7e54a2-0215-45c...|[[http://www.dns-...|
|bd7e9797-4cdb-46e...|[[http://news.met...|
|bd7e9ec7-fb67-45e...|[[http://dynamobr...|
+--------------------+--------------------+
only showing top 7 rows



In [10]:
inf_model = PipelineModel.load("/user/ivan.bychenkov/lab04_model")

In [39]:
!hdfs dfs -du /user/ivan.bychenkov/lab04_model

395    1185    /user/ivan.bychenkov/lab04_model/metadata
85066  255198  /user/ivan.bychenkov/lab04_model/stages


In [40]:
kafka_read_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [41]:
event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])


visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)

In [42]:
clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)


clean_df.show(3)

+---+------+
|uid|visits|
+---+------+
+---+------+



In [43]:
! hdfs dfs -ls

Found 2 items
drwxr-xr-x   - ivan.bychenkov ivan.bychenkov          0 2022-11-07 11:52 .sparkStaging
drwxr-xr-x   - ivan.bychenkov ivan.bychenkov          0 2022-11-07 11:35 lab04_model


In [11]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_ivan.bychenkov",
    "startingOffsets": "latest"
}

kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [13]:
clean_df = kafka_sdf \
    .select(col("value").cast("string").alias("value")) \
    .select(from_json(col("value"), event_schema).alias("data")) \
    .select("data.*") \
    .select("uid", from_json(col("visits"), visit_schema).alias("visits"))

In [14]:
inf_model = PipelineModel.load("/user/ivan.bychenkov/lab04_model")

In [16]:
prep_df = clean_df \
    .withColumn("visit", explode("visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(collect_list("host").alias("hosts"))

In [17]:
predictions_df = inf_model.transform(prep_df) \
    .select("uid", "PredictedGender", "PredictedAge") \
    .withColumnRenamed("PredictedAge","age") \
    .withColumnRenamed("PredictedGender","gender")\

In [18]:
kafka_out_df = predictions_df \
    .select(to_json(struct(*predictions_df.columns)).alias("value"))

In [19]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "ivan.bychenkov"
}
kafka_out_df.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("complete").start()

<pyspark.sql.streaming.StreamingQuery at 0x7f37da29ff60>

In [20]:
spark.stop()