In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use, and declare
# the submit arguments, before anything from pyspark is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources to the import path; the py4j archive
# ends up first, followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook, with the Kafka SQL
# connector requested via spark.jars.packages.
# NOTE(review): PYSPARK_SUBMIT_ARGS (set in the first cell) also specifies
# driver memory and executor counts, while this builder asks for local[2] and
# a 512m driver — confirm which settings are actually intended to win.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("ML app")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

In [3]:
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list,regexp_replace, get_json_object, to_json, struct
import pyspark.sql.types as t
import pyspark.sql.functions as f

from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType, TimestampType, LongType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression ,DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

In [7]:
path = "/labs/slaba04/gender_age_dataset.txt"

# Every column is read as a plain string; user_json is parsed with from_json
# in a later cell.
schema = StructType([
    StructField(column_name, StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])

# Tab-separated file with a header row.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("sep", '\t')
    .csv(path)
)
df.show(n=1, truncate=False, vertical=True)
print(df)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender    | F                                                                                                                                                                                                                                                                             

In [8]:
# Schema used to parse user_json: an object with a single "visits" array whose
# elements carry a "url" string and a "timestamp" long, all nullable.
VisitsType = StructType().add(
    "visits",
    ArrayType(
        StructType()
        .add("url", StringType(), True)
        .add("timestamp", LongType(), True)
    ),
)

In [9]:
# One output row per visited URL: parse the JSON, explode the visits array,
# keep only the URL's host, then drop the intermediate columns.
# (withColumn already names the result, so no extra alias is needed.)
df_flattened = (
    df
    .withColumn("visits", from_json(col("user_json"), VisitsType))
    .withColumn("visit", explode("visits.visits"))
    .withColumn("host", expr("parse_url(visit.url, 'HOST')"))
    .drop("visits", "visit", "user_json")
)
df_flattened.show(n=2, truncate=False, vertical=True)

-RECORD 0--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | zebra-zoya.ru                        
-RECORD 1--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | news.yandex.ru                       
only showing top 2 rows



In [10]:
# Peek at rows where no host could be extracted from the URL.
df_flattened.filter(col("host").isNull()).show(n=2, truncate=False, vertical=True)

-RECORD 0--------------------------------------
 gender | M                                    
 age    | 25-34                                
 uid    | d502331d-621e-4721-ada2-5d30b2c3801f 
 host   | null                                 
-RECORD 1--------------------------------------
 gender | M                                    
 age    | >=55                                 
 uid    | d503c3b2-a0c2-4f47-bb27-065058c73008 
 host   | null                                 
only showing top 2 rows



In [11]:
# Collapse back to one row per user with the list of visited hosts, keeping
# only users whose age and gender are not the "-" placeholder and who have at
# least one host.
df_final = (
    df_flattened
    .groupBy("gender", "age", "uid")
    .agg(collect_list("host").alias("hosts"))
    .where((col("age") != "-") & (col("gender") != "-"))
    .filter(f.size('hosts') != 0)
)
df_final.show(n=2, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender | F                                                                                                                                                                                                                                                                                             
 age    | 18-24                                                                                                                                                                                                                                                                                         
 uid    | 09b1ecd3-b2d2-4c1b-857a-025c0509d9ec                                                               

In [12]:
# 80/20 train/test split with a fixed seed (the same split is recomputed in a
# later cell before training).
X_train, X_test = df_final.randomSplit(weights=[0.8, 0.2], seed=123)

In [13]:
# Term-frequency hashing of the host list into a 10000-dimensional vector.
hashing_TF = HashingTF(
    inputCol="hosts",
    outputCol="rawFeatures",
    numFeatures=10000,
    binary=False,
)

# Label indexers are fitted on the full labelled dataset (df_final), so the
# resulting models already know every age/gender label value.
indexer_age = (
    StringIndexer(inputCol="age", outputCol="ageIndex")
    .fit(df_final)
)
indexer_gender = (
    StringIndexer(inputCol="gender", outputCol="genderIndex")
    .fit(df_final)
)

In [14]:
# Random forest predicting the indexed age bucket from the hashed host
# features. Output column names are prefixed with "age_" so they do not clash
# with the gender classifier that runs in the same pipeline.
# The seed is pinned so that repeated runs train the same forest; it matches
# the seed used for the train/test split.
rf_age = RandomForestClassifier(
    featuresCol='rawFeatures',
    labelCol='ageIndex',
    predictionCol="age_index_prediction",
    rawPredictionCol="age_index_raw_prediction",
    probabilityCol="age_probability",
    seed=123,
)

In [15]:
# Random forest predicting the indexed gender from the hashed host features.
# Output column names are prefixed with "gender_" so they do not clash with
# the age classifier that runs in the same pipeline.
# The seed is pinned so that repeated runs train the same forest; it matches
# the seed used for the train/test split.
rf_gender = RandomForestClassifier(
    featuresCol='rawFeatures',
    labelCol='genderIndex',
    predictionCol="gender_index_prediction",
    rawPredictionCol="gender_index_raw_prediction",
    probabilityCol="gender_probability",
    seed=123,
)

In [16]:
# Map the predicted age index back to the original age label, using the label
# order learned by indexer_age.
converter_age = IndexToString(
    inputCol="age_index_prediction",
    outputCol="PredictedAge",
    labels=indexer_age.labels,
)

In [17]:
# Map the predicted gender index back to the original gender label, using the
# label order learned by indexer_gender.
converter_gender = IndexToString(
    inputCol="gender_index_prediction",
    outputCol="PredictedGender",
    labels=indexer_gender.labels,
)

In [18]:
# Full pipeline: featurise hosts, index both labels, train both classifiers,
# then translate the predicted indices back to label strings.
pipeline = Pipeline(
    stages=[
        hashing_TF,
        indexer_age,
        indexer_gender,
        rf_age,
        rf_gender,
        converter_age,
        converter_gender,
    ]
)

In [21]:
# Diagnostic only: partition count of a repartitioned copy. The result of
# repartition is not assigned, so df_final itself is left as it was.
df_final.repartition(numPartitions=15).rdd.getNumPartitions()

15

In [22]:
# 80/20 train/test split with a fixed seed.
X_train, X_test = df_final.randomSplit([0.8, 0.2], seed=123)

# DataFrames are immutable: repartition() returns a new DataFrame, so its
# result must be assigned, otherwise the repartitioning is silently lost.
# Cache the repartitioned training set because the pipeline fit reads it
# several times.
X_train = X_train.repartition(15).cache()
X_train

DataFrame[gender: string, age: string, uid: string, hosts: array<string>]

In [23]:
# Train every estimator stage of the pipeline on the training split.
model = pipeline.fit(dataset=X_train)

In [24]:
# Score the held-out split with the fitted pipeline.
predictions = model.transform(dataset=X_test)

In [25]:
# Accuracy evaluators compare each indexed label with its predicted index.
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="ageIndex",
    predictionCol="age_index_prediction",
    metricName="accuracy",
)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="genderIndex",
    predictionCol="gender_index_prediction",
    metricName="accuracy",
)

# Both metrics are computed on the same held-out predictions.
accuracy_age = evaluator_age.evaluate(predictions)
accuracy_gender = evaluator_gender.evaluate(predictions)

print("Accuracy for age: " + str(accuracy_age))
print("Accuracy for gender: " + str(accuracy_gender))

Accuracy for age: 0.42804997894145724
Accuracy for gender: 0.5389583040853573


In [26]:
# Persist the fitted pipeline, replacing any model already stored at this path.
(
    model
    .write()
    .overwrite()
    .save("/user/denis.gorbatov/lab04_model")
)

In [28]:
# Batch (non-streaming) read of the input topic from the earliest offset;
# cached because several later cells reuse it.
kafka_read_df = (
    spark.read
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': 'spark-node-1.newprolab.com:6667',
        'subscribe': 'input_denis.gorbatov',
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
    .cache()
)

In [29]:
# Preview the raw Kafka records (value is still binary at this point).
kafka_read_df.show(n=3)

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_denis.gorbatov|        0|     0|2022-11-06 20:31:...|            0|
|null|[7B 22 75 69 64 2...|input_denis.gorbatov|        0|     1|2022-11-06 20:31:...|            0|
|null|[7B 22 75 69 64 2...|input_denis.gorbatov|        0|     2|2022-11-06 20:31:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [30]:
# Outer Kafka message: a uid plus a "visits" field that is itself a JSON
# string, hence the second parsing pass below.
event_schema = (
    t.StructType()
    .add('uid', t.StringType(), True)
    .add('visits', t.StringType(), True)
)

# Inner payload: array of {url, timestamp} records.
visit_schema = t.ArrayType(
    t.StructType()
    .add('url', t.StringType(), True)
    .add('timestamp', t.LongType(), True)
)

# Decode the binary Kafka value as a string, parse the outer event, then parse
# the embedded visits JSON into a typed array.
clean_df = (
    kafka_read_df
    .select(
        f.from_json(f.col('value').cast('string'), event_schema).alias('event')
    )
    .select(
        f.col('event.uid'),
        f.from_json(f.col('event.visits'), visit_schema).alias('visits'),
    )
)

clean_df.show(n=3)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
+--------------------+--------------------+
only showing top 3 rows



In [32]:
# Reload the persisted pipeline from HDFS for inference.
inf_model = PipelineModel.read().load("/user/denis.gorbatov/lab04_model")

In [33]:
# Same feature preparation as for training: explode visits, extract the host
# of every URL and gather the hosts back into one list per uid.
prep_df = (
    clean_df
    .withColumn("visit", explode("visits"))
    .withColumn("host", expr("parse_url(visit.url, 'HOST')"))
    .drop("visits", "visit")
    .groupBy("uid")
    .agg(collect_list("host").alias("hosts"))
)

# Score with the reloaded pipeline and keep only uid plus the predicted labels,
# exposed under the names "gender" and "age".
predictions_df = (
    inf_model.transform(prep_df)
    .select(
        "uid",
        col("PredictedGender").alias("gender"),
        col("PredictedAge").alias("age"),
    )
)

In [36]:
# Serialise every prediction row into a single JSON string column named
# "value", which is the column the Kafka sink publishes.
kafka_out_df = predictions_df.select(
    f.to_json(f.struct(*predictions_df.columns)).alias('value')
)

# Batch write of all predictions to the output topic.
(
    kafka_out_df
    .write
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': 'spark-node-1.newprolab.com:6667',
        'topic': 'denis.gorbatov',
    })
    .save()
)

In [37]:
# Preview the JSON payloads that were sent to Kafka.
kafka_out_df.show(n=2, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------
 value | {"uid":"0108d217-e476-493d-8c81-a9744f12451a","gender":"M","age":"25-34"} 
-RECORD 1--------------------------------------------------------------------------
 value | {"uid":"0192cc54-559c-4c8e-89b4-5f4bf31e4245","gender":"M","age":"25-34"} 
only showing top 2 rows



In [39]:
# Streaming counterpart of the batch Kafka read above (readStream instead of
# read), with the same connection options.
# NOTE(review): no later cell visible in this notebook consumes kafka_stream —
# confirm whether the streaming path was meant to be wired in.
kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': 'spark-node-1.newprolab.com:6667',
        'subscribe': 'input_denis.gorbatov',
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
)

In [40]:


# Configure a Kafka writer for the predictions; nothing is sent until save()
# is called on it in a later cell.
# NOTE(review): despite the name, predictions_df is derived from the batch
# read (spark.read), and `.write` is the batch writer, so this is not a
# streaming sink. The checkpointLocation option presumably has no effect for a
# batch write — confirm, or rebuild this on top of kafka_stream/writeStream.
kafka_write_stream = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
    .write
    .format("kafka")
    .options(**{
        "checkpointLocation": "checkpoints/checkpoints_lab04",
        "kafka.bootstrap.servers": 'spark-node-1.newprolab.com:6667',
        "topic": 'denis.gorbatov',
    })
)


In [41]:
# IPython shell escape: list the contents of /user/denis.gorbatov on HDFS.
! hdfs dfs -ls "/user/denis.gorbatov"

Found 6 items
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 20:08 /user/denis.gorbatov/.sparkStaging
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 15:32 /user/denis.gorbatov/lab03
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 16:00 /user/denis.gorbatov/lab03_lgbt
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 20:53 /user/denis.gorbatov/lab04_model
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 15:04 /user/denis.gorbatov/test
drwxr-xr-x   - denis.gorbatov denis.gorbatov          0 2022-11-06 14:07 /user/denis.gorbatov/test_plz_work


In [42]:
# Trigger the write configured on kafka_write_stream.
# NOTE(review): the writer targets the Kafka topic set via the "topic" option;
# the HDFS-looking path passed here is presumably not used by the Kafka sink —
# confirm whether it can be dropped.
kafka_write_stream.save(path="/user/denis.gorbatov/test")

In [None]:
# Shut down the SparkSession once the notebook is finished.
spark.stop()