In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master("local[2]") \
                    .appName("GP lab04") \
                    .config("spark.driver.memory", "512m") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
                    .getOrCreate()

spark

In [3]:
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list,regexp_replace, get_json_object, to_json, struct
import pyspark.sql.types as t
import pyspark.sql.functions as f

from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType, TimestampType, LongType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression ,DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

In [4]:
! hdfs dfs -ls "/labs/slaba04/"

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [5]:
path = "/labs/slaba04/gender_age_dataset.txt"

schema = StructType(fields=[
    t.StructField('gender', StringType()),
    t.StructField('age', StringType()),
    t.StructField('uid', StringType()),
    t.StructField('user_json', StringType()),
])

df = spark.read.csv(path, header=True, schema=schema, sep='\t')
df.show(1, truncate=False, vertical=True)
print(df)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender    | F                                                                                                                                                                                                                                                                             

In [6]:
VisitsType = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(), True),
            StructField("timestamp", LongType(), True) 
            ])
        ))
    ])

In [7]:
df_flattened = df \
    .withColumn("visits", from_json(col("user_json"), VisitsType)) \
    .withColumn("visit", explode("visits.visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit","user_json")

df_flattened.show(2, truncate=False, vertical=True)

-RECORD 0--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | zebra-zoya.ru                        
-RECORD 1--------------------------------------
 gender | F                                    
 age    | 18-24                                
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777 
 host   | news.yandex.ru                       
only showing top 2 rows



In [8]:
df_flattened.where(col("host").isNull()).show(2, truncate=False, vertical=True)

-RECORD 0--------------------------------------
 gender | M                                    
 age    | 25-34                                
 uid    | d502331d-621e-4721-ada2-5d30b2c3801f 
 host   | null                                 
-RECORD 1--------------------------------------
 gender | M                                    
 age    | >=55                                 
 uid    | d503c3b2-a0c2-4f47-bb27-065058c73008 
 host   | null                                 
only showing top 2 rows



In [9]:
df_final = df_flattened \
    .groupBy("gender", "age", "uid") \
    .agg(collect_list("host") \
    .alias("hosts"))\
    .where(col("age") != "-")\
    .where(col("gender") != "-")\
    .filter(f.size('hosts') != 0)
df_final.show(2, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender | F                                                                                                                                                                                                                                                                                             
 age    | 18-24                                                                                                                                                                                                                                                                                         
 uid    | 09b1ecd3-b2d2-4c1b-857a-025c0509d9ec                                                               

In [10]:
X_train, X_test = df_final.randomSplit([0.8, 0.2], seed=456)

In [11]:
# Добавим индексацию таргетов
indexer_age = StringIndexer(inputCol="age", outputCol="ageIndex").fit(df_final)
indexer_gender = StringIndexer(inputCol="gender", outputCol="genderIndex").fit(df_final)

In [13]:
from pyspark.ml.feature import CountVectorizer, Tokenizer

In [15]:
tokenizer = Tokenizer(inputCol="hosts", outputCol="rawHosts")

In [20]:
# Добавим count_vectorizer для создания вектора с посещенными сайтами
count_vectorizer = CountVectorizer(inputCol="hosts", outputCol="hosts_vector", binary=True)

In [21]:
count_vectorizer.params

[Param(parent='CountVectorizer_a78b5ab8c111', name='binary', doc='Binary toggle to control the output vector values. If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False'),
 Param(parent='CountVectorizer_a78b5ab8c111', name='inputCol', doc='input column name.'),
 Param(parent='CountVectorizer_a78b5ab8c111', name='maxDF', doc='Specifies the maximum number of different documents a term could appear in to be included in the vocabulary. A term that appears more than the threshold will be ignored. If this is an integer >= 1, this specifies the maximum number of documents the term could appear in; if this is a double in [0,1), then this specifies the maximum fraction of documents the term could appear in. Default (2^63) - 1'),
 Param(parent='CountVectorizer_a78b5ab8c111', name='minDF', doc='Specifies the minimum number of different documents a term must appear

In [28]:
from pyspark.sql.functions import size,col
from pyspark.ml.feature import VectorAssembler

In [36]:
# Добавим число посещенных сайтов
df_final = df_final.withColumn("hosts_len", size(col("hosts")))

In [29]:
assembler = VectorAssembler(inputCols=[count_vectorizer.getOutputCol(), "hosts_len"], outputCol="features")

In [34]:
rf_age = RandomForestClassifier(featuresCol = 'hosts_vector', labelCol = 'ageIndex', 
                           predictionCol="age_index_prediction", rawPredictionCol="age_index_raw_prediction",
                           probabilityCol = "age_probability")

In [35]:
rf_gender = RandomForestClassifier(featuresCol = 'hosts_vector', labelCol = 'genderIndex',
                              predictionCol="gender_index_prediction", rawPredictionCol="gender_index_raw_prediction",
                              probabilityCol = "gender_probability")

In [32]:
converter_age = IndexToString(inputCol="age_index_prediction", outputCol="PredictedAge", labels=indexer_age.labels)

In [33]:
converter_gender = IndexToString(inputCol="gender_index_prediction", outputCol="PredictedGender", labels=indexer_gender.labels)

In [37]:
pipeline = Pipeline(stages=[count_vectorizer, assembler, indexer_age, indexer_gender, rf_age, rf_gender, 
                            converter_age, converter_gender])

In [38]:
df_final.repartition(15).rdd.getNumPartitions()

15

In [39]:
X_train, X_test = df_final.randomSplit([0.8, 0.2], seed=456)
X_train.repartition(15)
X_train.cache()

DataFrame[gender: string, age: string, uid: string, hosts: array<string>, hosts_len: int]

In [40]:
model = pipeline.fit(X_train)

In [41]:
predictions = model.transform(X_test)

In [42]:
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="ageIndex", predictionCol="age_index_prediction", metricName="accuracy")
accuracy_age = evaluator_age.evaluate(predictions)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="genderIndex", predictionCol="gender_index_prediction", metricName="accuracy")
accuracy_gender = evaluator_gender.evaluate(predictions)

print("Accuracy for age column: " + str(accuracy_age))
print("Accuracy for gender column: " + str(accuracy_gender))

Accuracy for age: 0.43338930870417014
Accuracy for gender: 0.533445284075007


In [43]:
model.write().overwrite().save("/user/georgiy.krupenchenkov/lab04_model")

In [1]:
# закончил бачовый тест для пайплайна, теперь для сделаем стримминг 

# Spark Streamming

In [44]:
KAFKA_BOOTSTRAP_SERVER = 'spark-master-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_georgiy.krupenchenkov'
KAFKA_OUTPUT_TOPIC = 'georgiy.krupenchenkov'

In [48]:
# чтение в статическом режиме
kafka_read_df = (
    spark.read
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
    .cache()
)

In [49]:
kafka_read_df.show(3)

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_georgiy.kru...|        0|     0|2022-11-06 19:42:...|            0|
|null|[7B 22 75 69 64 2...|input_georgiy.kru...|        0|     1|2022-11-06 19:42:...|            0|
|null|[7B 22 75 69 64 2...|input_georgiy.kru...|        0|     2|2022-11-06 19:42:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [51]:
# Парсинг бинарного файла из кафки
event_schema = t.StructType([
    t.StructField('uid', t.StringType(), True),
    t.StructField('visits', t.StringType(), True),
])

visit_schema = t.ArrayType(
    t.StructType([
        t.StructField('url', t.StringType(), True),
        t.StructField('timestamp', t.LongType(), True)
    ])
)

clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid', 
        f.from_json(f.col('event.visits'), visit_schema).alias('visits')
    )
)

clean_df = clean_df.withColumn("hosts_len", size(col("visits")))

In [52]:
clean_df.show(3)

+--------------------+--------------------+---------+
|                 uid|              visits|hosts_len|
+--------------------+--------------------+---------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|     2000|
|bd7a6f52-45db-49b...|[[https://www.pac...|     1284|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|        4|
+--------------------+--------------------+---------+
only showing top 3 rows



In [57]:
inf_model = PipelineModel.load("/user/georgiy.krupenchenkov/lab04_model")

In [60]:
prep_df = clean_df \
    .withColumn("visit", explode("visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit") \
    .groupBy("uid") \
    .agg(collect_list("host").alias("hosts")) \
    .withColumn("hosts_len", size(col("hosts")))

In [61]:
prep_df.show(3)

+--------------------+--------------------+---------+
|                 uid|               hosts|hosts_len|
+--------------------+--------------------+---------+
|0108d217-e476-493...|[kvartblog.ru, kv...|        3|
|0192cc54-559c-4c8...|[metanol.lv, meta...|       43|
|019acd5e-be9a-4cd...|[www.russianfood....|       26|
+--------------------+--------------------+---------+
only showing top 3 rows



In [62]:
predictions_df = inf_model.transform(prep_df) \
    .select("uid", "PredictedGender", "PredictedAge") \
    .withColumnRenamed("PredictedAge","age") \
    .withColumnRenamed("PredictedGender","gender")

In [63]:
predictions_df.show(3)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|0108d217-e476-493...|     M|25-34|
|0192cc54-559c-4c8...|     M|25-34|
|019acd5e-be9a-4cd...|     M|25-34|
+--------------------+------+-----+
only showing top 3 rows



In [64]:
# извлечение url
# применение модели, сохранение предсказаний в predictions_df  
# Оборачивание предсказания обратно в json

kafka_out_df = (
    predictions_df.select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
)

# Запись в выходной топикa
(
    kafka_out_df
    .write
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('topic', KAFKA_OUTPUT_TOPIC)
    .save()
)

In [65]:
kafka_out_df.show(2, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------
 value | {"uid":"0108d217-e476-493d-8c81-a9744f12451a","gender":"M","age":"25-34"} 
-RECORD 1--------------------------------------------------------------------------
 value | {"uid":"0192cc54-559c-4c8e-89b4-5f4bf31e4245","gender":"M","age":"25-34"} 
only showing top 2 rows



In [66]:
kafka_out_df.count()

5000

In [67]:
kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER)
    .option('subscribe', KAFKA_INPUT_TOPIC)
    .option('startingOffsets', 'earliest')
    .option('failOnDataLoss', 'False')
    .load()
)

In [68]:
# предсказание и запись
kafka_write_stream = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
    .write
    .format("kafka")
#    .outputMode("append")
    .option("checkpointLocation", "checkpoints/checkpoints_lab04")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVER)
    .option("topic", KAFKA_OUTPUT_TOPIC)
)

In [71]:
!hdfs dfs -ls "/user/georgiy.krupenchenkov"

Found 4 items
drwxr-xr-x   - georgiy.krupenchenkov georgiy.krupenchenkov          0 2022-11-06 18:55 /user/georgiy.krupenchenkov/.sparkStaging
drwxr-xr-x   - georgiy.krupenchenkov georgiy.krupenchenkov          0 2022-11-06 15:52 /user/georgiy.krupenchenkov/lab03.csv
drwxr-xr-x   - georgiy.krupenchenkov georgiy.krupenchenkov          0 2022-11-06 19:49 /user/georgiy.krupenchenkov/lab03_folder
drwxr-xr-x   - georgiy.krupenchenkov georgiy.krupenchenkov          0 2022-11-06 19:39 /user/georgiy.krupenchenkov/lab04_model


In [70]:
kafka_write_stream.save("/user/georgiy.krupenchenkov/test")

In [72]:
spark.stop()