In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql.functions import from_json, col, explode, expr, \
collect_list, to_json, struct

from pyspark.sql.types import StructField, StructType, StringType, ArrayType, LongType
from pyspark.ml.feature import HashingTF, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
train_df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header=True, inferSchema=True, sep='\t')

In [6]:
v_type = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType(), True),
            StructField("timestamp", LongType(), True) 
            ])
        ))
    ])

In [7]:
pre_train = train_df \
    .withColumn("visits", from_json(col("user_json"), v_type)) \
    .withColumn("visit", explode("visits.visits").alias("visit")) \
    .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
    .drop("visits", "visit","user_json")

In [8]:
final_train = pre_train \
    .groupBy("gender", "age", "uid") \
    .agg(collect_list("host") \
    .alias("hosts"))

In [9]:
final_train.cache()

DataFrame[gender: string, age: string, uid: string, hosts: array<string>]

In [10]:
x_train, x_test = final_train.randomSplit([0.8, 0.2], seed=12345)

In [11]:
hashing_TF = HashingTF(inputCol="hosts", outputCol="raw_features", numFeatures=10000, binary=False)

In [12]:
indexer_age = StringIndexer(inputCol="age", outputCol="age_index").fit(final_train)

In [13]:
indexer_gender = StringIndexer(inputCol="gender", outputCol="gender_index").fit(final_train)

In [14]:
rf_age = RandomForestClassifier(featuresCol = 'raw_features', labelCol = 'age_index', 
                           predictionCol="age_index_prediction", rawPredictionCol="age_index_raw_prediction",
                           probabilityCol = "age_probability")

In [15]:
rf_gender = RandomForestClassifier(featuresCol = 'raw_features', labelCol = 'gender_index',
                              predictionCol="gender_index_prediction", rawPredictionCol="gender_index_raw_prediction",
                              probabilityCol = "gender_probability")

In [16]:
converter_age = IndexToString(inputCol="age_index_prediction", outputCol="predicted_age", labels=indexer_age.labels)

In [17]:
converter_gender = IndexToString(inputCol="gender_index_prediction", outputCol="predicted_gender", labels=indexer_gender.labels)

In [18]:
pipeline = Pipeline(stages=[hashing_TF, indexer_age, indexer_gender, rf_age, rf_gender, 
                            converter_age, converter_gender])

In [19]:
model = pipeline.fit(x_train)

In [20]:
predictions = model.transform(x_test)

In [21]:
predictions.select("gender", "age", "predicted_age", "predicted_gender").show(5)

+------+---+------------+---------------+
|gender|age|PredictedAge|PredictedGender|
+------+---+------------+---------------+
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
|     -|  -|       25-34|              M|
+------+---+------------+---------------+
only showing top 5 rows



In [22]:
evaluator_age = MulticlassClassificationEvaluator(
    labelCol="age_index", predictionCol="age_index_prediction", metricName="accuracy")
accuracy_age = evaluator_age.evaluate(predictions)
evaluator_gender = MulticlassClassificationEvaluator(
    labelCol="gender_index", predictionCol="gender_index_prediction", metricName="accuracy")
accuracy_gender = evaluator_gender.evaluate(predictions)

print("Accuracy for age: " + str(accuracy_age))
print("Accuracy for gender: " + str(accuracy_gender))

Accuracy for age: 0.3780164092664093
Accuracy for gender: 0.48359073359073357


In [23]:
model.write().overwrite().save("/user/andrey.kachetov/lab04_model")

In [24]:
!hdfs dfs -du /user/andrey.kachetov/lab04_model

395    1185    /user/andrey.kachetov/lab04_model/metadata
86841  260523  /user/andrey.kachetov/lab04_model/stages


In [25]:
inf_model = PipelineModel.load("/user/andrey.kachetov/lab04_model")

In [26]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_andrey.kachetov",
    "startingOffsets": "latest"
}

In [27]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "andrey.kachetov"
}

In [28]:
event_type  = StructType([
    StructField("uid", StringType(), True),
    StructField("visits", StringType(), True),
    ])

visit_type = ArrayType(
    StructType([
        StructField("url", StringType(), True),
        StructField("timestamp", LongType(), True) 
    ])
)

In [29]:
def process_batch(batch_df):
    clean_df = batch_df \
        .select(col("value").cast("string").alias("value")) \
        .select(from_json(col("value"), event_type).alias("data")) \
        .select("data.*") \
        .select("uid", from_json(col("visits"), visit_type).alias("visits"))
    
    proc_df = clean_df \
        .withColumn("visit", explode("visits").alias("visit")) \
        .withColumn("host", expr("parse_url(visit.url, 'HOST')").alias("host")) \
        .drop("visits", "visit") \
        .groupBy("uid") \
        .agg(collect_list("host").alias("hosts"))
    
    predictions_df = inf_model.transform(proc_df) \
        .select("uid", "predicted_gender", "predicted_age") \
        .withColumnRenamed("predicted_age","age") \
        .withColumnRenamed("predicted_gender","gender")

    kafka_df = predictions_df \
        .select(to_json(struct(*predictions_df.columns)).alias("value"))
    
    kafka_df \
        .write \
        .format("kafka") \
        .options(**write_kafka_params)\
        .mode("append") \
        .save()

In [8]:
kafka_test_df = spark \
    .readStream \
    .format("kafka") \
    .option("failOnDataLoss", 'False') \
    .options(**read_kafka_params) \
    .load()

In [10]:
sq = kafka_test_df \
    .writeStream \
    .foreachBatch(process_batch) \
    .option("checkpointLocation", "streaming/chk/andrey_kachetov_chk_kafka")\
    .start()

In [59]:
# Stop streaming
sq.stop()

In [None]:
spark.stop()

In [9]:
!hdfs dfs -rm -r -f streaming/chk/

21/03/22 13:46:05 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/andrey.kachetov/streaming/chk' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/andrey.kachetov/.Trash/Current/user/andrey.kachetov/streaming/chk1616409965278


In [76]:
!hdfs dfs -ls streaming/chk/

ls: `streaming/chk/': No such file or directory
