In [None]:
import os
import sys
# Tell PySpark which interpreter, Spark install and submit arguments to use.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 2 --executor-memory 1g --executor-cores 1 --driver-memory 3g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources; the py4j archive ends up first on
# sys.path, followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession with the Kafka SQL connector
# requested through spark.jars.packages.
# NOTE(review): PYSPARK_SUBMIT_ARGS above asks for --driver-memory 3g while
# this builder sets spark.driver.memory to 512m — confirm which is intended.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("spark-course")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

In [None]:
from pyspark.sql.functions import struct, to_json
from pyspark.sql.functions import lower
from pyspark.sql.functions import shuffle, array, lit
from pyspark.sql.functions import max, col
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
from pyspark.sql.types import StructType,StructField, FloatType, ArrayType, StringType, LongType
import json
import re
import pyspark.sql.functions as f


In [None]:
# Batch (non-streaming) read of the Kafka input topic for exploration.
kafka_params = dict([
    ("kafka.bootstrap.servers", "spark-master-1.newprolab.com:6667"),
    ("subscribe", "input_ivan.groo"),
])

df = (
    spark.read
    .format("kafka")
    .options(**kafka_params)
    .load()
)

df.printSchema()
df.show()



In [None]:
df.show()

In [None]:
# Keep only the Kafka `value` column, cast to string so it can be read.
test = df.select(f.col("value").cast("string"))
test.show(n=5, truncate=False)

In [None]:
# Training data: tab-separated text file with a header row.
data = (
    spark.read
    .format("csv")
    .options(header=True, sep="\t")
    .load("/labs/slaba04/gender_age_dataset.txt")
)

In [None]:
data.show(truncate=False)

In [None]:
# Schema of a single visit record: both fields are nullable strings.
schema1 = (
    StructType()
    .add("url", StringType(), True)
    .add("timestamp", StringType(), True)
)

In [None]:
# Schema of the JSON document: one nullable `visits` array of visit records.
schema = StructType().add("visits", ArrayType(schema1), True)

In [None]:
from pyspark.sql.functions import col,from_json

In [None]:
# Parse the raw `user_json` string into a struct holding the `visits` array.
# (The original ended this statement with a dangling `\` continuation that was
# only valid because a whitespace-only line followed it.)
dfJSON = data.withColumn("visits", from_json(col("user_json"), schema))

dfJSON.printSchema()
dfJSON.show(truncate=False)

In [None]:
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCol
from pyspark.ml.param.shared import HasInputCol
from pyspark import keyword_only

In [None]:
class ClearTransformer(Transformer, HasInputCol, HasOutputCol):
    """Flatten the parsed ``visits`` struct and keep only labelled rows."""

    def _transform(self, dataset):
        """Select gender, age, uid and the fields of ``visits``, then filter.

        Rows are kept only when ``gender`` is 'M' or 'F' and ``age`` is one of
        the five buckets listed in the age predicate below.
        """
        gender_predicate = "gender == 'M' or gender == 'F'"
        age_predicate = "age == '18-24' or age == '25-34' or age == '35-44' or age == '45-54' or age == '>=55'"

        flattened = dataset.select(col("gender"), col("age"), col("uid"), col("visits.*"))
        return flattened.filter(gender_predicate).filter(age_predicate)

In [None]:
transformer = ClearTransformer()

In [None]:
df = transformer.transform(dfJSON)
df.show(5)

In [None]:
# One row per visit: explode the `visits` array into an `exploded` struct.
df_exp = df.withColumn('exploded', f.explode(f.col("visits")))
df_exp.printSchema()
df_exp.show(n=5)

# Flatten the visit struct next to the labels and drop the visit timestamp.
df_train = (
    df_exp
    .select("gender", "age", "uid", "exploded.*")
    .drop("timestamp")
)
df_train.printSchema()
df_train.show(n=5)

In [None]:
# Collect the visited hosts per labelled user into a single list column.
# Parenthesised chaining replaces the backslash continuations; the original
# left a dangling `\` after the final call.
df_train1 = (
    df_train
    .withColumn("url_parse", f.expr("parse_url(url, 'HOST')"))
    .groupBy("gender", "age", "uid")
    .agg(f.collect_list('url_parse').alias("collect_url"))
)

In [None]:
htf = HashingTF(inputCol="collect_url", outputCol="tf", numFeatures= 10000, binary = True )

In [None]:
tf = htf.transform(df_train1)

In [None]:
tf.show(5, False, True)

In [None]:
tf.show(5)

In [None]:
indexers_age = StringIndexer(inputCol="age", outputCol="age_index")

In [None]:
indexers_gender = StringIndexer(inputCol="gender", outputCol="gender_index")

In [None]:
df_ind = indexers_age.fit(df_train1)

In [None]:
df3 = df_ind.transform(df_train1)

In [None]:
df_ind2 = indexers_gender.fit(df_ind.transform(df_train1))

In [None]:
df_train = df_ind2.transform(df3)

In [None]:
df_train.show(5)

In [None]:
rf_age = RandomForestClassifier(featuresCol = 'tf', labelCol = 'age_index', predictionCol = "age_predict",  rawPredictionCol = "age_rawPredictionCol",  probabilityCol = "age_probabilityCol", seed=42, numTrees=40, maxDepth=12)

In [None]:
rf_gender = RandomForestClassifier(featuresCol = 'tf', labelCol = 'gender_index', predictionCol = "gender_predict", rawPredictionCol = "gender_rawPredictionCol",  probabilityCol = "gender_probabilityCol", seed=42, numTrees=40, maxDepth=12)

In [None]:
%%time
# df_train is built from df_train1 plus the two label indexes and has no 'tf'
# column, while rf_age reads featuresCol='tf'; hash the hosts with htf first.
rfModel_age = rf_age.fit(htf.transform(df_train))

In [None]:
rfModel_age.numClasses

In [None]:
%%time
# Fit on the indexed training frame rather than `predictions_age`, which is
# only created in a later cell (NameError on a fresh Run All). rf_gender reads
# featuresCol='tf', so the hashed features are added with htf here.
rfModel_gender = rf_gender.fit(htf.transform(df_train))

In [None]:
# The gender model needs the 'tf' features column, which df_train lacks.
predictions = rfModel_gender.transform(htf.transform(df_train)).cache()

In [None]:
# The age model needs the 'tf' features column, which df_train lacks.
predictions_age = rfModel_age.transform(htf.transform(df_train)).cache()

In [None]:
predictions_age.show(5, False)

In [None]:
# Array-of-visits schema used to parse the Kafka messages.
# NOTE(review): this rebinds `schema1`, which earlier named the single-visit
# StructType; re-running the earlier `schema` cell afterwards would change it.
schema1 = ArrayType(
    StructType()
    .add("url", StringType(), True)
    .add("timestamp", StringType(), True)
)

In [None]:
# Outer message schema: `visits` is declared as a string here and is parsed
# separately with from_json in a later cell.
schema_test = (
    StructType()
    .add("uid", StringType(), True)
    .add("visits", StringType(), True)
)


In [None]:
test.show(2,False)

In [None]:
df_test = test.withColumn("visits_col",from_json(col("value"),schema_test))
                   


In [None]:
df_test.printSchema()
df_test.show(1)


In [None]:
df_test.select(col("visits_col").uid, f.col("visits_col").visits).show(5)

In [None]:
# Pull uid out of the parsed message and parse the nested visits JSON string.
df6 = df_test.select(
    col("visits_col")["uid"].alias("uid"),
    f.from_json(col("visits_col")["visits"], schema1).alias("visits"),
).cache()

In [None]:
df_test.show()


In [None]:
# One row per visit for the Kafka data.
df_exp_t = df6.withColumn('exploded', f.explode(f.col("visits")))
df_exp_t.printSchema()
df_exp_t.show(n=5)

# Flatten the visit struct and drop the visit timestamp.
df_test_t = (
    df_exp_t
    .select("uid", "exploded.*")
    .drop("timestamp")
)
df_test_t.printSchema()
df_test_t.show(n=5)

In [None]:
# Collect the visited hosts per uid into a single list column.
# Parenthesised chaining replaces the backslash continuations; the original
# left a dangling `\` after the final call.
df_test_t1 = (
    df_test_t
    .withColumn("url_parse", f.expr("parse_url(url, 'HOST')"))
    .groupBy("uid")
    .agg(f.collect_list('url_parse').alias("collect_url"))
)

In [None]:
test_tf = htf.transform(df_test_t1)

In [None]:
prediction_g = rfModel_gender.transform(test_tf).cache()

In [None]:
prediction_a = rfModel_age.transform(test_tf).cache()

In [None]:
prediction_a.show()

In [None]:
converter_age = IndexToString(inputCol="age_predict", outputCol="age", labels=df_ind.labels)

In [None]:
converter_gender = IndexToString(inputCol="gender_predict", outputCol="gender", labels=df_ind2.labels)

In [None]:
pr_a = converter_age.transform(prediction_a)

In [None]:
pr_g = converter_gender.transform(prediction_g)

In [None]:
# End-to-end pipeline: hash hosts -> age forest -> gender forest -> map the
# predicted indexes back to their string labels.
pipeline = Pipeline(
    stages=[
        htf,
        rf_age,
        rf_gender,
        converter_age,
        converter_gender,
    ]
)

In [None]:
%%time
model = pipeline.fit(df_train)

In [None]:
model.write().overwrite().save("/user/ivan.groo/lab04_model")

In [None]:
from pyspark.ml import  PipelineModel

In [None]:
model = PipelineModel.load("/user/ivan.groo/lab04_model")

In [None]:
predictions = model.transform(df_test_t1).select("uid", "Age", "gender")

In [None]:
predictions.show()

In [None]:
# Kafka connection settings used by the batch and streaming cells below.
# NOTE(review): the exploratory read near the top of the notebook uses
# spark-master-1.newprolab.com:6667 instead — confirm both hosts are intended.
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_ivan.groo'
KAFKA_OUTPUT_TOPIC = 'ivan.groo'

In [None]:
# Read in static (batch) mode

kafka_read_df = (
    spark.read
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'subscribe': KAFKA_INPUT_TOPIC,
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
    .cache()
)

In [None]:
kafka_test = kafka_read_df.select(col("value").cast("string"))


In [None]:
kafka_test.count()

In [None]:
kafka_test.count()

In [None]:
kafka_test.show(5, False)

In [None]:
kafka_test.show(15, False)

In [None]:
def kill_all(streams=None):
    """Stop streaming queries and print what each one was reading.

    Args:
        streams: Optional iterable of streaming queries to stop. When omitted,
            every active query of the current SparkSession is stopped.
    """
    if streams is None:
        streams = SparkSession.builder.getOrCreate().streams.active
    for query in streams:
        # lastProgress is None until the query has reported its first progress
        # update; subscripting it directly would raise and leave the remaining
        # queries running. Fall back to the query's name or id for the label.
        progress = query.lastProgress
        sources = (progress or {}).get("sources") or []
        if sources:
            label = sources[0]["description"]
        else:
            label = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=label))

In [None]:
# Outer Kafka message: `visits` is declared as a string and parsed separately.
event_schema = (
    StructType()
    .add('uid', StringType(), True)
    .add('visits', StringType(), True)
)


# Schema applied to the `visits` string: an array of (url, timestamp) structs.
visit_schema = ArrayType(
    StructType()
    .add('url', StringType(), True)
    .add('timestamp', LongType(), True)
)

In [None]:
# Decode the Kafka payload in three steps: cast value to string, parse the
# outer event, then parse the nested visits string.
clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json('value', event_schema).alias('event'))
    .select(
        f.col('event.uid'),
        f.from_json('event.visits', visit_schema).alias('visits'),
    )
)

In [None]:
clean_df.show(5)

In [None]:
# One row per visit for the decoded Kafka batch.
df_exp_t = clean_df.withColumn('exploded', f.explode(f.col("visits")))
df_exp_t.printSchema()
df_exp_t.show(n=5)

# Flatten the visit struct and drop the visit timestamp.
df_test_t = (
    df_exp_t
    .select("uid", "exploded.*")
    .drop("timestamp")
)
df_test_t.printSchema()
df_test_t.show(n=5)

In [None]:
# Collect the visited hosts per uid into a single list column.
# Parenthesised chaining replaces the backslash continuations; the original
# left a dangling `\` after the final call.
df_test_t1 = (
    df_test_t
    .withColumn("url_parse", f.expr("parse_url(url, 'HOST')"))
    .groupBy("uid")
    .agg(f.collect_list('url_parse').alias("collect_url"))
)

In [None]:
# `"Age".alias(...)` was called on a plain str (AttributeError); wrap the name
# in col() so the alias is applied to a Column, as the streaming cell does.
predictions_df = model.transform(df_test_t1).select("uid", "gender", col("Age").alias("age"))

In [None]:
predictions_df.show()

In [None]:
# Wrap the predictions back into JSON
# NOTE(review): limit(5) means only five rows are written — confirm this is
# intentional and not a debugging leftover.

kafka_out_df = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
    .limit(5)
)

# Write to the output topic in batch mode

(
    kafka_out_df
    .write
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'topic': KAFKA_OUTPUT_TOPIC,
    })
    .save()
)

In [None]:
# Read the stream

kafka_stream = (
    spark
    .readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'subscribe': KAFKA_INPUT_TOPIC,
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
)

In [None]:
kafka_stream

In [None]:
# Same decoding as the batch path, carrying the Kafka `timestamp` column along.
clean_df_s = (
    kafka_stream
    .select(f.col('value').cast('string').alias('value'), f.col('timestamp'))
    .select(f.from_json('value', event_schema).alias('event'), f.col('timestamp'))
    .select(
        f.col('event.uid'),
        f.from_json('event.visits', visit_schema).alias('visits'),
        f.col('timestamp'),
    )
)

In [None]:
# One row per visit, then keep only uid and the flattened visit fields.
df_exp_s = clean_df_s.withColumn('exploded', f.explode(f.col("visits")))
df_test_s = df_exp_s.select("uid", "exploded.*")


In [None]:
# Collect the visited hosts per uid into a single list column (streaming).
# Parenthesised chaining replaces the backslash continuations; the original
# left a dangling `\` after the final call.
df_test_s1 = (
    df_test_s
    .withColumn("url_parse", f.expr("parse_url(url, 'HOST')"))
    .groupBy("uid")
    .agg(f.collect_list('url_parse').alias("collect_url"))
)

In [None]:
predictions_dfs = model.transform(df_test_s1).select("uid", "gender", col('Age').alias('age'))


In [None]:
# Prediction and writing

kafka_write_stream = (
    predictions_dfs
    .select(f.to_json(f.struct(*predictions_dfs.columns)).alias('value'))
    .writeStream
    .format("kafka")
    .outputMode("complete")
    .options(**{
        "checkpointLocation": "checkpoints/checkpoints_lab04/groo",
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
        "topic": KAFKA_OUTPUT_TOPIC,
    })
)

In [None]:
sq_w = kafka_write_stream.start()


In [None]:
sq_w.isActive

In [None]:
kill_all()

In [None]:
spark.stop()