In [None]:
import os
import sys
# Tell PySpark which interpreter and Spark distribution to use, and how the
# driver/executors are sized, before any pyspark module is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 8 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources at the front of the import path:
# the py4j archive first, then the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [None]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier


from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType, LongType

from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number, explode, split
from pyspark.sql.functions import array, collect_set, lit, from_json, to_json, struct, regexp_replace

from pyspark.mllib.linalg import SparseVector, DenseVector


from urllib.parse import urlparse
import json
import re

In [None]:
from pyspark.sql import SparkSession

# Local two-thread session for this lab. The Kafka SQL connector is requested
# through spark.jars.packages because later cells read from and write to Kafka.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("RIK_laba4")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

# spark

# Обучение

In [None]:
# Columns of the tab-separated training file; every column is read as text.
schema = StructType([
    StructField("gender", StringType(), True),
    StructField("age", StringType(), True),
    StructField("uid", StringType(), True),
    StructField("user_json", StringType(), True),
])

# Outer JSON document. `visits` is kept as a raw string here and parsed with
# `visit_schema` in a second from_json pass.
event_schema = (
    StructType()
    .add('uid', StringType(), True)
    .add('visits', StringType(), True)
)

# The `visits` payload: an array of {url, timestamp} records.
visit_schema = ArrayType(
    StructType()
    .add('url', StringType(), True)
    .add('timestamp', LongType(), True)
)
    
# Load the labelled training set (tab-separated file with a header row).
df_train_raw = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", "\t")
    .schema(schema)
    .load("/labs/slaba04/gender_age_dataset.txt")
)

# Turn every visit into one (gender, age, uid, domain) row:
#   1. drop rows whose gender is the '-' placeholder,
#   2. parse the outer JSON, then the nested `visits` array,
#   3. explode the array (the exploded struct gets the default column name `col`),
#   4. keep only URLs from which a host can be extracted.
df_train = (
    df_train_raw
    .filter(col("gender") != '-')
    .select("gender", "age", "uid", from_json("user_json", event_schema).alias("event"))
    .select("gender", "age", "uid", from_json(col("event.visits"), visit_schema).alias("visits"))
    .select("gender", "age", "uid", explode(col("visits")))
    .selectExpr("gender", "age", "uid", "parse_url(col.url, 'HOST') as host")
    .filter(col("host").isNotNull())
    # regexp_replace takes a regular expression: the dot must be escaped so that
    # only the literal text "www." is removed. Unescaped, "www." also matches
    # e.g. the "wwwa" in "awwwards.com". The escaped form agrees with the
    # literal str.replace('www.', '') applied to streamed URLs at inference time.
    .withColumn('domain', regexp_replace('host', r'www\.', ''))
    .select('gender', 'age', 'uid', 'domain')
)

In [None]:
df_train.createOrReplaceTempView("train")

# `train_not` lists the domains that must NOT be used for training: those that
# occur in at most two visit rows. (GROUP BY already yields one row per domain,
# so no DISTINCT is needed.)
spark.sql("""SELECT domain
        FROM train
        GROUP BY domain
        HAVING count(1) <= 2""").createOrReplaceTempView("train_not")

# Remove the rare domains with an anti join. The previous inner join did the
# opposite and kept *only* the rows whose domain was listed in `train_not`.
# No join hint is given; the former `/*broadcast(tn)*/` text was a plain SQL
# comment (hints need the `/*+ ... */` form) and never had any effect.
df_train = spark.sql("""SELECT t.*
        FROM train t
        LEFT ANTI JOIN train_not tn
            ON t.domain = tn.domain
        """).cache()

# Collapse to one row per user with the set of distinct domains they visited.
df_train = df_train.groupBy("gender", "age", "uid").agg(collect_set('domain').alias('domain'))

In [None]:
# Preview a few rows of the aggregated training frame
# (one row per gender/age/uid with its collected set of domains).
df_train.show(5)

In [None]:
# Fixed seed so that re-running the notebook trains the same forests.
RF_SEED = 42

# Feature pipeline, reused unchanged on the streaming data later on:
# hash each user's domain list into a 1000-bucket term-frequency vector,
# then normalise that vector.
hasher_freq = HashingTF(numFeatures=1000, binary=False, inputCol="domain", outputCol="domain_vector")
normalizer = Normalizer(inputCol='domain_vector', outputCol="domain_norm")


df_train_norm = normalizer.transform(hasher_freq.transform(df_train))


# Encode the string labels as digit strings; the inverse mapping is applied to
# the model output when predictions are written back to Kafka.
df_train_norm = (
    df_train_norm
    .replace(['F', 'M'], ['0','1'], "gender")
    .replace(['18-24', '25-34', '35-44', '45-54', '>=55'], ['1','2','3','4','5'], "age")
)

# Cast the encoded labels to integers and keep only labels + features.
df_train_norm.createOrReplaceTempView("train_norm")
df_train_norm = spark.sql("""
    SELECT CAST(gender AS INT) as gender, CAST(age AS INT) age, domain_norm
    FROM train_norm""")

# One random forest per target, both trained on the same feature column.
rf_age = RandomForestClassifier(labelCol="age", featuresCol="domain_norm", seed=RF_SEED)
rf_age_model = rf_age.fit(df_train_norm)

rf_gender = RandomForestClassifier(labelCol="gender", featuresCol="domain_norm", seed=RF_SEED)
rf_gender_model = rf_gender.fit(df_train_norm)

# Проверка

In [None]:
# Settings for the Kafka source: subscribe to the input topic and start from
# the latest offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_roman.kozhushko",
    "startingOffsets": "latest",
    "failOnDataloss": "False",
}

# Streaming DataFrame over the input topic.
kafka_read_df = (
    spark.readStream
    .format('kafka')
    .options(**kafka_source_options)
    .load()
)

In [None]:
# Print the schema of the DataFrame returned by the Kafka source.
kafka_read_df.printSchema()

In [None]:
@udf(returnType=ArrayType(StringType()))
def visits_to_domain(visit):
    """Return the distinct domains found in a user's list of visits.

    Args:
        visit: the parsed `visits` array (records with a `url` attribute),
            or None when the JSON could not be parsed.

    Returns:
        A list of unique host names with the literal text ``www.`` removed.
        Visits with a missing URL, a URL that cannot be parsed, or a URL
        without a host are skipped; a missing array yields an empty list.
    """
    domains = set()
    for item in visit or []:
        url = item.url if item is not None else None
        if not url:
            continue
        try:
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal);
            # one bad record must not fail the whole streaming query.
            continue
        if host:
            domains.add(host.replace('www.', ''))
    return list(domains)

# Decode each Kafka message value as JSON, parse the nested `visits` array and
# derive the per-message list of visited domains.
df_domain = (
    kafka_read_df
    .selectExpr('CAST(value AS STRING)')
    .select(from_json("value", event_schema).alias("event"))
    .select("event.uid", from_json(col("event.visits"), visit_schema).alias("visits"))
    .withColumn('domain', visits_to_domain(col("visits")))
)

In [None]:
# Same feature pipeline as in training, applied to the streamed messages.
df_domain_norm = normalizer.transform(hasher_freq.transform(df_domain))

# Apply the two models one after the other on the same rows instead of scoring
# separately and joining the results on uid. A join between two streaming
# frames without a watermark keeps every row in state indefinitely and pairs
# rows from different micro-batches whenever a uid occurs more than once.
# After the first model only the columns the second one needs are kept, so the
# models' identically named output columns cannot clash.
with_gender = (
    rf_gender_model.transform(df_domain_norm)
    .select("uid", "domain_norm", col("prediction").alias("gender"))
)

with_gender_age = (
    rf_age_model.transform(with_gender)
    .select("uid", "gender", col("prediction").alias("age"))
)

# Turn the numeric class indices back into the original string labels
# (inverse of the encoding used for training).
prediction = (
    with_gender_age
    .selectExpr(
        "uid",
        "CAST(CAST(gender AS INT) AS String) AS gender",
        "CAST(CAST(age AS INT) AS String) AS age",
    )
    .replace(['0','1'], ['F', 'M'], "gender")
    .replace(['1','2','3','4','5'], ['18-24', '25-34', '35-44', '45-54', '>=55'], "age")
)

# Shape the rows for the Kafka sink: an empty string key and a JSON value
# holding uid, gender and age.
prediction = (
    prediction
    .select(lit("").alias('key'), to_json(struct(*prediction.columns)).alias('value'))
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
)

In [None]:
# Sanity check: display whether `prediction` is a streaming DataFrame.
prediction.isStreaming

In [None]:
# Settings for the Kafka sink: target topic plus the checkpoint directory
# used by the streaming query.
kafka_sink_options = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "topic": "roman.kozhushko",
    'checkpointLocation': './checkpoint_lab04',
}

# Append-mode streaming writer triggered every five seconds.
writer = (
    prediction.writeStream
    .format("kafka")
    .outputMode("append")
    .options(**kafka_sink_options)
    .trigger(processingTime="5 seconds")
)

# Launch the query; the returned handle is used below to inspect and stop it.
writer_stream = writer.start()

In [None]:
# Display whether the streaming query started above is still running.
writer_stream.isActive

In [None]:
# Stop the streaming query that writes predictions to Kafka.
writer_stream.stop()

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()