In [None]:
import os
import sys

# Environment consumed by PySpark when the shell script below starts the session:
# the Python interpreter to use, the Spark installation, and the submit arguments.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 --executor-memory 3g --executor-cores 3 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Validate before building any path: os.path.join(None, ...) would otherwise fail
# with a TypeError and this explicit, readable error would never be reached.
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's own shell bootstrap in this namespace; the context manager
# closes the script file once its source has been read.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())


In [None]:
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import *
import numpy as np

In [None]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.sql.functions import col, desc, pandas_udf, PandasUDFType, udf, regexp_replace, when, asc, lit, broadcast
from pyspark.sql.types import StructType, IntegerType, StructField, DateType, StringType, TimestampType, FloatType, ArrayType, LongType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql.functions import struct, to_json
from pyspark.sql.functions import shuffle, array, lit
from pyspark.sql.functions import col, explode
from pyspark.sql.functions import lower
from pyspark.sql.functions import struct, to_json
from pyspark.ml import Pipeline, PipelineModel

In [None]:
# Obtain the session (created if none exists yet) with Hive support switched on.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Layout of the tab-separated training file: four string columns,
# the last one carrying the per-user JSON payload.
schema = (
    StructType()
    .add("gender", StringType())
    .add("age", StringType())
    .add("uid", StringType())
    .add("user_json", StringType())
)

In [None]:
# Load the labelled dataset using the explicit schema declared for it.
df = spark.read.csv(
    "/labs/slaba04/gender_age_dataset.txt",
    schema=schema,
    sep='\t',
    header=True,
)

In [None]:
# Peek at a single raw row.
df.show(n=1)

In [None]:
# Shape of the JSON stored in `user_json`: an object with a `visits` array
# whose elements hold a `url` string and a `timestamp` long.
schema_visit = t.StructType([
    t.StructField(
        'visits',
        t.ArrayType(
            t.StructType([
                t.StructField('url', t.StringType(), True),
                t.StructField('timestamp', t.LongType(), True),
            ])
        ),
    ),
])

In [None]:
# Parse the JSON payload into a struct column named `visits`.
# `from_json` is taken from the `f` alias so this cell does not depend on the
# wildcard import of pyspark.sql.functions; `user_json` is already declared as a
# string in the read schema, so no intermediate cast/select is needed.
df1 = df.select(
    'gender',
    'age',
    'uid',
    f.from_json(f.col('user_json'), schema_visit).alias('visits'),
)
df1.show(2)

In [None]:
# Inspect the first parsed row as a Python object.
df1.take(num=1)

In [None]:
# Unwrap the parsed struct so that `visits` is the array of visit records itself.
df2 = df1.select(
    'gender',
    'age',
    'uid',
    'visits.visits',
)
df2.take(num=1)

In [None]:
# Print the column tree of df2 to confirm the layout after unwrapping `visits`.
df2.printSchema()

In [None]:
# One output row per visited URL: explode the array of `url` fields.
df3 = df2.select(
    'gender',
    'age',
    'uid',
    f.explode(df2['visits']['url']).alias('url'),
)


In [None]:
# Peek at one exploded row.
df3.show(n=1)

In [None]:
# Reduce each URL to its host part (capture group 2 of the pattern, i.e. without
# the scheme and an optional leading "www."), then keep one row per distinct
# (gender, age, uid, site) combination.
df4 = (
    df3
    .withColumn(
        'site',
        f.regexp_extract(df3.url, r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2),
    )
    .drop('url')
    .distinct()
)

In [None]:
# Peek at a few extracted sites.
df4.show(n=3)

In [None]:
# Collapse to one row per user, gathering that user's sites into a list column,
# and cache the result for the cells that follow.
df5 = (
    df4
    .groupBy("gender", "age", "uid")
    .agg(f.collect_list("site").alias("site"))
    .cache()
)

In [None]:
# Drop users whose gender is the '-' placeholder.
# The predicate refers to the column by name so it is resolved against df5 itself,
# not against a column object borrowed from the raw input frame `df`.
df5 = df5.filter(f.col('gender') != '-')

In [None]:
# Fit one string indexer per target column on the aggregated users ...
indexGender = StringIndexer(inputCol='gender', outputCol='gender_i')
indexAge = StringIndexer(inputCol='age', outputCol='age_i')

indexModelGender = indexGender.fit(df5)
indexModelAge = indexAge.fit(df5)

# ... and append both numeric label columns (`gender_i`, `age_i`).
df_i = indexModelAge.transform(indexModelGender.transform(df5))

In [None]:
# Show one indexed row for gender "M" to check the assigned label index.
df_i.filter('gender="M"').show(n=1)

In [None]:
# Feature stage: hash the list of sites into a term-frequency vector.
hashingTF = HashingTF(
    inputCol="site",
    outputCol="site_h",
    numFeatures=100000,
    binary=False,
)

# Two independent forests over the same features, one per target. Their output
# columns carry a G/A suffix so both models can live in one pipeline.
forestG = RandomForestClassifier(
    featuresCol='site_h',
    labelCol='gender_i',
    predictionCol='predictionG',
    rawPredictionCol='rawPredictionG',
    probabilityCol='probabilityG',
)
forestA = RandomForestClassifier(
    featuresCol='site_h',
    labelCol='age_i',
    predictionCol='predictionA',
    rawPredictionCol='rawPredictionA',
    probabilityCol='probabilityA',
)

# Map predicted indices back to the original string labels of the fitted indexers.
strindG = IndexToString(
    inputCol='predictionG',
    outputCol='gender_p',
    labels=indexModelGender.labels,
)
strindA = IndexToString(
    inputCol='predictionA',
    outputCol='age_p',
    labels=indexModelAge.labels,
)

pipeline = Pipeline(stages=[hashingTF, forestG, forestA, strindG, strindA])

In [None]:
# Stratified sample on the gender index (fraction 0.8 per stratum, fixed seed) for training.
train = (
    df_i
    .sampleBy('gender_i', fractions={0: 0.8, 1: 0.8}, seed=42)
    .cache()
)

# Validation set: every user whose uid did not end up in the training sample.
val = (
    df_i
    .join(train, on=['uid'], how='leftanti')
    .cache()
)

In [None]:
# Fit all pipeline stages on the training split.
pipiline_model = pipeline.fit(dataset=train)

In [None]:
# Apply the fitted pipeline to the held-out users.
valid = pipiline_model.transform(dataset=val)

In [None]:
# Look at a couple of scored rows and at the columns the pipeline appended.
valid.show(n=2)
valid.printSchema()

In [None]:
# Area under ROC for the gender model on the validation split.
# The evaluator reads the forest's raw scores (`rawPredictionG`, the column
# configured as rawPredictionCol on forestG) instead of the hard `predictionG`
# labels, so the ROC curve is built from score rankings rather than from a
# single 0/1 decision.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPredictionG",
    labelCol="gender_i",
    metricName='areaUnderROC',
)

evaluator.evaluate(valid)

In [None]:
def kill_all(session=None):
    """Stop every active streaming query on a Spark session.

    Args:
        session: Object exposing ``streams.active`` (an iterable of streaming
            queries). When omitted, the session returned by
            ``SparkSession.builder.getOrCreate()`` is used, which matches the
            original zero-argument behaviour.

    Each query is stopped and a "Stopped ..." line is printed for it. The label
    is the description of the query's first source when progress information is
    available; otherwise the query's name or id is used.
    """
    if session is None:
        session = SparkSession.builder.getOrCreate()

    for query in session.streams.active:
        # lastProgress is None until the query has reported progress, and the
        # sources list may be empty; neither case may prevent the stop below.
        try:
            label = query.lastProgress["sources"][0]["description"]
        except (TypeError, KeyError, IndexError):
            label = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=label))

In [None]:
# Kafka connection settings for the streaming part of the notebook:
# broker address, the topic events are read from, and the topic results go to.
KAFKA_BOOTSTRAP_SERVERS = 'spark-master-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_yuriy.gulynin'
KAFKA_OUTPUT_TOPIC = 'yuriy.gulynin'

In [None]:
# Outer layout of an incoming Kafka message: the user id plus the visits,
# which arrive as a JSON string and are parsed in a second step.
event_schema = (
    t.StructType()
    .add("uid", t.StringType(), True)
    .add("visits", t.StringType(), True)
)

In [None]:
# Layout of the inner `visits` JSON string: an array of {url, timestamp} records.
visit_schema = t.ArrayType(
    t.StructType([
        t.StructField("url", t.StringType(), True),
        t.StructField("timestamp", t.LongType(), True),
    ])
)

In [None]:
# Streaming source: subscribe to the input topic, reading from the earliest offsets.
kafka_read_df = (
    spark.readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVERS,
        'subscribe': KAFKA_INPUT_TOPIC,
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
)
# Decode the Kafka payload in two steps: the message value into (uid, visits-as-string),
# then the visits string into an array of records, keeping only the array of URLs.
# The struct is referenced as `event` - exactly the alias given to it - so the
# lookup does not depend on Spark's case-insensitive name resolution.
clean_df = (
    kafka_read_df
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid',
        f.from_json(f.col('event.visits'), visit_schema).alias('visits'),
    )
    .select('uid', f.col('visits.url').alias('url'))
)
# Build the per-user `site` list the same way the training data was built:
# explode URLs, extract the host, drop the URL and only THEN de-duplicate, so a
# site reached through several different URLs is counted once per user.
# (De-duplicating before dropping `url` would keep repeated sites and feed the
# hashing stage term counts the model was never trained on.)
clean_df2 = (
    clean_df
    .select('uid', f.explode(f.col('url')).alias('url'))
    .withColumn(
        'site',
        f.regexp_extract('url', r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2),
    )
    .drop('url')
    .distinct()
    .groupBy('uid')
    .agg(f.collect_list('site').alias('site'))
)

# Score the streaming users with the fitted pipeline and keep only the uid and
# the decoded string labels, renamed to the output field names.
predictions_df = (
    pipiline_model
    .transform(clean_df2)
    .select(
        'uid',
        f.col('gender_p').alias('gender'),
        f.col('age_p').alias('age'),
    )
)

# Kafka expects a single `value` column: serialise every output column as one JSON object.
kafka_out_df = predictions_df.select(
    to_json(struct(*predictions_df.columns)).alias('value')
)

# Configure the Kafka sink (complete output mode, checkpointed) ...
kafka_write_stream = (
    kafka_out_df.writeStream
    .format("kafka")
    .outputMode("complete")
    .options(**{
        "checkpointLocation": "tmp/lab04/checkpointLocation",
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
        "topic": KAFKA_OUTPUT_TOPIC,
    })
)

# ... and launch the query; the returned handle is the cell's displayed value.
kafka_write_stream.start()


In [None]:
# Stop every streaming query that is still active on the current session.
kill_all()

In [None]:
# Shut the Spark session down now that the notebook is finished with it.
spark.stop()