In [1]:
from IPython.display import display, HTML
# Stretch the notebook's main container to 95% of the browser width.
wide_container_css = HTML("<style>.container { width:95% !important; }</style>")
display(wide_container_css)
import os
import sys
# Environment PySpark reads when it starts: worker interpreter, Spark install
# location, and the arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 10 --executor-memory 5g --executor-cores 8 --driver-memory 8g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Make the Spark-bundled `pyspark` and `py4j` importable. Each entry is pushed
# to the front of sys.path, so the py4j archive (inserted last) ends up first.
spark_python_dir = os.path.join(spark_home, 'python')
py4j_archive = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
for path_entry in (spark_python_dir, py4j_archive):
    sys.path.insert(0, path_entry)

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
from pyspark.ml.feature import *
from pyspark.sql.window import Window
from pyspark.ml.classification import GBTClassifier, LogisticRegression, GBTClassificationModel, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.stat import Correlation
from pyspark.ml.recommendation import ALS
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import time
import pandas as pd
from datetime import datetime

# Build (or reuse) the session, pulling in the Kafka connector package for
# Structured Streaming.
# NOTE(review): PYSPARK_SUBMIT_ARGS set earlier in this notebook asks for
# 10 executors and an 8g driver, while this builder asks for local[2] and a
# 512m driver -- confirm which configuration is actually intended.
session_builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("spark-course")
    .config("spark.driver.memory", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
)
spark = session_builder.getOrCreate()

spark

# Inference

In [3]:
# Options for the Kafka source: broker address, topic to subscribe to, and
# where in the topic to start reading.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_roman.razumovskiy",
    "startingOffsets": "latest"
}

# Streaming DataFrame over the input topic.
kafka_reader = spark.readStream.format("kafka")
kafka_sdf = kafka_reader.options(**read_kafka_params).load()
# Batch (non-streaming) variant of the same read, left disabled:
# kafka_df = spark.read.format("kafka").options(**read_kafka_params).load()

In [4]:
def get_domain(s):
    """Extract the host part of a URL by splitting it on '/'.

    The third '/'-separated segment is taken as the domain (for
    ``scheme://host/path`` that is ``host``). If that segment is itself
    ``'http:'`` or ``'https:'``, the fifth segment is used instead
    (e.g. ``'http://http://example.com/x'`` -> ``'example.com'``).

    Args:
        s: URL string. ``None`` or any non-string value is accepted.

    Returns:
        The extracted domain, or ``''`` when ``s`` is not a string or does
        not have enough '/'-separated segments.
    """
    # Explicit guards instead of a bare `except:` so that unrelated errors
    # (and KeyboardInterrupt / SystemExit) are not silently swallowed.
    if not isinstance(s, str):
        return ''

    segments = s.split('/')
    if len(segments) < 3:
        return ''

    domain = segments[2]
    if domain in ('http:', 'https:'):
        # A second scheme sits where the host should be; the host is then
        # two segments further along.
        return segments[4] if len(segments) > 4 else ''
    return domain

# Spark UDF wrapper around get_domain; yields a string column.
get_domain_udf = F.udf(get_domain, returnType=StringType())

In [5]:
# Schemas used to parse the two JSON layers: the outer message as a
# string->string map, and its 'visits' field as an array of such maps.
json_type_1 = MapType(StringType(), StringType())
json_type_2 = ArrayType(MapType(StringType(), StringType()))

# Parsed 'visits' array of the outer message; exploded below to one row per visit.
visits_array = F.from_json(F.col('user_json').getItem('visits'), json_type_2)

clickstream = (
    kafka_sdf
    .select(F.col('value').cast('string'))
    .select(F.from_json('value', json_type_1).alias('user_json'))
    .select(
        F.col('user_json').getItem('uid').alias('uid'),
        F.explode(visits_array).alias('visit'),
    )
    .withColumn('url', F.col('visit').getItem('url'))
    .withColumn('timestamp', F.col('visit').getItem('timestamp').cast('double'))
    .withColumn('domain', get_domain_udf('url'))
    .drop('visit')
)

clickstream.printSchema()

root
 |-- uid: string (nullable = true)
 |-- url: string (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- domain: string (nullable = true)



In [6]:
# Hash each user's list of visited URLs into a 1000-dimensional term-frequency vector.
htf = HashingTF(inputCol='url_list', outputCol='features', numFeatures=1000)

# One row per uid holding every URL seen for that user.
urls_per_uid = clickstream.groupBy('uid').agg(
    F.collect_list('url').alias('url_list')
)

features_df = htf.transform(urls_per_uid)

In [7]:
# Load the previously saved models from paths relative to the working directory.
gender_model, age_model = (
    GBTClassificationModel.load('gender_model'),
    LogisticRegressionModel.load('age_model'),
)

In [14]:
# Score gender, then turn the numeric prediction into its label.
# 'features' is kept so a further model can be applied to the result.
gender_scored = gender_model.transform(features_df)
gender_label = F.col('prediction').cast('int').cast('string').alias('gender')

submit_df = (
    gender_scored
    .select('uid', 'features', gender_label)
    .replace(['0', '1'], ['F', 'M'], 'gender')
)

In [15]:
# Score age on top of the gender-scored frame and map the numeric class index
# to its age-bucket label.
# 'gender' is carried through the select: previously it was dropped here, so
# the gender prediction never reached the final output.
# NOTE: this rebinds submit_df and drops 'features', so the cell cannot be
# re-run without first re-running the gender-scoring cell.
submit_df = (
    age_model
    .transform(submit_df)
    .select(
        'uid',
        'gender',
        F.col('prediction').cast('int').cast('string').alias('age'),
    )
    .replace(['0', '1', '2', '3', '4'], ['18-24', '25-34', '35-44', '45-54', '>=55'], 'age')
)

In [19]:
# Options for the Kafka sink: broker address and output topic.
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "roman.razumovskiy"
}

# The Kafka sink requires a `value` column (string or binary). submit_df only
# has plain columns, so each row is serialised to a JSON string as `value`.
# Using submit_df.columns keeps this independent of the exact column set.
kafka_payload = submit_df.select(
    F.to_json(F.struct(*submit_df.columns)).alias('value')
)

# Left as the cell's last expression so the StreamingQuery handle is displayed.
kafka_payload.writeStream.outputMode('complete')\
    .format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .start()

<pyspark.sql.streaming.StreamingQuery at 0x7f76b8f07b70>