### Евгений Шенк

## Лабораторная работа №4. Прогнозирование пола и возрастной категории — Spark Streaming
#### Kafka streaming

In [1]:
import json
import os
import sys
import re
# Tell PySpark which interpreter and Spark installation to use, and which
# arguments to pass when the JVM gateway is launched.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages bundled with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
# Run PySpark's interactive shell bootstrap in this namespace. The script is
# read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as _shell_file:
    exec(_shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


### Spark Session

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, IDFModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors

# Build the session from a default SparkConf; getOrCreate() returns the
# already-running session when one exists.
conf = SparkConf()
spark = SparkSession.builder.config(conf=conf).appName("ESShenk_spark_session").getOrCreate()

In [3]:
# Kafka source settings: broker address, input topic, and reading only
# messages that arrive after the query starts.
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "subscribe": "input_evgeniy.shenk",
    "startingOffsets": "latest",
}
# Streaming DataFrame over the input topic.
df = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)
# df = spark.read.format("kafka").options(**read_kafka_params).load()

# df.printSchema()
# df.show()

In [4]:
@F.udf(ArrayType(StringType()))
def get_urls(col_1):
    """Split every visited URL of one message into host/path tokens.

    Args:
        col_1: JSON string with a "visits" field that is itself a
            JSON-encoded list of objects, each carrying a "url" string.

    Returns:
        A flat list holding, for every visit in order, the pieces obtained
        by dropping the scheme from the URL and splitting the rest on "/".
    """
    message = json.loads(col_1)
    # "visits" is stored as a JSON string inside the outer document, so it
    # needs a second decode.
    visits = json.loads(message["visits"])
    tokens = []
    for visit in visits:
        pieces = visit["url"].split("://")
        # Keep the existing tokenization (the second "://"-separated piece)
        # for URLs that have a scheme; fall back to the whole string for
        # scheme-less URLs instead of raising IndexError, which would fail
        # the whole streaming query.
        remainder = pieces[1] if len(pieces) > 1 else pieces[0]
        # extend() grows the list in place; rebuilding it on every visit
        # made the loop quadratic in the number of tokens.
        tokens.extend(remainder.split("/"))

    return tokens

@F.udf(StringType())
def get_uids(col_1):
    """Return the "uid" field of a JSON-encoded message."""
    return json.loads(col_1)["uid"]

In [5]:
# Class name -> numeric label.
gender_to_label = {"M": 0, "F": 1}
age_to_label = {
    "18-24": 0,
    "25-34": 1,
    "35-44": 2,
    "45-54": 3,
    ">=55": 4,
}

# Numeric label -> class name, derived by inverting the forward maps so the
# two directions cannot drift apart.
label_to_gender = {label: name for name, label in gender_to_label.items()}
label_to_age = {label: name for name, label in age_to_label.items()}

In [6]:
@F.udf(StringType())
def get_age(col_label):
    """Map a predicted label to its age-bucket name via label_to_age."""
    label = int(col_label)  # the value is converted to int before the lookup
    return label_to_age[label]

@F.udf(StringType())
def get_gender(col_label):
    """Map a predicted label to its gender code via label_to_gender."""
    label = int(col_label)  # the value is converted to int before the lookup
    return label_to_gender[label]

@F.udf(StringType())
def get_value(col_uid, col_gender, col_age):
    """Serialize one prediction as a JSON object with uid, gender and age."""
    payload = dict(uid=col_uid, gender=col_gender, age=col_age)
    return json.dumps(payload)

### Pipeline

In [7]:
# Parse each Kafka message (its value read as a string) into the user id and
# the list of URL tokens; only these two columns are kept.
df_for_preds = df.select(
    get_uids(F.col("value").cast("string")).alias("uid"),
    get_urls(F.col("value").cast("string")).alias("urls"),
)

In [8]:
# Spark's default English stop words plus the "ru" token.
eng_stopwords = [*StopWordsRemover.loadDefaultStopWords("english"), "ru"]

In [9]:
# Filter that removes the tokens in eng_stopwords from "urls" and writes the
# remaining tokens to "pure_words".
swRemover = StopWordsRemover(inputCol="urls", outputCol="pure_words", stopWords=eng_stopwords)

In [10]:
# Apply the stop-word filter to the parsed stream (adds the "pure_words" column).
pure_document = swRemover.transform(df_for_preds)

In [11]:
# Load the HashingTF stage that was saved at ./hashingTF.
# NOTE(review): presumably the same stage the models were trained with — confirm.
hashingTF = HashingTF.load('./hashingTF')

In [12]:
# Run the loaded hashing stage over the filtered documents.
# NOTE(review): its input/output column names come from the saved stage and
# are not visible here — confirm they are "pure_words" and "tf".
tf = hashingTF.transform(pure_document)

In [13]:
# Expose the "tf" column under the name "features".
# NOTE(review): presumably "features" is the input column the loaded models expect — confirm.
tf = tf.withColumnRenamed('tf', 'features')

In [14]:
# idf = IDFModel.load(path='idf')

In [15]:
# tfidf = idf.transform(tf)

In [16]:
# tfidf = tfidf.withColumnRenamed('idf', 'features')

In [17]:
# assembler = VectorAssembler(inputCols=["idf"], outputCol='features')

In [18]:
# assembler = VectorAssembler.load(path='./assembler')

In [19]:
# train_data = assembler.transform(tfidf)#.select("uid", "features")

In [20]:
# Load the two saved cross-validated models; below, model_g's prediction is
# decoded as gender and model_a's as age bucket.
model_g = CrossValidatorModel.read().load("./model_g")
model_a = CrossValidatorModel.read().load("./model_a")

In [21]:
# Score the feature vectors with the gender model.
predictions_g = model_g.transform(tf)

In [22]:
# Decode the numeric prediction into a gender code and keep only the columns
# needed downstream: the id, the decoded gender and the feature vector.
df_g = predictions_g.select(
    "uid",
    get_gender(F.col("prediction")).alias("gender"),
    "features",
)

In [23]:
# Score the same rows with the age model; df_g carries only uid, gender and
# features, so the gender model's output columns are no longer present.
predictions_a = model_a.transform(df_g)

In [24]:
# Decode the numeric prediction into an age bucket; keep id, gender and age.
df_a = predictions_a.select(
    "uid",
    "gender",
    get_age(F.col("prediction")).alias("age"),
)

In [25]:
# Pack each prediction into a single JSON string column named "value".
result_df = df_a.select(
    get_value(F.col("uid"), F.col("gender"), F.col("age")).alias("value"),
)

In [26]:
# result_df.show(n=2, truncate=False, vertical=True)

### Kafka writeStream

In [27]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query, using the description of the
    query's first source when progress information is available and the
    query's name or id otherwise.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        progress = query.lastProgress
        # lastProgress is None until the query has reported its first
        # progress update; subscripting it unconditionally raised TypeError
        # before stop() was reached, leaving this and later queries running.
        if progress and progress.get("sources"):
            desc = progress["sources"][0]["description"]
        else:
            desc = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))

In [28]:
# kill_all()

In [32]:
# Kafka sink settings: broker address and output topic.
write_kafka_params = {
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    "topic": "evgeniy.shenk",
}
# Start the streaming query that appends each prediction to the output topic,
# checkpointing under streaming/chk/chk_kafka.
(
    result_df.writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("append")
    .start()
)
# batch_df.write.format("kafka").options(**write_kafka_params).save()

<pyspark.sql.streaming.StreamingQuery at 0x7fa271a54a20>

In [33]:
# Shut down the Spark session.
# NOTE(review): presumably this also ends any streaming query still running — confirm.
spark.stop()