# Лабораторная работа №4. Константин Кобылкин. Вариант 1.

In [1]:
import os
import sys
# Configure the interpreter, Spark installation and submit arguments that
# PySpark reads from the environment.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend Spark's Python sources to the import path; the py4j archive ends up
# first, followed by the `python` directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Set the application name on the configuration, then obtain (or reuse) a
# session built from it.
conf = SparkConf().setAppName("Konstantin Kobylkin lab 4 app")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.ml import Pipeline
# StringIndexer is needed by the label-encoding cell; the original line had an
# empty slot (", ,") in its place, which is a syntax error.
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.param.shared import HasOutputCol, HasInputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml import Transformer
from pyspark import keyword_only

# Загрузка данных обучающей выборки

In [4]:
path = '/labs/slaba04/gender_age_dataset.txt'

# Every column of the training file is read as a string.
schema = t.StructType(fields=[
    t.StructField(column_name, t.StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])

# Tab-separated file with a header row.
train_data = spark.read.schema(schema).csv(path, header=True, sep='\t')

## Схема для парсинга данных по визитам

In [9]:
# Schema used to parse the `user_json` column: an object with a `visits` array
# whose elements hold a `url` string and a `timestamp` long.
visits_schema = t.StructType().add(
    'visits',
    t.ArrayType(
        t.StructType()
        .add('url', t.StringType(), True)
        .add('timestamp', t.LongType(), True)
    )
)

## Фильтрация и разбор JSON с визитами в обучающей выборке

In [10]:
# Drop rows whose age or gender is the '-' placeholder, parse the visits JSON
# and keep only the array of visited URLs.
train_data = (
    train_data
    .filter(f.col('age') != '-')
    .filter(f.col('gender') != '-')
    .withColumn('visits', f.from_json(f.col('user_json'), visits_schema))
    .withColumn('url', f.col('visits.visits.url'))
    .drop('visits', 'user_json')
)

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)



## LabelEncoding для пола и возраста

In [11]:
# Gender: string label -> numeric index in `gender_i`.
indexerG = StringIndexer(inputCol='gender', outputCol='gender_i')
indexModelG = indexerG.fit(train_data)

# Age: string label -> numeric index in `age_i`.
indexerA = StringIndexer(inputCol='age', outputCol='age_i')
indexModelA = indexerA.fit(train_data)

## Предобработка URL

In [12]:
class ParseURLTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Turn an array column of URLs into an array column of lower-cased hosts.

    The input array is exploded, each URL is reduced to its host with the
    Spark SQL `parse_url(..., "HOST")` expression and lower-cased, and the
    hosts are collected back into a list per group of all remaining columns.
    The input column is dropped from the result.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer, setting only the params that were given."""
        super(ParseURLTransformer, self).__init__()
        for setter, value in ((self.setInputCol, inputCol),
                              (self.setOutputCol, outputCol)):
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        """Return `dataset` with the input column replaced by the host list."""
        source_col = self.getInputCol()
        host_col = 'url_inter'  # temporary column holding a single host

        exploded = dataset.withColumn(host_col, f.explode(source_col))
        hosts = exploded.withColumn(
            host_col,
            f.lower(f.expr('parse_url(url_inter, "HOST")')),
        ).drop(source_col)

        # Group by every column except the temporary one, so the hosts are
        # gathered back into one list per original combination of columns.
        group_cols = [name for name in hosts.columns if name != host_col]
        collected = f.collect_list(host_col).alias(self.getOutputCol())
        return hosts.groupBy(group_cols).agg(collected)

In [13]:
# URL pre-processing stage: `url` (raw URLs) -> `url_parsed` (hosts).
urlpt = ParseURLTransformer(
    inputCol='url',
    outputCol='url_parsed'
)

## Выстраивание pipeline

In [14]:
# Hash the parsed hosts into a 130000-dimensional term-frequency vector
# (binary=False keeps counts rather than presence flags).
hasher = HashingTF(
    inputCol="url_parsed",
    outputCol="url_freq",
    numFeatures=130000,
    binary=False
)

In [15]:
# Two independent classifiers over the same `url_freq` features. Each one gets
# its own output column names so both can live in a single pipeline.
forestG = RandomForestClassifier(
    featuresCol='url_freq',
    labelCol='gender_i',
    rawPredictionCol='rawPredictionG',
    probabilityCol='probabilityG',
    predictionCol='predictionG'
)
forestA = RandomForestClassifier(
    featuresCol='url_freq',
    labelCol='age_i',
    rawPredictionCol='rawPredictionA',
    probabilityCol='probabilityA',
    predictionCol='predictionA'
)

### Обратное преобразование индексов в метки возраста и пола

In [16]:
# Map predicted indices back to the original string labels, using the label
# lists learned by the fitted indexers.
stringerG = IndexToString(
    labels=indexModelG.labels,
    inputCol='predictionG',
    outputCol='gender_s'
)
stringerA = IndexToString(
    labels=indexModelA.labels,
    inputCol='predictionA',
    outputCol='age_s'
)

In [17]:
# Stage order: URL parsing -> hashing -> gender model -> age model ->
# index-to-label decoding for both predictions.
pipeline = Pipeline(stages=[urlpt, hasher, forestG, forestA, stringerG, stringerA])

In [18]:
# Add the numeric label columns: gender index first, then age index.
df_visits = indexModelA.transform(indexModelG.transform(train_data))

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- gender_i: double (nullable = false)
 |-- age_i: double (nullable = false)



# Обучение модели

In [19]:
# Fit every pipeline stage on the training data that carries the indexed
# gender/age label columns.
pipeline_model = pipeline.fit(df_visits)

# Описание параметров входного потока

In [27]:
# Kafka broker address, the topic events are read from, and the topic the
# predictions are written to.
KAFKA_BOOTSTRAP_SERVER = 'spark-master-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_konstantin.kobylkin'
KAFKA_OUTPUT_TOPIC = 'konstantin.kobylkin'

In [28]:
# Streaming source: subscribe to the input topic from its earliest offsets.
kafka_stream = (
    spark.readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
        'subscribe': KAFKA_INPUT_TOPIC,
        'startingOffsets': 'earliest',
        'failOnDataLoss': 'False',
    })
    .load()
)

In [29]:
# Outer message: `uid` plus `visits`, where `visits` is itself a JSON string.
event_schema = (
    t.StructType()
    .add('uid', t.StringType(), True)
    .add('visits', t.StringType(), True)
)

# Inner `visits` payload: an array of {url, timestamp} records.
visit_schema2 = t.ArrayType(
    t.StructType()
    .add('url', t.StringType(), True)
    .add('timestamp', t.LongType(), True)
)

In [30]:
# Decode the Kafka value in two passes: first the outer event, then the nested
# `visits` JSON string. Only `uid` and the array of URLs are kept.
parsed_sdf = (
    kafka_stream
    .select(f.col('value').cast('string').alias('value'))
    .select(f.from_json(f.col('value'), event_schema).alias('event'))
    .select(
        'event.uid',
        f.from_json(f.col('event.visits'), visit_schema2).alias('visits'),
    )
    .withColumn('url', f.col('visits.url'))
    .drop('visits')
)

root
 |-- uid: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)



# Предсказание

In [31]:
# Run the fitted pipeline over the parsed Kafka stream.
predictions_df_test = pipeline_model.transform(parsed_sdf)

root
 |-- uid: string (nullable = true)
 |-- url_parsed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- url_freq: vector (nullable = true)
 |-- rawPredictionG: vector (nullable = true)
 |-- probabilityG: vector (nullable = true)
 |-- predictionG: double (nullable = false)
 |-- rawPredictionA: vector (nullable = true)
 |-- probabilityA: vector (nullable = true)
 |-- predictionA: double (nullable = false)
 |-- gender_s: string (nullable = true)
 |-- age_s: string (nullable = true)



In [32]:
# Keep only the user id and the decoded predictions, renamed to the output
# field names `gender` and `age`.
predictions_df = predictions_df_test.selectExpr(
    'uid',
    'gender_s AS gender',
    'age_s AS age'
)

root
 |-- uid: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)



# Описание параметров выходного потока с предсказаниями

In [26]:
# Serialise every prediction row to a JSON string in the `value` column and
# configure the Kafka sink.
# NOTE(review): "complete" output mode is presumably accepted only because the
# URL transformer's groupBy makes this an aggregating query - confirm.
kafka_write_stream = (
    predictions_df
    .select(f.to_json(f.struct(*predictions_df.columns)).alias('value'))
    .writeStream
    .outputMode("complete")
    .format("kafka")
    .options(**{
        "checkpointLocation": "checkpoints/checkpoints_lab04",
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
        "topic": KAFKA_OUTPUT_TOPIC,
    })
)
kafka_write_stream.start()

<pyspark.sql.streaming.StreamingQuery at 0x7ffa9c069a90>

In [33]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query. The line shows the description
    of the query's first source when a progress report exists; otherwise it
    falls back to the query's name or id.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        # `lastProgress` is None until the query has reported progress at
        # least once; subscripting it unguarded would raise before `stop()`
        # and leave the stream running.
        progress = query.lastProgress
        if progress and progress.get("sources"):
            desc = progress["sources"][0]["description"]
        else:
            desc = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))

In [34]:
# Stop all active streaming queries before shutting Spark down.
kill_all()

Stopped KafkaV2[Subscribe[input_konstantin.kobylkin]]


In [35]:
# Stop the Spark session created at the top of the notebook.
spark.stop()