In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
# exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("Lab04 test run mikhail.novikov")
         .getOrCreate()
        )

In [3]:
spark

## либы

In [4]:
from pyspark.sql.functions import json_tuple, from_json, get_json_object, col, explode, expr, \
collect_set, collect_list, regexp_replace, get_json_object, to_json, struct
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, IndexToString
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vector, DenseVector
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json, pprint

## глобальные переменные

In [5]:
MODEL_PATH = '/user/mikhail.novikov/lab04_model'

In [6]:
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
# KAFKA_BOOTSTRAP_SERVER = 'spark-master-1.newprolab.com:6667'
INPUT_KAFKA_TOPIC = 'input_mikhail.novikov'
OUTPUT_KAFKA_TOPIC = 'mikhail.novikov'

# INPUT_KAFKA_TOPIC = 'input_alexander.sedykh'
# OUTPUT_KAFKA_TOPIC = 'alexander.sedykh'

## грузим данные

In [None]:
!hdfs dfs -ls /labs/slaba04/

In [None]:
!hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head

In [None]:
train_df_0 = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', header=True, inferSchema=True, sep='\t')
train_df_0.printSchema()

In [None]:
print('count', train_df_0.count())
train_df_0.show(1,truncate=False, vertical=True)

In [None]:
VisitsType = StructType([
    StructField('visits', ArrayType(
        StructType([
            StructField('url', StringType(), True),
            StructField('timestamp', LongType(), True)
            ])
        ))
    ])

train_df_flattened = (
    train_df_0
    .withColumn('visits', from_json(col('user_json'), VisitsType))
    .withColumn('visit', explode('visits.visits').alias('visit'))
    .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
    .drop('visits', 'visit', 'user_json')
)

train_df_flattened.printSchema()
train_df_flattened.show(3)

In [None]:
full_train_df = (
    train_df_flattened
    .groupBy('gender', 'age', 'uid')
    .agg(collect_list('host')
    .alias('hosts'))
    .cache()
)

full_train_df.printSchema()
full_train_df.show(3)

## моделируем

In [None]:
train_df, test_df = full_train_df.randomSplit([0.8, 0.2], seed=42)

#### задаем компоненты нашего pipeline

In [None]:
hashing_TF = HashingTF(inputCol='hosts', outputCol='rawFeatures', numFeatures=10000, binary=False)
#----------------------------------------------------------------------------------------------------------------
indexer_age = (StringIndexer(inputCol='age', 
                             outputCol='ageIndex'
                            )
               .fit(full_train_df)
              )

indexer_gender = (StringIndexer(inputCol='gender', 
                                outputCol='genderIndex'
                               )
                  .fit(full_train_df)
                 )
#----------------------------------------------------------------------------------------------------------------
rf_age = RandomForestClassifier(featuresCol = 'rawFeatures', 
                                labelCol = 'ageIndex',
                                predictionCol='age_index_prediction', 
                                rawPredictionCol='age_index_raw_prediction',
                                probabilityCol='age_probability'
                               )

rf_gender = RandomForestClassifier(featuresCol = 'rawFeatures', 
                                   labelCol = 'genderIndex',
                                   predictionCol='gender_index_prediction', 
                                   rawPredictionCol='gender_index_raw_prediction',
                                   probabilityCol='gender_probability'
                                  )
#----------------------------------------------------------------------------------------------------------------
converter_age = IndexToString(inputCol='age_index_prediction', 
                              outputCol='PredictedAge', 
                              labels=indexer_age.labels
                             )

converter_gender = IndexToString(inputCol='gender_index_prediction', 
                                 outputCol='PredictedGender', 
                                 labels=indexer_gender.labels
                                )

#### фитим pipeline

In [None]:
pipeline = (
    Pipeline(
        stages=[hashing_TF, indexer_age, indexer_gender, rf_age, rf_gender, converter_age, converter_gender]
    )
)

model = pipeline.fit(train_df)
predictions = model.transform(test_df)

predictions.printSchema()

#### селектим необходимые поля

In [None]:
predictions.select('gender', 'age', 'PredictedAge', 'PredictedGender').show(3)

#### оцениваем качество модели

In [None]:
evaluator_age = (MulticlassClassificationEvaluator(
    labelCol='ageIndex', 
    predictionCol='age_index_prediction', 
    metricName='accuracy')
                )

accuracy_age = evaluator_age.evaluate(predictions)

evaluator_gender = (
    MulticlassClassificationEvaluator(
        labelCol='genderIndex', 
        predictionCol='gender_index_prediction', 
        metricName='accuracy')
)
accuracy_gender = evaluator_gender.evaluate(predictions)

print('accuracy for age: ' + str(accuracy_age))
print('accuracy for gender: ' + str(accuracy_gender))

#### сохраняем обученную модель

In [None]:
model.write().overwrite().save(MODEL_PATH)

In [None]:
!hdfs dfs -ls /user/mikhail.novikov/lab04_model

## скоринг по батчам

In [7]:
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'earliest',
    'endingOffsets': 'latest'
}

kafka_sdf = (
    spark
    .read
    .format('kafka')
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
    .cache()
)

kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [8]:
print('count',kafka_sdf.count())
kafka_sdf.show(3)

count 10000
+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_alexander.s...|        0| 75000|2021-03-24 13:29:...|            0|
|null|[7B 22 75 69 64 2...|input_alexander.s...|        0| 75001|2021-03-24 13:29:...|            0|
|null|[7B 22 75 69 64 2...|input_alexander.s...|        0| 75002|2021-03-24 13:29:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [9]:
eventType = StructType([
    StructField('uid', StringType(), True),
    StructField('visits', StringType(), True),
])

visitType = ArrayType(
    StructType([
        StructField('url', StringType(), True),
        StructField('timestamp', LongType(), True)
    ])
)

clean_df = (
    kafka_sdf
    .select(col('value').cast('string').alias('value'))
    .select(from_json(col('value'), eventType).alias('data'))
    .select('data.*')
    .select('uid', from_json(col('visits'), visitType).alias('visits'))
    )

clean_df.show(3)

+--------------------+--------------------+
|                 uid|              visits|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[[http://www.inte...|
|bd7a6f52-45db-49b...|[[https://www.pac...|
|bd7a7fd9-ab06-42f...|[[http://www.mk.r...|
+--------------------+--------------------+
only showing top 3 rows



In [10]:
prep_df = (
    clean_df
    .withColumn('visit', explode('visits').alias('visit'))
    .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
    .drop('visits', 'visit')
    .groupBy('uid')
    .agg(collect_list('host').alias('hosts'))
)

prep_df.printSchema()
prep_df.show(3)

root
 |-- uid: string (nullable = true)
 |-- hosts: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|0108d217-e476-493...|[kvartblog.ru, kv...|
|0192cc54-559c-4c8...|[metanol.lv, meta...|
|019acd5e-be9a-4cd...|[www.russianfood....|
+--------------------+--------------------+
only showing top 3 rows



In [11]:
inf_model = PipelineModel.load(MODEL_PATH)

predictions_df = (
    inf_model.transform(prep_df)
    .select('uid', 'PredictedGender', 'PredictedAge')
    .withColumnRenamed('PredictedAge', 'age')
    .withColumnRenamed('PredictedGender', 'gender')
)

predictions_df.show(3)

+--------------------+------+-----+
|                 uid|gender|  age|
+--------------------+------+-----+
|0108d217-e476-493...|     M|25-34|
|0192cc54-559c-4c8...|     M|25-34|
|019acd5e-be9a-4cd...|     F|25-34|
+--------------------+------+-----+
only showing top 3 rows



In [13]:
kafka_out_df = predictions_df.select(to_json(struct(*predictions_df.columns)).alias('value')).limit(10)

kafka_out_df.show(3)

+--------------------+
|               value|
+--------------------+
|{"uid":"0108d217-...|
|{"uid":"0192cc54-...|
|{"uid":"019acd5e-...|
+--------------------+
only showing top 3 rows



In [15]:
write_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'topic': OUTPUT_KAFKA_TOPIC
}

(kafka_out_df.write
    .format('kafka')
    .options(**write_kafka_params)
    .save())

## скоринг в стриме

In [None]:
inf_model = PipelineModel.load(MODEL_PATH)

In [None]:
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'latest'
}

write_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'topic': OUTPUT_KAFKA_TOPIC
}

In [None]:
eventType = StructType([
    StructField('uid', StringType(), True),
    StructField('visits', StringType(), True)
])

visitType = ArrayType(
    StructType([
        StructField('url', StringType(), True),
        StructField('timestamp', LongType(), True)  
    ])
)

In [None]:
def process_batch(batch_df, batch_id):  
    clean_df = (
        batch_df
        .select(col('value').cast('string').alias('value'))
        .select(from_json(col('value'), eventType).alias('data'))
        .select('data.*')
        .select('uid', from_json(col('visits'), visitType).alias('visits'))
    )
    
    proc_df = (
        clean_df
        .withColumn('visit', explode('visits').alias('visits'))
        .withColumn('host', expr('parse_url(visit.url, "HOST")').alias('host'))
        .drop('visits', 'visit')
        .groupBy('uid')
        .agg(collect_list('host').alias('hosts'))
    )
    
    predictions_df = (
        inf_model.transform(proc_df)
        .select('uid', 'PredictedGender', 'PredictedAge')
        .withColumnRenamed('PredictedAge', 'age')
        .withColumnRenamed('PredictedGender', 'gender')
    )
    
    kafka_df = (
        predictions_df
        .select(to_json(struct(*predictions_df.columns)).alias('value'))
    )
       
    kafka_df\
     .write\
     .format('kafka')\
     .options(**write_kafka_params)\
     .mode('append')\
     .save()

In [None]:
kafka_test_df = (spark
    .readStream
    .format('kafka')
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
)

In [None]:
kafka_test_df.printSchema()

In [None]:
def create_console_sink(df):
    return df.writeStream\
            .foreachBatch(process_batch)\
            .option('checkpointLocation', 'streaming/chk/chk_kafka_mikhail_novikov_lab04')

In [None]:
sink = create_console_sink(kafka_test_df)
sq = sink.start()

In [None]:
sq.status

In [None]:
sq.lastProgress

## Чек нашей отправки

In [16]:
chk_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': OUTPUT_KAFKA_TOPIC,
    'startingOffsets': 'earliest'
}

In [17]:
chk_sdf = (
    spark
    .read
    .format('kafka')
    .options(**chk_kafka_params)
    .load()
    .cache()
)

chk_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [18]:
print('count',chk_sdf.count())
chk_sdf.show(3)

count 20
+----+--------------------+----------------+---------+------+--------------------+-------------+
| key|               value|           topic|partition|offset|           timestamp|timestampType|
+----+--------------------+----------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|alexander.sedykh|        0|     0|2021-03-24 13:41:...|            0|
|null|[7B 22 75 69 64 2...|alexander.sedykh|        0|     1|2021-03-24 13:41:...|            0|
|null|[7B 22 75 69 64 2...|alexander.sedykh|        0|     2|2021-03-24 13:41:...|            0|
+----+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 3 rows



## остановка стрима и spark-а

In [19]:
sq.stop()

NameError: name 'sq' is not defined

In [None]:
spark.stop()