In [1]:
import os
import sys

# Cluster-specific settings: Python interpreter for the executors, Spark
# installation directory, and the arguments handed to spark-submit.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell bootstrap in the notebook namespace.
# The file is read inside a `with` block so the handle is closed, and it is
# compiled with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("test")
         .getOrCreate())

In [None]:
spark

In [None]:
!hdfs dfs -ls /labs/slaba04/

In [None]:
!hdfs dfs -head /labs/slaba04/gender_age_dataset.txt | sed -n '1,2p'
!hdfs dfs -head /labs/slaba04/gender_age_dataset.txt | xxd | sed -n '1,5p'

In [3]:
# Read the tab-separated dataset; the first line holds the column names.
dataset_path = '/labs/slaba04/gender_age_dataset.txt'
tsv_reader = spark.read.format("csv").option("header", "true").option("delimiter", "\t")
df = tsv_reader.load(dataset_path)
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [None]:
df.count(), df.filter(df.gender != '-').count()

In [4]:
df = df.filter(df.gender != '-')

# Шаг 1: Объединяем пол и возрастную группу

In [5]:
df_1 = df.withColumn('clazz', F.concat(df.gender, df.age))
df_1.printSchema()
df_1.show(2, True, True)

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)
 |-- clazz: string (nullable = true)

-RECORD 0-------------------------
 gender    | F                    
 age       | 18-24                
 uid       | d50192e5-c44e-4ae... 
 user_json | {"visits": [{"url... 
 clazz     | F18-24               
-RECORD 1-------------------------
 gender    | M                    
 age       | 25-34                
 uid       | d502331d-621e-472... 
 user_json | {"visits": [{"url... 
 clazz     | M25-34               
only showing top 2 rows



### Баланс классов

In [6]:
cl_sum = df_1.select(df_1.clazz).groupBy(df_1.clazz).count()
clazz_sum = cl_sum.collect()

In [7]:
# Map every class label to the number of rows collected for it.
class_sum = {row.clazz: row[1] for row in clazz_sum}

# Sort ascending by count and then reverse (rather than reverse=True) so that
# classes with equal counts keep exactly the same relative order as before.
ascending_by_count = sorted(class_sum.items(), key=lambda label_count: label_count[1])
sorted_classes = [label for label, _ in reversed(ascending_by_count)]
sorted_classes

['M25-34',
 'F25-34',
 'M35-44',
 'F35-44',
 'F18-24',
 'F45-54',
 'M45-54',
 'M18-24',
 'F>=55',
 'M>=55']

# Шаг 2: Парсим json

In [8]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType, ArrayType

In [9]:
schema_url = StructType([ 
    StructField("url",StringType())
  ])

In [10]:
schema_visits = StructType([
    StructField("visits", ArrayType(schema_url))
])

In [None]:
df_2 = df_1.withColumn('visits', F.from_json(df_1.user_json, schema_visits).visits)
df_2.printSchema()
df_2.show(2, True, True)

# Шаг 3: Разбиваем массив сайтов на строки урлов

In [None]:
df_3 = df_2.withColumn('site', F.explode(df_2.visits)).withColumn('url', F.col('site').url)
df_3.printSchema()
df_3.explain()

# Шаг 4: Вытаскиваем название сайта

In [None]:
df_4 = df_3.withColumn('site_name', F.regexp_extract(df_3.url, r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2))
df_4.printSchema()
df_4.explain()
df_4.select('site', 'site_name').take(1)

# Шаг 5: Плющим посещения одних и тех же сайтов в одной истории посещения

In [None]:
df_5 = df_4.select(df_4.uid, df_4.clazz, df_4.site_name).distinct().select(df_4.clazz, df_4.site_name)
df_5.printSchema()
df_5.explain()
df_5.take(1)

# Шаг 6: Группируем по сайтам и транспонируем

In [None]:
df_6 = df_5.withColumn('amount', F.lit(1)).groupBy('site_name').pivot('clazz').sum('amount').na.fill(0).cache()
df_6.printSchema()
df_6.explain()
df_6.show(5)

In [None]:
df_6.unpersist()

In [None]:
df_6.count()

In [None]:
classes = df_6.columns[1:]
print(classes)

In [11]:
classes = sorted_classes

### Вычислим дефолтный массив весов по классам

In [12]:
import numpy as np

In [None]:
def_nn = np.array([class_sum[clazz] for clazz in classes])
def_nn

In [None]:
def_arr = def_nn / np.sum(def_nn)
def_arr

In [None]:
def_class = classes[def_arr.argmax()]
def_class

# Шаг 7: Соединяем все классы в один вектор

In [13]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import DenseVector

Вектор собираем в порядке `sorted_classes`: при одинаковых весах `argmax` возвращает первый индекс, то есть наиболее часто встречающийся класс в "обучающей" выборке.

In [14]:
vecAssembler = VectorAssembler(inputCols=sorted_classes, outputCol="classes")

In [None]:
df_7 = vecAssembler.transform(df_6)
df_7.printSchema()
df_7.explain()
df_7.show(2)

# Шаг 8: Усредняем влияние между сайтами

In [15]:
from pyspark.ml.linalg import VectorUDT

In [None]:
def _normalize_class_counts(class_counts):
    """Divide a per-class count vector by the sum of its integer-cast entries."""
    counts = class_counts.toArray()
    return DenseVector(counts / np.sum(counts.astype(int)))


normalize_class_counts_udf = F.udf(_normalize_class_counts, VectorUDT())

# Add the normalised vector as 'wclasses' and cache the result.
df_8 = df_7.withColumn('wclasses', normalize_class_counts_udf(df_7.classes)).cache()
df_8.printSchema()
df_8.explain()
df_8.show(2, True, True)

# Результат подготовки данных для предсказания

In [None]:
# Collect {site_name: [weight per class]} on the driver.
# collectAsMap() builds the dict directly. The previous reduceByKey on a
# constant key sent every row through one reducer, recopied the dict on each
# merge, and raised IndexError on an empty DataFrame (this returns {} instead).
site_weight = df_8.select(df_8.site_name, df_8.wclasses)\
    .rdd\
    .map(lambda row: (row.site_name, row.wclasses.toArray().tolist()))\
    .collectAsMap()

In [None]:
!rm site_weight_data.json

In [16]:
import json

In [None]:
with open("site_weight_data.json", "w") as outfile:
    json.dump(site_weight, outfile)

In [None]:
!head site_weight_data.json | cut -c 1-150

In [17]:
with open("site_weight_data.json", "r") as infile:
    site_weight = json.load(infile)

In [18]:
site_w_br = spark.sparkContext.broadcast(site_weight)

# Kafka

In [101]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_dmitriy.kamaev",
    "startingOffsets": "latest",
    "failOnDataLoss": 'False'
}
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [38]:
kafka_sdf, rates

(DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int],
 DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int])

In [59]:
kafka_sdf = rates

# Кафка Шаг 1: value -> json: str

In [39]:
kafkaSchema = StructType([
    StructField("uid", StringType()),
    StructField("visits", StringType())
])

In [102]:
sdf_1 = kafka_sdf.withColumn('json', F.from_json(kafka_sdf.value.cast('string'), kafkaSchema))

In [None]:
sdf_1.printSchema()

# Кафка Шаг 2: Достаем uid и посещения

In [103]:
sdf_2 = sdf_1.select(sdf_1.json.uid.alias('uid'), F.from_json(sdf_1.json.visits, ArrayType(schema_url)).url.alias('urls'))

In [62]:
sdf_2.printSchema()

root
 |-- uid: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: string (containsNull = true)



Ниже шаги с пометкой (Deprecated) относятся к первому варианту решения, построенному на агрегациях. Как выяснилось позже, в стриминговых датафреймах нельзя работать в режиме append после группировки и нельзя использовать `F.PandasUDFType.GROUPED_AGG`.

# (Deprecated) Кафка Шаг 3: Explode, чтобы обработать все сайты регуляркой

In [42]:
sdf_3 = sdf_2.select(sdf_2.uid, F.explode(sdf_2.urls).alias('url'))
sdf_3.printSchema()

root
 |-- uid: string (nullable = true)
 |-- url: string (nullable = true)



# Кафка Шаг 3: Вытаскиваем имена сайтов из урлов и делаем distinct (set)

In [73]:
import re

# Same pattern as the batch pipeline's F.regexp_extract call; group 2 is the
# host name without a leading "www.". Compiled once instead of per URL.
_SITE_NAME_RE = re.compile(r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?')


def _site_names_from_urls(urls):
    """Return the distinct site names found in one user's list of URLs.

    A missing list yields an empty result and null URLs are skipped.
    A URL that does not match the pattern contributes '' -- the value
    F.regexp_extract produces for a non-match in the batch pipeline.
    """
    # `urls` may arrive as a numpy array, so test for None explicitly
    # instead of relying on truthiness.
    if urls is None:
        return []
    site_names = set()
    for url in urls:
        if url is None:
            continue
        match = _SITE_NAME_RE.search(url)
        site_names.add(match.group(2) if match else '')
    return list(site_names)


@F.pandas_udf(ArrayType(StringType()))
def extract_site_name(urls_sr):
    """Pandas UDF: map each array of URLs to an array of distinct site names.

    Unlike a bare ``re.search(...).group(2)``, a malformed or null URL does not
    raise, so one bad record cannot fail the whole streaming query.
    """
    return urls_sr.apply(_site_names_from_urls)

In [104]:
sdf_3 = sdf_2.withColumn('site_names', extract_site_name(sdf_2.urls))
sdf_3.printSchema()

root
 |-- uid: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- site_names: array (nullable = true)
 |    |-- element: string (containsNull = true)



# (Deprecated) Кафка Шаг 4: Выдираем имя сайта с помощью регулярки

In [43]:
sdf_4 = sdf_3.withColumn('site_name', F.regexp_extract(sdf_3.url, r'\w+:\/\/(www\.)?(([\w-]+)(\.[\w-]+)*)\/?', 2))
sdf_4.printSchema()

root
 |-- uid: string (nullable = true)
 |-- url: string (nullable = true)
 |-- site_name: string (nullable = true)



# Кафка Шаг 4: Считаем веса по посещенным сайтам

In [86]:
@F.pandas_udf(StringType())
def get_class_from_visited_sites(site_names_sr):
    """Pandas UDF: predict a class label from the sites a user visited.

    The broadcast per-site weight vectors of all known sites are summed and
    the label at the argmax position of ``sorted_classes`` is returned.
    Unknown sites contribute nothing. When no known site is present (null or
    empty input, or only unseen sites) the first entry of ``sorted_classes``
    is returned, which is what argmax over an all-zero vector gave before.
    """
    site_weights = site_w_br.value
    default_class = sorted_classes[0]

    def _get_class_from_visited_sites(site_names):
        # `site_names` may arrive as a numpy array, so test for None
        # explicitly instead of relying on truthiness.
        if site_names is None:
            return default_class
        # Skipping unknown sites is equivalent to adding all-zero rows.
        known_weights = [site_weights[site_name] for site_name in site_names if site_name in site_weights]
        if not known_weights:
            return default_class
        # argmax returns the first maximum, so ties resolve to the earliest
        # entry of sorted_classes.
        return sorted_classes[np.array(known_weights).sum(axis=0).argmax()]
    return site_names_sr.apply(_get_class_from_visited_sites)

In [105]:
sdf_4 = sdf_3.withColumn('clazz', get_class_from_visited_sites(sdf_3.site_names))
sdf_4.printSchema()

root
 |-- uid: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- site_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clazz: string (nullable = true)



# (Deprecated) Кафка Шаг 5: Ставим в соответствие сайту массив весов по классам
Если в обучающей выборке не было сайта, который пришел на прогноз, заменяем неизвестный сайт на нулевой массив

In [44]:
sdf_5 = sdf_4.withColumn('weights', F.pandas_udf(lambda sr: 
                                                 sr.apply(lambda site_name: 
                                                          site_w_br.value[site_name] if site_name in site_w_br.value else [0]*len(classes)),
                                                 ArrayType(FloatType())
                                                )(sdf_4.site_name))
sdf_5.printSchema()

root
 |-- uid: string (nullable = true)
 |-- url: string (nullable = true)
 |-- site_name: string (nullable = true)
 |-- weights: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [None]:
sdf_5.take(1)

# Кафка Шаг 5: Вытаскиваем из класса пол и возраст

In [106]:
sdf_5 = sdf_4.withColumn('gender', sdf_4.clazz.substr(1, 1)).withColumn('age', sdf_4.clazz.substr(2, 5))
sdf_5.printSchema()

root
 |-- uid: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- site_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clazz: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)



# (Deprecated) Кафка Шаг 6: Складываем вектора по uid

In [45]:
sdf_6 = sdf_5.groupBy(sdf_5.uid).agg(F.pandas_udf(lambda sr:
                                                  sorted_classes[sr.map(lambda arr: np.array(arr)).sum().argmax()],
                                                  StringType(),
                                                  F.PandasUDFType.GROUPED_AGG
                                                 )(sdf_5.weights).alias('clazz'))
sdf_6.printSchema()

root
 |-- uid: string (nullable = true)
 |-- clazz: string (nullable = true)



In [None]:
sdf_6.select('clazz').groupBy('clazz').count().show()

# (Deprecated) Кафка Шаг 7: Вытаскиваем из класса пол и возраст

In [46]:
sdf_7 = sdf_6.withColumn('gender', sdf_6.clazz.substr(1, 1)).withColumn('age', sdf_6.clazz.substr(2, 5))
sdf_7.printSchema()

root
 |-- uid: string (nullable = true)
 |-- clazz: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)



# Кафка запись результата

In [117]:
# Connection settings for the Kafka sink that receives the predictions.
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "dmitriy.kamaev"
}
# Serialise (uid, gender, age) into one JSON string column named "value".
# F.col is used instead of a bare `col`: that name is only imported by a later
# cell, so on a fresh kernel this cell would raise NameError.
# The writer is only configured here; the query is started separately.
write_sinc = sdf_5.select(sdf_5.uid, sdf_5.gender, sdf_5.age)\
    .select(F.to_json(F.struct(F.col("*"))).alias("value"))\
    .writeStream\
    .format("kafka")\
    .options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append")

In [131]:
sq = write_sinc.start()

In [133]:
sq.isActive, sq.status

(True,
 {'message': 'Waiting for data to arrive',
  'isDataAvailable': False,
  'isTriggerActive': False})

In [128]:
sq.stop()

In [129]:
sq.isActive, sq.status

(False,
 {'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False})

In [None]:
sq.exception()

# Отладка через паркет

In [20]:
def create_parquet_sink(df, file_name):
    """Configure (without starting) a streaming parquet writer for ``df``.

    The frame is repartitioned to a single partition and written in parquet
    format to ``file_name``, with its checkpoint under
    ``tmp/chk_sg/<file_name>`` and a 10-second processing-time trigger.

    Returns the configured writer; the caller starts it with ``.start()``.
    """
    single_partition = df.repartition(1)
    writer = single_partition.writeStream.format("parquet")
    writer = writer.option("path", "{f}".format(f=file_name))
    writer = writer.option("checkpointLocation", "tmp/chk_sg/{f}".format(f=file_name))
    return writer.trigger(processingTime="10 seconds")

In [34]:
!hdfs dfs -ls tmp/chk_sg/ss_01.parquet

Found 4 items
drwxr-xr-x   - dmitriy.kamaev dmitriy.kamaev          0 2022-10-27 18:51 tmp/chk_sg/ss_01.parquet/commits
-rw-r--r--   3 dmitriy.kamaev dmitriy.kamaev         45 2022-10-27 18:50 tmp/chk_sg/ss_01.parquet/metadata
drwxr-xr-x   - dmitriy.kamaev dmitriy.kamaev          0 2022-10-27 18:51 tmp/chk_sg/ss_01.parquet/offsets
drwxr-xr-x   - dmitriy.kamaev dmitriy.kamaev          0 2022-10-27 18:50 tmp/chk_sg/ss_01.parquet/sources


In [26]:
!hdfs dfs -rm -r tmp/chk_sg/ss_01.parquet

22/10/27 18:25:38 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/dmitriy.kamaev/tmp/chk_sg/ss_01.parquet' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/dmitriy.kamaev/.Trash/Current/user/dmitriy.kamaev/tmp/chk_sg/ss_01.parquet


In [27]:
sink = create_parquet_sink(kafka_sdf, "ss_01.parquet")
sink

<pyspark.sql.streaming.DataStreamWriter at 0x7fa974e31390>

In [28]:
sq = sink.start()
sq

<pyspark.sql.streaming.StreamingQuery at 0x7fa974e318d0>

In [35]:
sq.isActive, sq.exception()

(False, None)

In [36]:
sq.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [33]:
sq.stop()

In [None]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query, labelled with the description of
    its first source. ``lastProgress`` is None until a query has reported
    progress, so in that case the query name (or id) is used as the label
    instead of raising before the query is stopped.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        progress = query.lastProgress
        sources = progress.get("sources") if progress else None
        if sources:
            desc = sources[0]["description"]
        else:
            desc = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))

In [None]:
kill_all()

In [None]:
!hadoop fs -ls ss_01.parquet

In [37]:
from pyspark.sql.functions import max, col

rates = spark.read.parquet("ss_01.parquet")
print(rates.count())
rates.printSchema()
rates.show(2, True, True)

5000
root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

-RECORD 0-----------------------------
 key           | null                 
 value         | [7B 22 75 69 64 2... 
 topic         | input_dmitriy.kamaev 
 partition     | 0                    
 offset        | 5354                 
 timestamp     | 2022-10-26 20:48:... 
 timestampType | 0                    
-RECORD 1-----------------------------
 key           | null                 
 value         | [7B 22 75 69 64 2... 
 topic         | input_dmitriy.kamaev 
 partition     | 0                    
 offset        | 5355                 
 timestamp     | 2022-10-26 20:48:... 
 timestampType | 0                    
only showing top 2 rows



In [58]:
rates

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [134]:
spark.stop()