In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "korneev") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
spark

In [7]:
import pyspark.sql.functions as f
import re
import json
import numpy as np
import pandas as pd
from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF, IDFModel
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.sql.functions import struct, to_json
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DoubleType

In [6]:
#load data 

In [8]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [9]:
list_fields = [StructField("gender", StringType()),
               StructField("age", StringType()),
               StructField("uid", StringType()),
               StructField("user_json", StringType())]
schema = StructType(list_fields)

In [12]:
dataset = spark.read.load("/labs/slaba04/gender_age_dataset.txt",
                          format="csv",
                          sep="\t",
                          header="true")

In [13]:
dataset.show(2)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 2 rows



In [14]:
dataset.show(1, vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender    | F                                                                                                                                                                                                                                                                             

In [15]:
def prepare_user_json(x):
    string = str(x)
    string = " ".join([urls.get("url") for urls in eval(string)['visits']])
    string = re.split('\W', string)
    #stop_words
#     stop_words = []
#     string = [x if x not in stop_words else "" for x in string]
    string = " ".join(string)
    return string

udf_prepare_user_json = f.udf(lambda x: prepare_user_json(x), StringType())

In [16]:
dataset = dataset.withColumn("prepare_user_json",
                             udf_prepare_user_json(dataset["user_json"]))

что вообще есть? 

In [17]:
dataset.withColumn('word', f.explode(f.split(f.col('prepare_user_json'), ' ')))\
       .groupBy('word')\
       .count()\
       .sort('count', ascending=False)\
       .show()

+------+--------+
|  word|   count|
+------+--------+
|      |16233277|
|  http| 5662188|
|    ru| 4181093|
|   www| 2252099|
|   com| 1295527|
|  html|  978352|
| https|  688894|
|   php|  511687|
| avito|  473979|
|     1|  430540|
|     0|  393646|
|  news|  273026|
|     2|  255385|
|   net|  228808|
|smotri|  207966|
| index|  205858|
|  mail|  201938|
|    d0|  194756|
| forum|  193086|
|  page|  175254|
+------+--------+
only showing top 20 rows



In [18]:
#добавляем стоп-слова
def prepare_user_json_sw(x):
    string = str(x)
    try:
        string = " ".join([urls.get("url") for urls in eval(string)['visits']])
    except:
        string = " ".join([urls.get("url") for urls in eval(string)])
    string = re.split('\W', string)
    #stop_words
    stop_words = ["http", "https", "ru", "www", "com", "html", "php"]
    string = [x if x not in stop_words else "" for x in string]
    string = " ".join(string)
    return re.sub(r"\s(\s)?", "\\1", string)

udf_prepare_user_json_sw = f.udf(lambda x: prepare_user_json_sw(x), StringType())

In [19]:
dataset = dataset.withColumn("prepare_user_json_sw",
                             udf_prepare_user_json_sw(dataset["user_json"]))

In [20]:
dataset.withColumn('word', f.explode(f.split(f.col('prepare_user_json_sw'), ' ')))\
       .groupBy('word')\
       .count()\
       .sort('count', ascending=False)\
       .show()

+-------------+-------+
|         word|  count|
+-------------+-------+
|             |8181942|
|        avito| 466710|
|       smotri| 207802|
|       24open|  97626|
|   loveplanet|  91148|
|        index|  86838|
|  mailrambler|  84379|
|      youtube|  80249|
|       yandex|  79789|
|           vk|  77332|
|         ebay|  54484|
|      flirchi|  51294|
|      echomsk|  50616|
|     yaplakal|  50343|
|      topface|  48957|
|         text|  48300|
|       yabadu|  45694|
|yandsearchweb|  44755|
|     bkavanga|  38911|
|           vz|  38086|
+-------------+-------+
only showing top 20 rows



In [21]:
dataset.show(5)

+------+-----+--------------------+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|   prepare_user_json|prepare_user_json_sw|
+------+-----+--------------------+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|http   zebra zoya...| zebrazoya 200028...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|http   sweetradin...| sweetrading p900...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|http   ru oriflam...|  oriflame produc...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|http   translate ...| translatetattoo ...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|https   mail ramb...| mailrambler  fol...|
+------+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [23]:
dataset.select(f.col('uid'))\
       .distinct()\
       .count()

41138

In [24]:
dataset.select(f.col('gender'))\
       .distinct().collect()

[Row(gender='F'), Row(gender='M'), Row(gender='-')]

In [25]:
dataset.select(f.col('age'))\
       .distinct().collect()

[Row(age='>=55'),
 Row(age='45-54'),
 Row(age='-'),
 Row(age='35-44'),
 Row(age='25-34'),
 Row(age='18-24')]

In [28]:
def encode_gender(x):
    mapping = {
        '-': 0,
        'F': 1,
        'M': 2,
    }
    return mapping.get(x)

def decode_gender(x):
    mapping = {
        0:'-',
        1:'F',
        2:'M',
    }
    return mapping.get(x)

udf_encode_gender = f.udf(lambda x: encode_gender(x), IntegerType())
udf_decode_gender = f.udf(lambda x: decode_gender(x), StringType())

def encode_age(x):
    mapping = {
        '-': 0,
        '45-54': 1,
        '35-44': 2,
        '25-34': 3,
        '18-24': 4,
        '>=55': 5,
    }
    return mapping.get(x)

def decode_age(x):
    mapping = {
        0:'-',
        1:'45-54',
        2:'35-44',
        3:'25-34',
        4:'18-24',
        5:'>=55',
    }
    return mapping.get(x)

udf_encode_age = f.udf(lambda x: encode_age(x), IntegerType())
udf_decode_age = f.udf(lambda x: decode_age(x), StringType())


dataset = dataset.withColumn("gender", udf_encode_gender("gender"))
dataset = dataset.withColumn("age", udf_encode_age("age"))

In [29]:
dataset.show(3)

+------+---+--------------------+--------------------+--------------------+--------------------+
|gender|age|                 uid|           user_json|   prepare_user_json|prepare_user_json_sw|
+------+---+--------------------+--------------------+--------------------+--------------------+
|     1|  4|d50192e5-c44e-4ae...|{"visits": [{"url...|http   zebra zoya...| zebrazoya 200028...|
|     2|  3|d502331d-621e-472...|{"visits": [{"url...|http   sweetradin...| sweetrading p900...|
|     1|  3|d50237ea-747e-48a...|{"visits": [{"url...|http   ru oriflam...|  oriflame produc...|
+------+---+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [30]:
def text_regexp_filter(string):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return " ".join(regex.findall(string.lower()))
udf_text_regexp_filter = f.udf(lambda x: text_regexp_filter(x), StringType())

In [31]:
dataset = dataset.withColumn("prepare_user_json_sw_filter", 
                             udf_text_regexp_filter(dataset["prepare_user_json_sw"]))

#tokenizer
tokenizer = Tokenizer(inputCol="prepare_user_json_sw_filter", outputCol="words")
dataset = tokenizer.transform(dataset)

#считаем tf
ht = HashingTF(inputCol="words", outputCol="features_ht", numFeatures=10000)
dataset = ht.transform(dataset)

#считаем tfidf
tfidf = IDF(inputCol="features_ht", outputCol="user_json_tfidf").fit(dataset)
dataset = tfidf.transform(dataset)

tokenizer.write().overwrite().save("tokenizer_model")
ht.write().overwrite().save("ht_model")
tfidf.write().overwrite().save("tfidf_model")

In [32]:
pred_test = dataset.limit(5) 
clf = RandomForestClassifier(labelCol="gender", featuresCol="user_json_tfidf") 
model = clf.fit(dataset.where('gender !=0'))
model.write().overwrite().save("gender_model")
predictions = model.transform(pred_test)
predictions = predictions.withColumn("pred_gender", udf_decode_gender("prediction"))
columns_to_drop = ['rawPrediction', 'probability', 'prediction']
predictions = predictions.drop(*columns_to_drop)

In [33]:
clf = RandomForestClassifier(labelCol="age", featuresCol="user_json_tfidf") 
model = clf.fit(dataset.where('age != 0'))
model.write().overwrite().save("age_model")
predictions = model.transform(predictions)
predictions = predictions.withColumn("pred_gender", udf_decode_age("prediction"))

In [34]:
predictions.columns

['gender',
 'age',
 'uid',
 'user_json',
 'prepare_user_json',
 'prepare_user_json_sw',
 'prepare_user_json_sw_filter',
 'words',
 'features_ht',
 'user_json_tfidf',
 'pred_gender',
 'rawPrediction',
 'probability',
 'prediction']

In [42]:
# kafka
# KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_BOOTSTRAP_SERVER = 'spark-master-2.newprolab.com:6667'
INPUT_KAFKA_TOPIC = 'input_andrey.korneev'
OUTPUT_KAFKA_TOPIC = 'andrey.korneev'

# read 
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'earliest',
    'endingOffsets': 'latest'
}

In [43]:
kafka_sdf = (
    spark
    .read
    .format('kafka')
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
    .cache()
)

In [44]:
kafka_sdf

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [45]:
print('count', kafka_sdf.count())
kafka_sdf.show(3)

count 5000
+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_andrey.korneev|        0|     0|2022-11-06 17:16:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.korneev|        0|     1|2022-11-06 17:16:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.korneev|        0|     2|2022-11-06 17:16:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
only showing top 3 rows



In [46]:
def parse_url_visits_(x):
    
    x = json.loads(x)
    x = json.loads(x['visits'])
    return '{"visits":' + str(x) + '}'
def parse_url_visits_series(x):
    return x.apply(parse_url_visits_) 
parse_url_visits = f.pandas_udf(parse_url_visits_series,"string") 

kafka_sdf = kafka_sdf.withColumn("user_json", parse_url_visits("value"))

In [47]:
def parse_uid_(x):
    x = json.loads(x)
    return x['uid']

def parse_uid_series(x):
    return x.apply(parse_uid_) 
parse_uid = f.pandas_udf(parse_uid_series,"string") 

kafka_sdf = kafka_sdf.withColumn("uid", parse_uid("value"))

In [48]:
kafka_sdf.show(1)

+----+--------------------+--------------------+---------+------+--------------------+-------------+--------------------+--------------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|           user_json|                 uid|
+----+--------------------+--------------------+---------+------+--------------------+-------------+--------------------+--------------------+
|null|[7B 22 75 69 64 2...|input_andrey.korneev|        0|     0|2022-11-06 17:16:...|            0|{"visits":[{'url'...|bd7a30e1-a25d-4cb...|
+----+--------------------+--------------------+---------+------+--------------------+-------------+--------------------+--------------------+
only showing top 1 row



In [49]:
deserialized = kafka_sdf.select(f.col("value").cast("string").alias("value"))

parsed_test = deserialized.select(f.get_json_object(f.col("value"), "$.uid").alias("uid"),
                                  f.get_json_object(f.col("value"), "$.visits").alias("visits"))

In [50]:
#стрим
read_kafka_params = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
    "subscribe": INPUT_KAFKA_TOPIC,
    "startingOffsets": "latest",
    "failOnDataLoss": "False"
}
write_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'topic': OUTPUT_KAFKA_TOPIC
}

In [51]:
tokenizer = Tokenizer.load("tokenizer_model")
hashingTF = HashingTF.load("ht_model")
idfModel = IDFModel.load("tfidf_model")
age_model = RandomForestClassificationModel.load("age_model")
gender_model = RandomForestClassificationModel.load("gender_model")

In [52]:
def process_batch(batch_df, batch_id):  
    deserialized = batch_df.select(f.col("value").cast("string").alias("value"))
    batch_df = deserialized.select(f.get_json_object(f.col("value"), "$.uid").alias("uid"),
                                   f.get_json_object(f.col("value"), "$.visits").alias("user_json"))
    #data prepare
    batch_df = batch_df.withColumn("prepare_user_json_sw",
                                     udf_prepare_user_json_sw(batch_df["user_json"]))
    batch_df = batch_df.withColumn("prepare_user_json_sw_filter", 
                                     udf_text_regexp_filter(batch_df["prepare_user_json_sw"]))

    tokens = tokenizer.transform(batch_df)
    hashing = hashingTF.transform(tokens)
    batch_df = idfModel.transform(hashing) 
    #gender predict
    batch_df = gender_model.transform(batch_df)
    batch_df = batch_df.withColumn("gender", udf_decode_gender("prediction"))
    columns_to_drop = ['rawPrediction', 'probability', 'prediction']
    batch_df = batch_df.drop(*columns_to_drop)

    #age predict
    batch_df = age_model.transform(batch_df)
    batch_df = batch_df.withColumn("age", udf_decode_age("prediction"))

    #submit
    message = batch_df.select('uid', 'gender', 'age')       
    message = message.select(to_json(struct(*message.columns)).alias("value"))
    message.write\
           .format('kafka')\
           .options(**write_kafka_params)\
           .mode('append')\
           .save()

In [53]:
kafka_sdf = (spark
    .readStream
    .format('kafka')
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
)
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [54]:
def create_console_sink(df):
    return df.writeStream\
            .foreachBatch(process_batch)\
            .option('checkpointLocation', 'streaming/chk/chk_kafka_nikita_gribov_lab04')

In [55]:
sink = create_console_sink(kafka_sdf)
sq = sink.start()

In [59]:
sq.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [60]:
sq.lastProgress

{'id': '7f05257b-28dd-4309-94d2-ee9fa90a6bc3',
 'runId': 'e8c94e08-9218-41a9-8df5-3d72b0c544ae',
 'name': None,
 'timestamp': '2022-11-06T14:20:22.655Z',
 'batchId': 7,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getEndOffset': 0, 'setOffsetRange': 2, 'triggerExecution': 2},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[input_andrey.korneev]]',
   'startOffset': {'input_andrey.korneev': {'0': 10000}},
   'endOffset': {'input_andrey.korneev': {'0': 10000}},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'ForeachBatchSink'}}