In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 --executor-memory 1g --executor-cores 1 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master("local[2]") \
                    .appName("Lab04_by_sand") \
                    .config("spark.driver.memory", "512m") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
                    .getOrCreate()

spark

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, IDF, CountVectorizer
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import struct
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, RegexTokenizer, MinMaxScaler, Normalizer
from pyspark.ml.feature import VectorAssembler
import re
import json

In [4]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [5]:
!hdfs dfs -head /labs/slaba04/gender_age_dataset.txt

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
M	25-34	d502331d-621e-4721-ada2-5d30b2c3801f	{"visits": [{"url": "http://sweetrading.ru/?p=900", "timestamp": 1419717886224}, {"url": "http://sweetrading.ru/?p=884", "timestamp": 1419717884437}, {"url": "http://sweetrading.ru

In [6]:
USERS_FILE = "/labs/slaba04/gender_age_dataset.txt"

In [7]:
users = spark.read.csv(USERS_FILE, sep='\t', header=True)
users.show(2, vertical=True)

-RECORD 0-------------------------
 gender    | F                    
 age       | 18-24                
 uid       | d50192e5-c44e-4ae... 
 user_json | {"visits": [{"url... 
-RECORD 1-------------------------
 gender    | M                    
 age       | 25-34                
 uid       | d502331d-621e-472... 
 user_json | {"visits": [{"url... 
only showing top 2 rows



In [8]:
users.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [9]:
@f.pandas_udf(ArrayType(StringType()))
def extract_hosts(pdf):
    def __parse_url(url):
        regexp = '(\w+)://([\w\d\-\.]*)/*([\w\-/\.]*)[\?]*(.*)'
        found = re.findall(regexp, str(url))
        if len(found) > 0:
            return re.findall(regexp, url)[0]
        else:
            return ('', '', '', '')
    def __parse_json(x):
        d = json.loads(x)
        hosts = [__parse_url(x['url'])[1] for x in d]
        return hosts
    return pdf.apply(lambda x: __parse_json(x))

visits = users.withColumn("hosts", extract_hosts(f.get_json_object(f.col("user_json"), "$.visits"))).drop('user_json')
visits.show(1, vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------
 gender | F                                                                               
 age    | 18-24                                                                           
 uid    | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777                                            
 hosts  | [zebra-zoya.ru, news.yandex.ru, www.sotovik.ru, news.yandex.ru, www.sotovik.ru] 
only showing top 1 row



In [10]:
visits = visits.withColumn("host_counts", f.size(f.col("hosts")))
visits.show(1, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------
 gender      | F                                                                               
 age         | 18-24                                                                           
 uid         | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777                                            
 hosts       | [zebra-zoya.ru, news.yandex.ru, www.sotovik.ru, news.yandex.ru, www.sotovik.ru] 
 host_counts | 5                                                                               
only showing top 1 row



### Prepare targets

In [11]:
genders = visits.groupBy("gender").count().collect()
ages = visits.groupBy("age").count().collect()

In [12]:
all_targets = [(row1.gender, row2.age) for row1 in genders for row2 in ages]
target_map = {i: t for i, t in zip(range(len(genders) * len(ages)), all_targets)}
target_map_rev = {t: i for i, t in zip(range(len(genders) * len(ages)), all_targets)}
target_map, target_map_rev

({0: ('F', '>=55'),
  1: ('F', '45-54'),
  2: ('F', '-'),
  3: ('F', '35-44'),
  4: ('F', '25-34'),
  5: ('F', '18-24'),
  6: ('M', '>=55'),
  7: ('M', '45-54'),
  8: ('M', '-'),
  9: ('M', '35-44'),
  10: ('M', '25-34'),
  11: ('M', '18-24'),
  12: ('-', '>=55'),
  13: ('-', '45-54'),
  14: ('-', '-'),
  15: ('-', '35-44'),
  16: ('-', '25-34'),
  17: ('-', '18-24')},
 {('F', '>=55'): 0,
  ('F', '45-54'): 1,
  ('F', '-'): 2,
  ('F', '35-44'): 3,
  ('F', '25-34'): 4,
  ('F', '18-24'): 5,
  ('M', '>=55'): 6,
  ('M', '45-54'): 7,
  ('M', '-'): 8,
  ('M', '35-44'): 9,
  ('M', '25-34'): 10,
  ('M', '18-24'): 11,
  ('-', '>=55'): 12,
  ('-', '45-54'): 13,
  ('-', '-'): 14,
  ('-', '35-44'): 15,
  ('-', '25-34'): 16,
  ('-', '18-24'): 17})

In [13]:
broadcasted_target_map = spark.sparkContext.broadcast(target_map)
broadcasted_target_map_rev = spark.sparkContext.broadcast(target_map_rev)

In [14]:
@f.udf(IntegerType())
def map_targets(gender, age):
    target_map_rev = broadcasted_target_map_rev.value
    return target_map_rev[(gender, age)]
visits = visits.withColumn("target", map_targets(f.col('gender'), f.col('age')))
visits.show(1, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------
 gender      | F                                                                               
 age         | 18-24                                                                           
 uid         | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777                                            
 hosts       | [zebra-zoya.ru, news.yandex.ru, www.sotovik.ru, news.yandex.ru, www.sotovik.ru] 
 host_counts | 5                                                                               
 target      | 5                                                                               
only showing top 1 row



### Pipeline

In [15]:
num_features = [
    'host_counts'
]
assembler = VectorAssembler(inputCols=num_features, outputCol="num_features")
scaler = StandardScaler(inputCol=assembler.getOutputCol(), outputCol="num_features_scaled")
num_pipeline = Pipeline(stages=[assembler, scaler])

In [16]:
host_encoder = HashingTF(inputCol="hosts", outputCol="features")
host_pipeline = Pipeline(stages=[host_encoder])

In [17]:
all_assemble = VectorAssembler(inputCols=[host_pipeline.getStages()[-1].getOutputCol()], outputCol="features")

In [18]:
transformer = Pipeline(stages=[num_pipeline, host_pipeline, all_assemble])

### Train test

In [19]:
train, valid = visits.randomSplit([0.8, 0.2], seed=42)

### Base model (LR)

In [20]:
lr = LogisticRegression(featuresCol=transformer.getStages()[-1].getOutputCol(), labelCol="target", maxIter=15)
estimator = Pipeline(stages=[host_encoder, lr])
model = estimator.fit(train)

In [21]:
prediction = model.transform(valid)

In [22]:
correct_predictions = prediction.select("target", f.col("prediction").cast("int")) \
    .filter("target == prediction").count()
correct_predictions

1425

In [23]:
all_predictions = prediction.count()
all_predictions

8118

In [24]:
print("Accuracy is {}".format(correct_predictions / all_predictions))

Accuracy is 0.17553584626755359


### RandomForest

In [25]:
from pyspark.ml.classification import RandomForestClassifier

In [26]:
rf = RandomForestClassifier(featuresCol=transformer.getStages()[-1].getOutputCol(), labelCol="target", 
                  numTrees=10, maxDepth=10, seed=42)
estimator = Pipeline(stages=[host_encoder, rf])
model = estimator.fit(train)

In [27]:
prediction = model.transform(valid)

In [28]:
correct_predictions = prediction.select("target", f.col("prediction").cast("int")) \
    .filter("target == prediction").count()
correct_predictions

1716

In [29]:
all_predictions = prediction.count()
all_predictions

8118

In [30]:
print("Accuracy is {}".format(correct_predictions / all_predictions))

Accuracy is 0.21138211382113822


### Kafka batch

In [31]:
KAFKA_BOOTSTRAP_SERVERS = 'spark-master-1.newprolab.com:6667'
INPUT_TOPIC = "input_andrey.sorokin"
OUTPUT_TOPIC = "andrey.sorokin"

In [32]:
read_kafka_params = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": INPUT_TOPIC,
    "startingOffsets": "earliest"
}
kafka_sdf = spark.read.format("kafka").options(**read_kafka_params).option("failOnDataLoss", 'False').load()

In [33]:
kafka_sdf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     0|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     1|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     2|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     3|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     4|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     5|2022-10-29 18:22:...|            0|
|null|[7B 22 75 69 64 2...|input_andrey.sorokin|        0|     6|2022-10-29 18:22:...|     

In [34]:
test = kafka_sdf.select(f.get_json_object(f.col("value").cast("string"), "$.uid").alias("uid"),
                 extract_hosts(f.get_json_object(f.col("value").cast("string"), "$.visits")).alias("hosts"))
test.show()

+--------------------+--------------------+
|                 uid|               hosts|
+--------------------+--------------------+
|bd7a30e1-a25d-4cb...|[www.interfax.ru,...|
|bd7a6f52-45db-49b...|[www.packagetrack...|
|bd7a7fd9-ab06-42f...|[www.mk.ru, www.m...|
|bd7c5d7a-0def-41d...|[www.24open.ru, w...|
|bd7e54a2-0215-45c...|[www.dns-shop.ru,...|
|bd7e9797-4cdb-46e...|      [news.meta.ua]|
|bd7e9ec7-fb67-45e...|[dynamobryansk.fo...|
|bd8056df-cc25-4b6...|[www.2mm.ru, www....|
|bd818690-73d2-445...|[www.lacywear.ru,...|
|bd81e006-f059-4cd...|       [nn.domru.ru]|
|bd81e64a-bfa3-414...|[cache.betweendig...|
|bd82fee4-afb3-408...|[apostrophe.com.u...|
|bd83400b-abe2-42f...|[index.ru, com.ad...|
|bd843c8c-dbba-4ec...|[katushka.net, ka...|
|bd86d250-a6ee-41f...|[www.sq.com.ua, n...|
|bd889738-93b2-402...|[www.proforientat...|
|bd88fac5-3211-439...|[yourlust.com, tu...|
|bd89a20f-a7af-462...|[yandex.ru, prime...|
|bd8c2ee4-9c45-46d...|[video-dom2.ru, v...|
|bd8f08a1-d845-4cb...|[live.russ

In [35]:
pred = model.transform(test)
pred.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 uid|               hosts|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|bd7a30e1-a25d-4cb...|[www.interfax.ru,...|(262144,[1386,156...|[0.29451470104907...|[0.02945147010490...|      14.0|
|bd7a6f52-45db-49b...|[www.packagetrack...|(262144,[1386,774...|[0.19073160619967...|[0.01907316061996...|      10.0|
|bd7a7fd9-ab06-42f...|[www.mk.ru, www.m...|(262144,[179661,2...|[0.21016184237876...|[0.02101618423787...|      10.0|
|bd7c5d7a-0def-41d...|[www.24open.ru, w...|(262144,[9670,135...|[0.20703404009113...|[0.02070340400911...|      10.0|
|bd7e54a2-0215-45c...|[www.dns-shop.ru,...|(262144,[4368,372...|[0.19284246114706...|[0.01928424611470...|      10.0|
|bd7e9797-4cdb-46e...|      [news.meta.ua]|(262144,[1423

In [36]:
@f.udf(StringType())
def unmap_gender(target):
    target_map = broadcasted_target_map.value
    return target_map[target][0]

@f.udf(StringType())
def unmap_age(target):
    target_map = broadcasted_target_map.value
    return target_map[target][1]

out_df = pred.select(f.to_json(f.struct("uid", 
        unmap_gender(f.col("prediction")).alias("gender"), 
        unmap_age(f.col("prediction")).alias("age"))).alias("value"))

In [37]:
out_df.show(10, truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|{"uid":"bd7a30e1-a25d-4cbf-a03f-61748cbe540e","gender":"-","age":"-"}    |
|{"uid":"bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd","gender":"M","age":"25-34"}|
|{"uid":"bd7a7fd9-ab06-42f5-bf0f-1cbb0463004c","gender":"M","age":"25-34"}|
|{"uid":"bd7c5d7a-0def-41d1-895f-fdb96c56c2d4","gender":"M","age":"25-34"}|
|{"uid":"bd7e54a2-0215-45cb-a869-9efebf250e38","gender":"M","age":"25-34"}|
|{"uid":"bd7e9797-4cdb-46e1-a540-f3ea010605ad","gender":"M","age":"25-34"}|
|{"uid":"bd7e9ec7-fb67-45eb-8ad3-209d01d15ae6","gender":"M","age":"25-34"}|
|{"uid":"bd8056df-cc25-4b63-bc12-a46f888baa49","gender":"M","age":"25-34"}|
|{"uid":"bd818690-73d2-445d-be5d-5c8f748dbb19","gender":"M","age":"25-34"}|
|{"uid":"bd81e006-f059-4cdd-b716-3467c78d1312","gender":"M","age":"25-34"}|
+-----------

In [38]:
write_kafka_params = {
   "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
   "topic": OUTPUT_TOPIC
}
out_df.write.format("kafka").options(**write_kafka_params).save()

In [39]:
test_kafka_params = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": OUTPUT_TOPIC,
    "startingOffsets": "earliest"
}
test_test  = spark.read.format("kafka").options(**test_kafka_params) \
    .option("failOnDataLoss", 'False').load()
test_test.show()

+----+--------------------+--------------+---------+------+--------------------+-------------+
| key|               value|         topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------+---------+------+--------------------+-------------+
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     0|2022-10-29 21:41:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     1|2022-10-29 21:41:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     2|2022-10-29 21:41:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     3|2022-10-29 21:41:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     4|2022-10-29 21:41:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     5|2022-10-29 21:54:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        0|     6|2022-10-29 21:54:...|            0|
|null|[7B 22 75 69 64 2...|andrey.sorokin|        

### Kafka stream

In [40]:
read_kafka_params = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": INPUT_TOPIC,
    "startingOffsets": "latest"
}
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [41]:
parsed_df = kafka_sdf.select(f.get_json_object(f.col("value").cast("string"), "$.uid").alias("uid"),
                 extract_hosts(f.get_json_object(f.col("value").cast("string"), "$.visits")).alias("hosts"))

In [42]:
pred_df = model.transform(parsed_df)

In [43]:
out_df = pred_df.select(f.to_json(f.struct("uid", 
                                           unmap_gender(f.col("prediction")).alias("gender"), 
                                           unmap_age(f.col("prediction")).alias("age"))).alias("value"))

In [44]:
out_df.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append").start()

<pyspark.sql.streaming.StreamingQuery at 0x7f70bc0c8a58>

In [45]:
streams = SparkSession.builder.getOrCreate().streams.active
streams

[<pyspark.sql.streaming.StreamingQuery at 0x7f70bc0b3668>]

In [47]:
if streams:
    for s in streams:
        if s.lastProgress["sources"][0] is not None: 
            desc = s.lastProgress["sources"][0]["description"]
        else:
            desc = "Unknown"
        s.stop()
        print("Stopped {s}".format(s=desc))

Stopped KafkaV2[Subscribe[input_andrey.sorokin]]


In [48]:
spark.stop()