In [1]:
import os
import sys
# Cluster-specific settings: the interpreter PySpark should use, the Spark 2
# client install, and the arguments passed on to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled py4j archive and Python sources to sys.path
# (archive first, then the python directory) so that pyspark can be imported.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is set on the SparkConf and again on the builder.
conf = SparkConf().set("spark.app.name", "Zakharov YA")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Zakharov YA")
    .getOrCreate()
)

In [3]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Работа с dataset

In [4]:
# Tab-separated text file with a header row; Spark infers the column types.
data = (
    spark.read
    .format("csv")
    .options(sep="\t", header=True, inferSchema=True)
    .load("/labs/slaba04/gender_age_dataset.txt")
)

In [5]:
# Show the schema Spark inferred for the loaded dataset.
data.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [7]:
# Preview the first five rows of the raw dataset.
data.show(5)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 5 rows



In [7]:
import re
@f.udf(returnType=ArrayType(StringType()))
def find_url(dict_data):
    """Tokenize every URL found in a user's JSON visit log.

    Parameters
    ----------
    dict_data : str
        JSON text with a ``"visits"`` list; each item carries a ``"url"`` string.

    Returns
    -------
    list of str
        Runs of word characters taken from each URL after the substrings
        ``http://``, ``https://`` and ``www`` are removed, concatenated in
        visit order. An empty list is returned when the record is missing,
        is not valid JSON, or does not have the expected structure.
    """
    # Local import keeps the UDF self-contained; json.loads replaces eval(),
    # which would execute arbitrary code embedded in the data file.
    import json

    try:
        visits = json.loads(dict_data)["visits"]
        tokens = []
        for visit in visits:
            stripped = re.sub(r'(http://|https://|www)', '', visit["url"])
            tokens.extend(re.findall(r'\w+', stripped))
        return tokens
    except (TypeError, ValueError, KeyError):
        # Best effort: a bad row yields no tokens instead of failing the job.
        return []

In [8]:
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml import Transformer

In [9]:
class CleanSitesTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that applies ``find_url`` to the input column.

    The output column receives the array of URL tokens produced by the UDF;
    all other columns of the dataset are kept.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(CleanSitesTransformer, self).__init__()
        # Only set the params that were actually supplied.
        for value, setter in ((inputCol, self.setInputCol),
                              (outputCol, self.setOutputCol)):
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        target_name = self.getOutputCol()
        source_column = f.col(self.getInputCol())
        return dataset.withColumn(target_name, find_url(source_column))

In [10]:
# Stage that turns the raw "user_json" text into a "user_json_visits" token array.
transformer1 = CleanSitesTransformer(inputCol="user_json", outputCol="user_json_visits")

In [11]:
# Apply the URL tokenizer to the raw dataset; `df` is the working frame from here on.
df = transformer1.transform(data)

In [13]:
# Preview the first ten rows, including the new token column.
df.show(10)

+------+-----+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|    user_json_visits|
+------+-----+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[zebra, zoya, ru,...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[sweetrading, ru,...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[ru, oriflame, co...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[translate, tatto...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[mail, rambler, r...|
|     F|25-34|d5090ddf-5648-487...|{"visits": [{"url...|[cfire, mail, ru,...|
|     F|25-34|d50bcef8-16ff-4e8...|{"visits": [{"url...|[msn, com, ru, ru...|
|     F|18-24|d50e23dc-0cbd-488...|{"visits": [{"url...|[gazprom, ru, pre...|
|     F|45-54|d50fdabb-4208-441...|{"visits": [{"url...|[lifenews, ru, li...|
|     F|18-24|d511b480-23a6-482...|{"visits": [{"url...|[google,

In [13]:
# Numeric labels for the classifiers:
#   gender_map: 1 for 'F', 0 for any other value
#   age_map:    0..3 for the four listed age ranges, 4 for any other value
df = (
    df
    .withColumn(
        "gender_map",
        f.when(f.col("gender") == 'F', 1).otherwise(0),
    )
    .withColumn(
        "age_map",
        f.when(f.col("age") == '18-24', 0)
         .when(f.col("age") == '25-34', 1)
         .when(f.col("age") == '35-44', 2)
         .when(f.col("age") == '45-54', 3)
         .otherwise(4),
    )
)

In [14]:
from pyspark.ml.feature import HashingTF, RegexTokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler,StandardScaler

# Disabled experiment: English stop-word removal ahead of hashing.
# en_stopwords = StopWordsRemover.loadDefaultStopWords("english")
# remover = StopWordsRemover(inputCol="user_json_visits",
#                            outputCol="filtered",
#                            stopWords=en_stopwords)

# Hash the URL tokens into a 2500-dimensional binary feature vector.
hasher = HashingTF(
    inputCol="user_json_visits",
    outputCol="word_vector",
    numFeatures=2500,
    binary=True,
)

# Disabled experiment: standard scaling of the hashed vector.
# scaler = StandardScaler()\
#          .setInputCol("word_vector")\
#          .setOutputCol("sc_word_vector")

# Gender classifier; its output columns carry a "_gender" suffix.
lr = LogisticRegression(
    featuresCol="word_vector",
    labelCol="gender_map",
    predictionCol='prediction_gender',
    rawPredictionCol='rawPrediction_gender',
    maxIter=30,
)

In [15]:
from pyspark.ml import Pipeline

# Gender pipeline: hashed URL tokens fed into the gender logistic regression.
pipeline = Pipeline(stages=[hasher, lr])

In [16]:
# Draw a seeded sample with fraction 0.75 from each gender class for training.
train = df.sampleBy("gender_map", fractions={0: 0.75, 1: 0.75}, seed=42)
# Hold-out set: rows of `df` whose uid does not appear in `train`.
test = df.join(train, on="uid", how="leftanti")

In [17]:
# Fit the current `pipeline` on the gender training sample.
pipeline_gender = pipeline.fit(train)

In [18]:
# Re-draw the split, this time sampling fraction 0.75 from each age bucket.
# NOTE: this rebinds `train` and `test`, replacing the gender split.
train = df.sampleBy("age_map", fractions={0: 0.75, 1: 0.75, 2:0.75, 3:0.75, 4:0.75}, seed=42)
# Hold-out set: rows of `df` whose uid does not appear in the new `train`.
test = df.join(train, on="uid", how="leftanti")

In [19]:
# Age classifier over the five age_map classes; outputs carry an "_age" suffix.
lg_age = LogisticRegression(
    featuresCol="word_vector",
    labelCol="age_map",
    predictionCol='prediction_age',
    rawPredictionCol='rawPrediction_age',
    maxIter=30,
    regParam=0.25,
)

In [20]:
# Age pipeline: same hashing stage, age classifier. Rebinds the name `pipeline`.
pipeline = Pipeline(stages=[hasher, lg_age])

In [21]:
# Fit the current `pipeline` on the age training sample.
pipeline_age = pipeline.fit(train)

# Работа с kafka

In [27]:
# Kafka connection settings.
# Alternative broker, currently disabled:
#KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_BOOTSTRAP_SERVER = 'spark-de-node-1.newprolab.com:6667'
# Topic the streaming query subscribes to.
INPUT_KAFKA_TOPIC = 'input_yakov.zakharov'
# Output topic name; has the same value as the topic written by create_console_sink.
OUTPUT_KAFKA_TOPIC = 'yakov.zakharov'

In [28]:
# Source options: broker address, topic to subscribe to, and the starting
# offsets setting ('latest').
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'latest'
}

# Streaming DataFrame over the input topic.
kafka_sdf = spark.readStream.load(format='kafka', **read_kafka_params)
kafka_sdf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [29]:
# Cast the binary Kafka payload to text and keep the message coordinates.
# Uses the `f` alias imported at the top of the notebook; the previous `F`
# alias is never defined and raises NameError on a fresh kernel.
parsed_sdf = kafka_sdf.select(
    f.col("value").cast("string"),
    f.col("topic"),
    f.col("partition"),
    f.col("offset"),
)

In [30]:
from pyspark.sql.functions import udf
import re
@udf(returnType=ArrayType(StringType()))
def extract_dictionary_from_kafca(dict_data):
    """Tokenize the URLs carried by one Kafka message.

    Parameters
    ----------
    dict_data : str
        JSON text of the message. Its ``"visits"`` entry is either a list of
        visit objects or a string that itself contains that list encoded as
        JSON; each visit carries a ``"url"`` string.

    Returns
    -------
    list of str
        Runs of word characters taken from each URL after the substrings
        ``http://``, ``https://`` and ``www`` are removed, concatenated in
        visit order. An empty list is returned for a missing or malformed
        message so that one bad record does not stop the streaming query.
    """
    # Local import keeps the UDF self-contained; json.loads replaces eval(),
    # which would execute arbitrary code sent through the topic.
    import json

    try:
        visits = json.loads(dict_data)["visits"]
        if isinstance(visits, str):
            # The visit list arrives as a JSON string nested in the message.
            visits = json.loads(visits)
        tokens = []
        for visit in visits:
            stripped = re.sub(r'(http://|https://|www)', '', visit["url"])
            tokens.extend(re.findall(r'\w+', stripped))
        return tokens
    except (TypeError, ValueError, KeyError):
        return []

In [31]:
from pyspark.sql.functions import udf
import re
@udf(returnType=StringType())
def extract_uid_from_kafca(dict_data):
    """Return the ``"uid"`` field of one Kafka message.

    Parameters
    ----------
    dict_data : str
        JSON text of the message.

    Returns
    -------
    str or None
        The value stored under ``"uid"``, or ``None`` when the message is
        missing, is not valid JSON, or has no ``"uid"`` entry.
    """
    # Local import keeps the UDF self-contained; json.loads replaces eval(),
    # which would execute arbitrary code sent through the topic.
    import json

    try:
        return json.loads(dict_data)["uid"]
    except (TypeError, ValueError, KeyError):
        return None
    

In [32]:
class ReadTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that extracts the uid and URL tokens from a raw message.

    ``outputCol`` holds two comma-separated column names: the first is used
    for the uid, the second for the token array
    (for example ``"uid,user_json_visits"``).
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(ReadTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)

    def _transform(self, dataset):
        """Select the input column plus the derived token and uid columns."""
        output_names = self.getOutputCol().split(",")
        uid_name, visits_name = output_names[0], output_names[1]
        # `f` is the notebook's pyspark.sql.functions alias; `F` is undefined.
        raw_message = f.col(self.getInputCol())
        return dataset.select(
            self.getInputCol(),
            extract_dictionary_from_kafca(raw_message).alias(visits_name),
            extract_uid_from_kafca(raw_message).alias(uid_name),
        )

In [34]:
def create_console_sink(df):
    """Build, without starting, the Kafka writer that publishes predictions.

    Despite its name, the function writes to Kafka, not to the console.

    Parameters
    ----------
    df : DataFrame
        Streaming frame with a string ``value`` column holding the raw
        message text.

    Returns
    -------
    DataStreamWriter
        Writer configured for the Kafka sink in append mode; call ``.start()``
        on it to launch the query. Each output message is a JSON object with
        the fields ``uid``, ``gender`` and ``age``.
    """
    # Only the broker address and the topic are passed to the Kafka sink.
    write_kafka_params = {
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
        "topic": OUTPUT_KAFKA_TOPIC,
    }

    read_transformer = ReadTransformer(inputCol="value", outputCol="uid,user_json_visits")
    # Transform the frame that was passed in, not the notebook-level global.
    messages = read_transformer.transform(df)

    # Age prediction, decoded back to the original age-range labels.
    pred_topic = pipeline_age.transform(messages)
    age_label = (
        f.when(f.col("prediction_age") == 0, '18-24')
         .when(f.col("prediction_age") == 1, '25-34')
         .when(f.col("prediction_age") == 2, '35-44')
         .when(f.col("prediction_age") == 3, '45-54')
         .otherwise(">=55")
    )
    # Keep only the columns the gender pipeline needs; this drops the
    # "word_vector" column so the second pipeline can create it again.
    pred_topic = pred_topic.withColumn("age", age_label).select(["uid", "age", "user_json_visits"])

    # Gender prediction, decoded back to 'F' / 'M'.
    pred_topic = pipeline_gender.transform(pred_topic)
    pred_topic = pred_topic.withColumn("gender", f.when(f.col("prediction_gender")==1, 'F').otherwise('M'))

    # Serialise the three result columns into the Kafka "value" field.
    pred_topic = pred_topic.select(["uid", "gender", "age"])
    pred_topic = pred_topic.selectExpr("to_json(struct(*)) AS value")
    return pred_topic \
        .writeStream.format("kafka").options(**write_kafka_params) \
        .option("checkpointLocation", "streaming/chk/chk_yakov.zakharov")\
        .outputMode("append")

In [75]:
# Remove the streaming checkpoint and dataset directories from HDFS.
# NOTE(review): presumably done so the next query starts from a fresh
# checkpoint rather than resuming an old one — confirm.
!hdfs dfs -rm -r /user/yakov.zakharov/streaming/chk
!hdfs dfs -rm -r /user/yakov.zakharov/streaming/datasets


21/03/25 20:10:57 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/yakov.zakharov/streaming/chk' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/yakov.zakharov/.Trash/Current/user/yakov.zakharov/streaming/chk1616692257762
rm: `/user/yakov.zakharov/streaming/datasets': No such file or directory


In [76]:
# List what is left under the streaming directory.
!hdfs dfs -ls /user/yakov.zakharov/streaming

In [77]:
# Build the Kafka writer for the parsed stream, then launch the streaming query.
sink = create_console_sink(parsed_sdf)
sq = sink.start()

In [78]:
# Check whether the streaming query is still running.
sq.isActive

True

In [81]:
# Current status of the streaming query.
sq.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [83]:
# Metrics of the most recent progress update (row counts, offsets, timings).
sq.lastProgress

{'id': '496f5afe-117a-421a-9a5e-cfb88815170a',
 'runId': 'ecb5257d-3f3d-4a3b-9a2e-210d0e199e1e',
 'name': None,
 'timestamp': '2021-03-25T17:12:43.391Z',
 'batchId': 2,
 'numInputRows': 294,
 'inputRowsPerSecond': 94.47300771208226,
 'processedRowsPerSecond': 38.082901554404145,
 'durationMs': {'addBatch': 7468,
  'getBatch': 0,
  'getEndOffset': 0,
  'queryPlanning': 78,
  'setOffsetRange': 2,
  'triggerExecution': 7720,
  'walCommit': 74},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[input_yakov.zakharov]]',
   'startOffset': {'input_yakov.zakharov': {'0': 45001}},
   'endOffset': {'input_yakov.zakharov': {'0': 45295}},
   'numInputRows': 294,
   'inputRowsPerSecond': 94.47300771208226,
   'processedRowsPerSecond': 38.082901554404145}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider@41a05500'}}

In [71]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query. The label is the description of
    the query's first source when a progress report exists; otherwise the
    query id is used.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        # lastProgress is None until the first progress update exists;
        # guard it so the query is still stopped in that case.
        progress = s.lastProgress or {}
        sources = progress.get("sources") or []
        if sources:
            desc = sources[0]["description"]
        else:
            desc = "query {qid}".format(qid=s.id)
        s.stop()
        print("Stopped {s}".format(s=desc))

In [74]:
# Stop all running streaming queries.
kill_all()

Stopped KafkaV2[Subscribe[input_yakov.zakharov]]


In [117]:
# Shut down the Spark session.
spark.stop()