In [1]:
import os
import sys
# Tell PySpark which Python interpreter to use and where the Spark installation lives.
# These are set before the `pyspark` imports that follow in this cell.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
# Extra spark-submit arguments for the session: ask for 3 executors.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Put the PySpark sources and the bundled py4j archive at the front of sys.path so the
# imports below resolve against this Spark installation.
# NOTE(review): the py4j version is hard-coded in the path; it presumably has to match the
# archive actually shipped under SPARK_HOME - confirm when the cluster is upgraded.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the Spark configuration; `set` returns the conf itself, so it can be chained.
conf = SparkConf().set("spark.app.name", "Evgeniy.osipchuk")

# Create (or reuse) the session for this notebook under the same application name.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Evgeniy.osipchuk")
    .getOrCreate()
)

In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import HashingTF, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import re
import json

In [3]:
# Kafka broker address used by the stream reader and by the batch writer.
# Alternative broker addresses are left disabled next to the active one.
# KAFKA_BOOTSTRAP_SERVER = 'spark-de-master-1.newprolab.com:6667'
KAFKA_BOOTSTRAP_SERVER = 'spark-de-node-1.newprolab.com:6667'
# KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
# Topic the streaming query subscribes to.
INPUT_KAFKA_TOPIC = 'input_evgeniy.osipchuk'
# Name of the output topic (the same value process_batch publishes to).
OUTPUT_KAFKA_TOPIC = 'evgeniy.osipchuk'

In [10]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one ``Stopped <description>`` line per query. The description is the
    first source's description taken from the query's last progress report.
    A query that has not reported any progress yet has ``lastProgress`` set to
    ``None``; in that case the query name (or, if unnamed, its id) is printed
    instead, so the query is still stopped rather than aborting the loop.
    """
    for query in SparkSession.builder.getOrCreate().streams.active:
        progress = query.lastProgress
        sources = progress.get("sources") if progress else None
        if sources:
            desc = sources[0]["description"]
        else:
            # No progress reported yet: fall back to an identifier that always exists.
            desc = query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))
        
def get_visits_list(user_json):
    """Turn a user's JSON visit log into a flat list of URL tokens.

    The payload is expected to look like ``{"visits": [{"url": ...}, ...]}``.
    For every visit the scheme prefix and ``www`` are removed from the URL and
    the remainder is split into ``\\w+`` tokens; tokens of all visits are
    concatenated in order.

    Args:
        user_json: JSON document as ``str`` (``bytes``/``bytearray`` are accepted too).

    Returns:
        List of string tokens, or ``[]`` when the payload is missing, is not
        valid JSON, or does not have the expected structure.
    """
    try:
        # json.loads replaces eval(): eval executes arbitrary code embedded in the
        # data and fails on the JSON literals true/false/null.
        visits = json.loads(user_json)["visits"]
        list_visits = []
        for visit in visits:
            stripped_url = re.sub(r'(http://|https://|www)', '', visit["url"])
            list_visits.extend(re.findall(r'\w+', stripped_url))
        return list_visits
    except (TypeError, ValueError, KeyError):
        # Malformed record: yield no tokens instead of failing the whole job.
        return []

# Spark UDF wrapper around get_visits_list; declared to return an array of strings.
get_visits_list_udf = F.udf(get_visits_list, ArrayType(StringType()))

def get_uid(data):
    """Extract the ``uid`` field from a JSON message.

    Args:
        data: JSON document as ``str`` (``bytes``/``bytearray`` are accepted too).

    Returns:
        The value stored under ``"uid"``, or ``None`` when the message is
        missing, is not valid JSON, or has no ``uid`` key.
    """
    try:
        # json.loads replaces eval(): eval executes arbitrary code embedded in the
        # message and fails on the JSON literals true/false/null.
        return json.loads(data)["uid"]
    except (TypeError, ValueError, KeyError):
        # A single malformed message should not bring down the streaming query.
        return None
# Spark UDF wrapper around get_uid; declared to return a string.
get_uid_udf = F.udf(get_uid, StringType())

def get_data(data):
    """Turn a streamed JSON message into a flat list of URL tokens.

    The message is expected to look like
    ``{"uid": ..., "visits": [{"url": ...}, ...]}``. Tokenisation is the same
    as in ``get_visits_list`` (scheme prefix and ``www`` removed, remainder
    split into ``\\w+`` tokens), so streamed records are featurised the same
    way as the training data.

    Fixes over the previous version: the already-parsed ``visits`` list is no
    longer passed to ``eval`` (a TypeError), the URL is read from the loop
    variable instead of an undefined name, and the unused ``uid`` lookup is gone.

    Args:
        data: JSON document as ``str`` (``bytes``/``bytearray`` are accepted too).

    Returns:
        List of string tokens, or ``[]`` when the message is missing, is not
        valid JSON, or does not have the expected structure.
    """
    try:
        # json.loads replaces eval(): eval executes arbitrary code embedded in the
        # message and fails on the JSON literals true/false/null.
        visits = json.loads(data)["visits"]
        list_data = []
        for visit in visits:
            stripped_url = re.sub(r'(http://|https://|www)', '', visit["url"])
            list_data.extend(re.findall(r'\w+', stripped_url))
        return list_data
    except (TypeError, ValueError, KeyError):
        # A single malformed message should not bring down the streaming query.
        return []
# Spark UDF wrapper around get_data; declared to return an array of strings.
get_data_udf = F.udf(get_data, ArrayType(StringType()))


def process_batch(batch_df, batch_id):
    """Score one micro-batch with the age and gender models and publish it to Kafka.

    Used as the ``foreachBatch`` callback installed by ``create_sink``.

    Relies on notebook globals that must be defined before the stream is
    started: the fitted pipelines ``logreg_model_age`` and ``pipeline_gender``,
    the UDFs ``get_data_udf`` / ``get_uid_udf``, and the constants
    ``KAFKA_BOOTSTRAP_SERVER`` / ``OUTPUT_KAFKA_TOPIC``.

    Each output message is a JSON object with the fields ``uid``, ``gender``
    and ``age``.

    Args:
        batch_df: micro-batch DataFrame with a ``value`` column holding the
            JSON message (binary when it comes from the Kafka source).
        batch_id: micro-batch identifier passed by Spark; not used.
    """
    # Only Kafka options here; the console-sink options "truncate"/"numRows"
    # that used to be passed have no meaning for a Kafka writer.
    write_kafka_params = {
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVER,
        "topic": OUTPUT_KAFKA_TOPIC,
    }

    # Kafka delivers `value` as binary; cast it so the UDFs receive text.
    payload = F.col("value").cast("string")
    parsed = batch_df.select(
        get_data_udf(payload).alias("parsed_json"),
        get_uid_udf(payload).alias("uid"),
    )

    # Age prediction; the numeric classes are mapped back to the bucket labels
    # using the inverse of the encoding applied to the training data.
    age_label = (F.when(F.col("prediction_age") == 0, '18-24')
                 .when(F.col("prediction_age") == 1, '25-34')
                 .when(F.col("prediction_age") == 2, '35-44')
                 .when(F.col("prediction_age") == 3, '45-54')
                 .otherwise(">=55"))
    # Keep only uid/age/tokens: the gender pipeline re-creates the intermediate
    # feature columns, so the ones produced by the age pipeline must be dropped.
    aged = (logreg_model_age.transform(parsed)
            .withColumn("age", age_label)
            .select("uid", "age", "parsed_json"))

    # Gender prediction: class 1 is 'F', anything else is 'M'.
    gender_label = F.when(F.col("prediction_gender") == 1, 'F').otherwise('M')
    result = (pipeline_gender.transform(aged)
              .withColumn("gender", gender_label)
              .select("uid", "gender", "age")
              .selectExpr("to_json(struct(*)) AS value"))

    (result.write
     .format('kafka')
     .options(**write_kafka_params)
     .mode('append')
     .save())

def create_sink(df):
    """Prepare (but do not start) the streaming writer for ``df``.

    Every micro-batch is handed to ``process_batch``; progress is checkpointed
    under ``streaming/chk/chk_evgeniy_osipchuk``. The caller starts the query
    by calling ``.start()`` on the returned writer.
    """
    batch_writer = df.writeStream.foreachBatch(process_batch)
    return batch_writer.option('checkpointLocation', 'streaming/chk/chk_evgeniy_osipchuk')


In [5]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


In [6]:
# Read the tab-separated training dataset from HDFS; the first line is the header
# and column types are inferred from the data.
data = (
    spark.read
    .format("csv")
    .options(sep="\t", header=True, inferSchema=True)
    .load("/labs/slaba04/gender_age_dataset.txt")
)

In [7]:
# Preview the first rows of the raw dataset.
data.show(5)

+------+-----+--------------------+--------------------+
|gender|  age|                 uid|           user_json|
+------+-----+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|
+------+-----+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Tokenise each user's visited URLs into a word list stored in `parsed_json`,
# the column the HashingTF stage takes as input.
data = data.withColumn('parsed_json', get_visits_list_udf(data.user_json))
data.show(5)

+------+-----+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|         parsed_json|
+------+-----+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[zebra, zoya, ru,...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[sweetrading, ru,...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[ru, oriflame, co...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[translate, tatto...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[mail, rambler, r...|
+------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
# Hash the URL tokens into a 2500-dimensional vector; binary=True records token
# presence rather than counts. Shared by the gender and the age pipelines.
hasher = HashingTF(numFeatures=2500, binary=True, inputCol="parsed_json", outputCol="words")

In [13]:
# Numeric labels for the classifiers:
#   gender_cat: 1 for 'F', 0 for anything else
#   age_cat:    0..3 for the four listed buckets, 4 for anything else
data = (
    data
    .withColumn("gender_cat", F.when(F.col("gender") == 'F', 1).otherwise(0))
    .withColumn(
        "age_cat",
        F.when(F.col("age") == '18-24', 0)
         .when(F.col("age") == '25-34', 1)
         .when(F.col("age") == '35-44', 2)
         .when(F.col("age") == '45-54', 3)
         .otherwise(4),
    )
)

In [14]:
# Scale the hashed features before they reach the classifiers.
std_scaler = StandardScaler(inputCol="words", outputCol="scaled_words")

# Gender classifier. Its output columns carry a `_gender` suffix so they stay
# distinguishable from the age model's `_age` columns.
logreg = LogisticRegression(
    maxIter=30,
    labelCol="gender_cat",
    featuresCol="scaled_words",
    predictionCol='prediction_gender',
    rawPredictionCol='rawPrediction_gender',
)

# tokens -> hashed features -> scaled features -> gender prediction
pipeline = Pipeline(stages=[hasher, std_scaler, logreg])

In [15]:
# Train/test split for the gender model: sample about 75% of each class for
# training, with a fixed seed so the split is repeatable.
train_gender = data.sampleBy("gender_cat",
                             fractions={0: 0.75, 1: 0.75},
                             seed=42)
# Every user that was not sampled into train goes to test. The anti-join runs on the
# executors; the previous version collected all training uids to the driver and
# embedded them in one huge isin() filter.
# Note: the join puts `uid` first in the column order, and a row with a null uid
# (if any existed) would now land in test instead of being dropped.
test_gender = data.join(train_gender.select("uid"), on="uid", how="left_anti")

In [16]:
# Number of rows in the gender training split.
train_gender.count()

30916

In [17]:
# Number of rows in the gender test split.
test_gender.count()

10222

In [18]:
# Fit the gender pipeline on the training split. The fitted model is also used by
# process_batch when scoring the stream.
pipeline_gender = pipeline.fit(train_gender)

In [19]:
# Score the gender test split with the fitted pipeline.
pred = pipeline_gender.transform(test_gender)

In [20]:
# Area-under-ROC evaluator for the gender model; it reads the raw prediction column
# produced by the gender classifier and compares against `gender_cat`.
ROCAUC = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction_gender",
                                       labelCol="gender_cat",
                                       metricName='areaUnderROC')

In [21]:
# ROC AUC of the gender model on the test split.
ROCAUC.evaluate(pred)

0.6745571763262851

In [21]:
# Train/test split for the age model: sample about 75% of each age class for
# training, with a fixed seed so the split is repeatable.
train_age = data.sampleBy("age_cat",
                          fractions={0: 0.75,
                                     1: 0.75,
                                     2: 0.75,
                                     3: 0.75,
                                     4: 0.75},
                          seed=42)
# Every user that was not sampled into train goes to test. The anti-join runs on the
# executors; the previous version collected all training uids to the driver and
# embedded them in one huge isin() filter.
# Note: the join puts `uid` first in the column order, and a row with a null uid
# (if any existed) would now land in test instead of being dropped.
test_age = data.join(train_age.select("uid"), on="uid", how="left_anti")

In [22]:
# Random-forest candidate for the age model.
# NOTE(review): none of the pipelines in the visible cells include it; the age
# pipeline is built with `logreg_age` instead - confirm whether this is still needed.
rfc_age = RandomForestClassifier(featuresCol="scaled_words",
                                 labelCol="age_cat")

In [23]:
# Age classifier over the scaled hashed features, with L2-style regularisation
# strength 0.25. Output columns carry an `_age` suffix; process_batch reads
# `prediction_age`.
logreg_age = LogisticRegression(featuresCol="scaled_words",
                                rawPredictionCol='rawPrediction_age',
                                predictionCol='prediction_age',
                                labelCol="age_cat",
                                maxIter=30, regParam=0.25)

In [26]:
# Age pipeline: same hashing and scaling stages, followed by the age classifier.
# Note that this rebinds `pipeline`, which previously referred to the gender pipeline;
# the already fitted `pipeline_gender` is not affected.
pipeline = Pipeline(stages=[hasher, std_scaler, logreg_age])
logreg_model_age = pipeline.fit(train_age)

In [27]:
# Score the age test split with the fitted pipeline.
pred_age = logreg_model_age.transform(test_age)

In [28]:
# F1 evaluator for the multi-class age model (compares `prediction_age` with `age_cat`).
f1 = MulticlassClassificationEvaluator(labelCol="age_cat",
                                       predictionCol="prediction_age",
                                       metricName="f1")

In [29]:
# F1 score of the age model on the test split.
f1.evaluate(pred_age)

0.26149148156245305

In [112]:
# Kafka source options: subscribe to the input topic and, when the query starts
# without a checkpoint, begin at the latest offsets.
read_kafka_params = {
    'kafka.bootstrap.servers': KAFKA_BOOTSTRAP_SERVER,
    'subscribe': INPUT_KAFKA_TOPIC,
    'startingOffsets': 'latest'
}
# Streaming DataFrame over the input topic; nothing is read until a query is started.
kafka_data = (
    spark
    .readStream
    .format('kafka')
    .options(**read_kafka_params)
    .load()
)
kafka_data.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [107]:
!hdfs dfs -rm -r /user/evgeniy.osipchuk/streaming/chk

21/03/24 21:15:01 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/evgeniy.osipchuk/streaming/chk' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/evgeniy.osipchuk/.Trash/Current/user/evgeniy.osipchuk/streaming/chk1616609701748


In [113]:
# Build the foreachBatch writer over the Kafka stream and start the streaming query.
sink = create_sink(kafka_data)
sq = sink.start()

In [114]:
# Check that the streaming query is running.
sq.isActive

True

In [118]:
# Current status of the streaming query.
sq.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [128]:
# Stop all active streaming queries before shutting Spark down.
kill_all()

In [129]:
# Shut down the Spark session.
spark.stop()