In [1]:
import os
import sys
# Environment for the PySpark bootstrap below: Python interpreter for the
# workers, Spark installation directory, and spark-submit arguments
# (two executors, started in pyspark-shell mode).
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark package and its bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run Spark's interactive-shell bootstrap inside the notebook namespace.
# The file is read through a context manager so the handle is closed, and it is
# compiled with its real path so tracebacks point at shell.py, not "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Empty configuration object; no settings are overridden here.
conf = SparkConf()

# Build the session named "nazim lab04", or get the existing one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("nazim lab04")
    .getOrCreate()
)

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, StringIndexer, HashingTF, RegexTokenizer, StopWordsRemover, IDF, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml import Transformer

from pyspark.ml.classification import LogisticRegression, GBTClassifier

In [4]:
import re
import json

In [5]:
# HDFS directory handed to the CSV reader as the training-data location.
path_to_train_data = '/labs/slaba04/'

In [6]:
# List the contents of the training-data directory on HDFS.
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


In [7]:
# Column layout of the tab-separated training file; every field is read as text.
schema = StructType([
    StructField('gender', StringType()),
    StructField('age', StringType()),
    StructField('uid', StringType()),
    StructField('user_json', StringType()),
])

# Structure of the JSON stored in `user_json`: an object whose "visits" key
# holds an array of {url, timestamp} records.
visits_schema = StructType([
    StructField("visits", ArrayType(
        StructType([
            StructField("url", StringType()),
            StructField("timestamp", LongType()),
        ])
    )),
])

# Load -> drop unlabeled rows -> parse the visit log -> derive both labels.
train_data = (
    spark.read.format("csv")
    # NOTE(review): an explicit schema is passed on the next line, so schema
    # inference presumably has no effect here -- confirm and drop if so.
    .option("inferSchema", "true")
    .schema(schema)
    .option("header", "true")
    .option("delimiter", "\\t")
    .load(path_to_train_data)
    # Rows with "-" in either label column cannot be used for training.
    .filter('age != "-" and gender != "-"')
    .withColumn('visits', F.from_json(F.col('user_json'), schema=visits_schema))
    # `visits` is a struct wrapping the array, hence the doubled path.
    .withColumn('visited_pages', F.col('visits.visits.url'))
    # Binary gender label: 1 for 'M', 0 for anything else.
    .withColumn('is_M', F.when(F.col("gender") == 'M', 1).otherwise(0))
    # Age bucket label 0..3 for the four listed ranges, 4 for everything else.
    .withColumn(
        "age_category",
        F.when(F.col("age") == '18-24', 0)
         .when(F.col("age") == '25-34', 1)
         .when(F.col("age") == '35-44', 2)
         .when(F.col("age") == '45-54', 3)
         .otherwise(4),
    )
)

In [15]:
# Preview the first five prepared training rows.
train_data.show(5)

+------+-----+--------------------+--------------------+--------------------+--------------------+----+------------+
|gender|  age|                 uid|           user_json|              visits|       visited_pages|is_M|age_category|
+------+-----+--------------------+--------------------+--------------------+--------------------+----+------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[[[http://zebra-z...|[http://zebra-zoy...|   0|           0|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[[[http://sweetra...|[http://sweetradi...|   1|           1|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[[[http://ru.orif...|[http://ru.orifla...|   0|           1|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[[[http://transla...|[http://translate...|   0|           1|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[[[https://mail.r...|[https://mail.ram...|   1|           4|
+------+-----+--------------------+--------------------+--------

In [8]:
@F.udf(returnType=ArrayType(StringType()))
def extr_words(user_sites):
    """Split a list of visited URLs into purely alphabetic word tokens.

    For every URL the substrings ``http://``, ``https://`` and ``www`` are
    removed, the remainder is split into ``\\w+`` runs, and only runs for which
    ``str.isalpha()`` is true are kept. Tokens are returned in URL order.

    Args:
        user_sites: list of URL strings; may be ``None`` or contain ``None``
            entries (e.g. when the JSON of a record could not be parsed).

    Returns:
        List of word tokens; an empty list when there is nothing to tokenize.
    """
    # A null array would otherwise raise inside the executor and fail the
    # whole batch; treat it as "no visits".
    if not user_sites:
        return []

    words = []
    for site in user_sites:
        if site is None:
            # Skip null URLs instead of letting re.sub raise TypeError.
            continue
        # NOTE(review): the pattern is unanchored, so "www" is removed wherever
        # it occurs in the URL, not only as a host prefix.
        stripped = re.sub(r'(http://|https://|www)', '', site)
        # extend() keeps this linear; sum(list_of_lists, []) copies repeatedly.
        words.extend(token for token in re.findall(r'\w+', stripped) if token.isalpha())
    return words

class UTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that tokenizes a column of URL arrays with ``extr_words``.

    Params:
        inputCol: name of the column holding the array of visited URLs.
        outputCol: name of the column that receives the array of word tokens.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        # Zero-argument super() cannot drift out of sync with the class name.
        super().__init__()
        # Only record params that were actually supplied; _set() is the generic
        # Params setter, so this does not rely on mixin-provided setX methods.
        if inputCol is not None:
            self._set(inputCol=inputCol)
        if outputCol is not None:
            self._set(outputCol=outputCol)

    def _transform(self, dataset):
        """Return `dataset` with the output column set to the tokenized input column."""
        return dataset.withColumn(self.getOutputCol(), extr_words(F.col(self.getInputCol())))

In [9]:
# Stage 1: URL arrays -> word tokens (the class defined above is UTransformer).
transformer_cleaner = UTransformer(inputCol="visited_pages", outputCol="sites_words")

# Stage 2: drop English stop words from the tokens.
en_stopwords = StopWordsRemover.loadDefaultStopWords("english")
remover = StopWordsRemover(inputCol="sites_words",
                           outputCol="sites_words_filtered",
                           stopWords=en_stopwords)

# Stage 3: hash tokens into a fixed-size term-frequency vector.
tf = HashingTF(inputCol="sites_words_filtered", outputCol="tf", numFeatures=15000)

# Stage 4: re-weight term frequencies by inverse document frequency.
idf = IDF(inputCol="tf", outputCol="tf_idf")

# Stage 5: scale the TF-IDF vector with StandardScaler's default settings.
scaler = StandardScaler()\
         .setInputCol("tf_idf")\
         .setOutputCol("tf_idf_norm")


# Stage 6: binary classifier for the `is_M` label. Prediction columns carry a
# `_gender` suffix so they do not collide with the age model's output columns.
gender_model = LogisticRegression(featuresCol='tf_idf_norm', 
                                  rawPredictionCol='rawPrediction_gender', 
                                  predictionCol='prediction_gender', 
                                  labelCol='is_M', 
                                  maxIter=25)


# Feature extraction plus gender classifier as one pipeline.
pipeline1 = Pipeline(stages=[transformer_cleaner, remover, tf, idf, scaler, gender_model])

In [10]:
# Classifier for the `age_category` label (values 0-4). It reads the
# `tf_idf_norm` features produced by the gender pipeline's scaler stage, and
# its prediction columns carry an `_age` suffix.
age_model = LogisticRegression(
    labelCol='age_category',
    featuresCol='tf_idf_norm',
    predictionCol='prediction_age',
    rawPredictionCol='rawPrediction_age',
    maxIter=25,
    regParam=0.3,
)

# Single-stage pipeline: it expects input that already has `tf_idf_norm`.
pipeline2 = Pipeline().setStages([age_model])

In [11]:
# Fit the gender pipeline and rebind `pipeline1` to the fitted model:
# scoring_batch() later calls pipeline1.transform / pipeline2.transform, which
# exist on the fitted models only.
# NOTE: after this cell the names hold fitted models, so re-running it requires
# re-running the cells that build pipeline1 and pipeline2 first.
pipeline1 = pipeline1.fit(train_data)
scored_train_data = pipeline1.transform(train_data)

# Drop the gender model's `probability` column before the age model runs
# (presumably because the age model writes a column with the same default name).
scored_train_data = scored_train_data.select([col for col in scored_train_data.columns if col != "probability"])

# Fit the age pipeline on data that already carries the `tf_idf_norm` features.
pipeline2 = pipeline2.fit(scored_train_data)
scored_train_data = pipeline2.transform(scored_train_data)

In [16]:
# Broker address shared by the reader and the writer.
_kafka_bootstrap_servers = "spark-de-node-1.newprolab.com:6667"

# Source options: subscribe to the input topic, starting from the earliest offsets.
read_kafka_params = dict([
    ("kafka.bootstrap.servers", _kafka_bootstrap_servers),
    ("subscribe", "input_nazim.dzhavadov"),
    ("startingOffsets", "earliest"),
])

# Sink options: predictions are written to this topic.
write_kafka_params = dict([
    ("kafka.bootstrap.servers", _kafka_bootstrap_servers),
    ("topic", "nazim.dzhavadov"),
])

In [17]:
# Envelope of one incoming message: `visits` is first read as a plain string
# and parsed in a second step with the array schema below.
scoring_data_schema = (
    StructType()
    .add('uid', StringType())
    .add('visits', StringType())
)

# Schema of the `visits` string once parsed: an array of {url, timestamp} records.
scoring_data_visits_schema = ArrayType(
    StructType()
    .add("url", StringType())
    .add("timestamp", LongType())
)

def scoring_batch(batch_df, epoch_id):
    """Score one micro-batch of Kafka messages and publish the predictions.

    The message `value` is parsed in two steps (envelope, then the `visits`
    array), run through `pipeline1` (gender) and `pipeline2` (age), and the
    numeric predictions are mapped back to labels. Each row is serialized as
    JSON with the fields uid, gender and age, then appended to the Kafka topic
    configured in `write_kafka_params`.

    Args:
        batch_df: DataFrame for the current micro-batch; its `value` column
            holds the raw message payload.
        epoch_id: batch identifier supplied by foreachBatch; not used here.

    Returns:
        None.
    """
    # Step 1: raw payload -> (uid, visits array) -> list of visited URLs.
    payload_text = F.col('value').cast(StringType())
    parsed = (
        batch_df
        .withColumn('weblog', F.from_json(payload_text, scoring_data_schema))
        .select('weblog.*')
        .select('uid', F.from_json(F.col('visits'), scoring_data_visits_schema).alias('visits'))
        .withColumn('visited_pages', F.col('visits.url'))
    )

    # Step 2: gender model, then the age model on the same features. The
    # `probability` column is removed between the two transforms.
    with_gender = pipeline1.transform(parsed)
    kept_columns = [name for name in with_gender.columns if name != "probability"]
    with_age = pipeline2.transform(with_gender.select(kept_columns))

    # Step 3: numeric predictions -> the label strings used in the training data.
    gender_label = F.when(F.col("prediction_gender") == 1, 'M').otherwise('F')
    age_label = (
        F.when(F.col("prediction_age") == 0, '18-24')
         .when(F.col("prediction_age") == 1, '25-34')
         .when(F.col("prediction_age") == 2, '35-44')
         .when(F.col("prediction_age") == 3, '45-54')
         .otherwise('>=55')
    )
    labelled = (
        with_age
        .withColumn('gender', gender_label)
        .withColumn("age", age_label)
        .select('uid', 'gender', 'age')
    )

    # Step 4: one JSON document per row in a single `value` column, the column
    # written to Kafka.
    message = labelled.select(
        F.to_json(
            F.struct([labelled[name] for name in labelled.columns])
        ).alias("value")
    )
    (
        message.write
        .format('kafka')
        .options(**write_kafka_params)
        .mode('append')
        .save()
    )

In [18]:
def create_console_sink(df):
    """Configure, but do not start, the stream writer that scores each micro-batch.

    Despite the name, nothing is written to the console: every micro-batch is
    handed to `scoring_batch`.

    Args:
        df: streaming DataFrame to attach the writer to.

    Returns:
        The configured stream writer; the caller starts it with `.start()`.
    """
    batch_writer = df.writeStream.foreachBatch(scoring_batch)
    # Checkpoint directory for the query's progress.
    return batch_writer.option('checkpointLocation', 'streaming/chk/chk_kafka')

In [19]:
# Streaming source over the input topic. `value` is left in its raw form here;
# scoring_batch() casts it to a string before parsing.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .option("failOnDataLoss", "False")
    .load()
)

In [20]:
# Sanity check: the Kafka source must be a streaming DataFrame.
kafka_df.isStreaming

True

In [22]:
# Build the (not yet started) stream writer on top of the Kafka source.
sink = create_console_sink(kafka_df)

In [23]:
# Start the streaming query and keep its handle for the checks below.
sq = sink.start()

In [24]:
# Check whether the streaming query is still running.
sq.isActive

True

In [27]:
# Show the query's current status.
sq.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [26]:
# Batch-read the output topic from the earliest offset and show the first five
# records (payload as text plus timestamp) to verify what was published.
(
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "spark-de-node-1.newprolab.com:6667")
    .option("subscribe", "nazim.dzhavadov")
    .option("startingOffsets", "earliest")
    .load()
    .select(F.col('value').cast(StringType()).alias('row'), 'timestamp')
    .show(5, vertical=True, truncate=False)
)

-RECORD 0------------------------------------------------------------------------------
 row       | {"uid":"bd7a30e1-a25d-4cbf-a03f-61748cbe540e","gender":"M","age":"35-44"} 
 timestamp | 2021-03-25 20:28:39.787                                                   
-RECORD 1------------------------------------------------------------------------------
 row       | {"uid":"bd7a6f52-45db-49bf-90f2-a3b07a9b7bcd","gender":"F","age":"25-34"} 
 timestamp | 2021-03-25 20:28:55.522                                                   
-RECORD 2------------------------------------------------------------------------------
 row       | {"uid":"bd7a7fd9-ab06-42f5-bf0f-1cbb0463004c","gender":"M","age":"25-34"} 
 timestamp | 2021-03-25 20:28:55.524                                                   
-RECORD 3------------------------------------------------------------------------------
 row       | {"uid":"bd7c5d7a-0def-41d1-895f-fdb96c56c2d4","gender":"M","age":"35-44"} 
 timestamp | 2021-03-25 20:28:55

## spark.stop()

In [28]:
# Stop the streaming query before tearing down the session, so it is shut down
# deliberately rather than by the session disappearing underneath it.
if sq.isActive:
    sq.stop()
spark.stop()