In [1]:
import os
import sys
# Tell PySpark which interpreter, Spark installation and submit arguments to
# use. These have to be in the environment before `pyspark` is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Put the py4j archive and Spark's Python sources at the front of the import
# path (py4j first, then `python`), so `import pyspark` resolves to them.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is set both on the SparkConf and on the builder.
conf = SparkConf().set("spark.app.name", "Sabanov Denis Spark Dataframe test lab04")

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Sabanov Denis Spark Dataframe test lab04")
    .getOrCreate()
)

In [3]:
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, RegexTokenizer, StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.ml import Pipeline
import re

In [4]:
# Echo the session object as the cell output (quick check that it was created).
spark

In [9]:
# Load the tab-separated training file (first row is the header) and keep only
# the rows whose `age` value is not the literal '-'.
labData = (
    spark.read
    .csv("/labs/slaba04/gender_age_dataset.txt", sep="\t", header=True)
    .filter("age != '-'")
)

In [11]:
# Schema used to parse `user_json`: an object with a `visits` array whose
# elements carry a `url` and a `timestamp`, both read as strings.
schema = StructType([
    StructField(
        "visits",
        ArrayType(StructType([
            StructField("url", StringType()),
            StructField("timestamp", StringType()),
        ])),
    ),
])

# Parse the JSON, emit one row per visit, lift `url` / `timestamp` into their
# own columns and drop the intermediate columns.
df = (
    labData
    .withColumn("tt", f.from_json("user_json", schema=schema))
    .withColumn("visits", f.explode(f.col("tt").visits))
    .withColumn("url", f.col("visits").url)
    .withColumn("ts", f.col("visits").timestamp)
    .drop("user_json", "tt", "visits")
)

In [13]:
# Collapse the per-visit rows back to one row per (uid, gender, age): all URLs
# joined with "|" into `urls`, all timestamps joined with "|" into `timestamps`.
urlData = (
    df
    .groupBy("uid", "gender", "age")
    .agg(
        f.concat_ws("|", f.collect_list("url")).alias("urls"),
        f.concat_ws("|", f.collect_list("ts")).alias("timestamps"),
    )
    # Remove scheme / "www" / "html" fragments before tokenisation.
    # NOTE(review): "http" is listed before "https", so for "https" only the
    # "http" part is removed and a stray "s" remains; the trailing "|" also
    # adds an empty alternative. The pattern is kept as is because the fitted
    # models depend on it - change it only together with the inference path.
    .withColumn("urls", f.regexp_replace("urls", "http|https|www|://|html|", ""))
)

In [18]:
# Split the joined URL string into tokens on punctuation / URL delimiter
# characters (including the "|" that separates individual URLs).
tokenizer = RegexTokenizer(
    inputCol="urls",
    outputCol="tok_urls",
    pattern="[_():;,.!?\\-|/&=+]",
)

In [20]:
# Hash the URL tokens into a 10000-dimensional term-frequency vector
# (binary=False, i.e. not a binary presence vector).
hashingTF = HashingTF(
    inputCol="tok_urls",
    outputCol="rawFeatures",
    numFeatures=10000,
    binary=False,
)

In [25]:
# Re-weight the hashed term frequencies by inverse document frequency; the
# result is the `features` column consumed by both classifiers.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,
)

In [27]:
# Encode the two string targets as numeric label indices for the classifiers.
indexerGen = StringIndexer(
    inputCol="gender",
    outputCol="gender_idx",
)
indexerAge = StringIndexer(
    inputCol="age",
    outputCol="age_idx",
)

In [29]:
# Feature/label preparation pipeline. The stage order matters: batchCalc reads
# the gender labels from stage index 1 and the age labels from stage index 2.
pipeline = Pipeline(
    stages=[
        tokenizer,    # 0
        indexerGen,   # 1
        indexerAge,   # 2
        hashingTF,    # 3
        idf,          # 4
    ]
)

In [31]:
# Fit the preparation pipeline on the per-user training data; the fitted
# `urlMod` is also what batchCalc applies to incoming Kafka batches.
urlMod = pipeline.fit(urlData)
# Training frame with the pipeline's output columns added (features and label
# indices) - this is the input to both classifiers below.
urlTran = urlMod.transform(urlData)

In [33]:
# Gender classifier: logistic regression on the TF-IDF features, predicting
# the indexed gender label into `gen_prediction`.
lrGender = LogisticRegression(
    featuresCol="features",
    labelCol="gender_idx",
    predictionCol="gen_prediction",
    regParam=0.01,
    elasticNetParam=0.2,
)

In [39]:
# Age classifier: same configuration as the gender model, but trained on the
# indexed age label and writing its prediction into `age_prediction`.
lrAge = LogisticRegression(
    featuresCol="features",
    labelCol="age_idx",
    predictionCol="age_prediction",
    regParam=0.01,
    elasticNetParam=0.2,
)

In [42]:
# Train both classifiers on the same prepared training frame; the fitted
# models are used by batchCalc to score streaming batches.
lrModelGender = lrGender.fit(urlTran)
lrModelAge = lrAge.fit(urlTran)

In [43]:
def batchCalc(batch_df, batch_id):
    """Predict gender and age for every uid in one micro-batch and write the result to Kafka.

    The function has the ``(DataFrame, batch id)`` signature that
    ``DataStreamWriter.foreachBatch`` expects of its callback.

    Steps:
      1. Parse the JSON in ``value`` into ``uid`` and ``visits`` (``visits`` is
         itself a JSON-encoded string and is parsed in a second pass).
      2. Explode the visits and rebuild one row per uid with ``urls`` and
         ``timestamps`` joined by "|", cleaned the same way as the training data.
      3. Apply the fitted feature pipeline ``urlMod`` and both fitted
         classifiers (``lrModelGender``, ``lrModelAge``).
      4. Map predicted indices back to label strings and append one JSON
         message ``{"uid", "gender", "age"}`` per uid to the Kafka topic
         configured in ``write_kafka_params``.

    Parameters
    ----------
    batch_df : pyspark.sql.DataFrame
        Micro-batch with a string column ``value`` containing the JSON payload.
    batch_id : int
        Identifier of the micro-batch; not used.

    Returns
    -------
    None
    """
    # Outer message layout; `visits` stays a string here on purpose.
    envelope_schema = StructType([
        StructField('uid', StringType()),
        StructField('visits', StringType()),
    ])
    # Layout of the JSON array stored inside the `visits` string.
    visits_schema = ArrayType(StructType([
        StructField('url', StringType()),
        StructField('timestamp', StringType()),
    ]))

    kd = (
        batch_df
        .withColumn("tt", f.from_json("value", schema=envelope_schema))
        .withColumn("uid", f.col("tt").uid)
        .withColumn("visits", f.col("tt").visits)
        .drop("tt")
    )

    kafkaData = (
        kd
        .withColumn("tt", f.explode(f.from_json("visits", schema=visits_schema)))
        .withColumn("url", f.col("tt").url)
        .withColumn("ts", f.col("tt").timestamp)
        .drop("visits", "tt")
        .groupBy("uid")
        .agg(
            f.concat_ws("|", f.collect_list("url")).alias("urls"),
            f.concat_ws("|", f.collect_list("ts")).alias("timestamps"),
        )
        # The pipeline was fitted on URLs cleaned with exactly this pattern;
        # apply the same cleaning here so inference sees the same tokens as
        # training did. Keep the two patterns identical.
        .withColumn("urls", f.regexp_replace("urls", "http|https|www|://|html|", ""))
    )

    # Build the feature vector with the already fitted pipeline, then discard
    # the intermediate text/vector columns.
    kdTrain = urlMod.transform(kafkaData).drop("urls", "timestamps", "tok_urls", "rawFeatures")

    # Both classifiers use the default `rawPrediction` / `probability` output
    # columns, so the first model's copies are dropped before the second runs.
    gender_scored = lrModelGender.transform(kdTrain).drop("rawPrediction", "probability")
    kdRes = lrModelAge.transform(gender_scored).drop("rawPrediction", "probability", "features")

    # Label vocabularies come from the fitted StringIndexer stages
    # (pipeline order: tokenizer, gender indexer, age indexer, ...).
    genLbl = urlMod.stages[1].labels
    ageLbl = urlMod.stages[2].labels

    age_val = IndexToString(inputCol="age_prediction", outputCol="age", labels=ageLbl)
    gen_val = IndexToString(inputCol="gen_prediction", outputCol="gender", labels=genLbl)
    res = gen_val.transform(age_val.transform(kdRes)).select("uid", "gender", "age")

    # The Kafka sink expects the payload in a column named `value`.
    dres = res.select(f.to_json(f.struct(f.col("*"))).alias("value"))

    (
        dres.write
        .format('kafka')
        .options(**write_kafka_params)
        .mode('append')
        .save()
    )

In [44]:
# Options for the Kafka streaming source.
read_kafka_params = {
    # Broker to connect to.
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    # Input topic to subscribe to.
    "subscribe": "input_denis.sabanov",
    # Where to start reading when the query has no stored progress.
    "startingOffsets": "earliest",
}
# Streaming DataFrame over the input topic.
kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)
# Keep only the message payload, cast to a string.
k = kafka_sdf.selectExpr("CAST(value AS STRING)")

In [45]:
# Options for the Kafka batch writer used inside batchCalc.
write_kafka_params = {
    # Broker to connect to.
    "kafka.bootstrap.servers": "spark-master-1.newprolab.com:6667",
    # Output topic that receives the predictions.
    "topic": "denis.sabanov",
}
def create_sink(df):
    """Configure (but do not start) the streaming writer that scores each micro-batch.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Streaming DataFrame with a string column ``value`` (the Kafka payload).

    Returns
    -------
    pyspark.sql.streaming.DataStreamWriter
        Writer in append mode with its checkpoint location set; call
        ``.start()`` on it to launch the query.
    """
    # foreachBatch needs a callable taking (batch DataFrame, batch id).
    # batchCalc is that callback; a fitted model object is not.
    return (
        df.writeStream
        .foreachBatch(batchCalc)
        .outputMode("append")
        .option("checkpointLocation", "streaming/chk/chk_kafka")
    )

In [46]:
# Configure the streaming writer over the Kafka payload stream and launch it.
sink = create_sink(k)
sq = sink.start()
# Last expression of the cell: reports whether the query is currently active.
sq.isActive

True

In [None]:
# Shut down the Spark session once the work in this notebook is finished.
spark.stop()