In [1]:
import os
import sys
# Point this kernel at the cluster's Python interpreter and Spark install, and
# set the resources requested by the pyspark-shell driver.
# NOTE(review): these paths and executor sizes are environment-specific.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 8 --executor-memory 5g --executor-cores 4 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap in this namespace. The file is read inside a
# context manager so the handle is closed, and compiled with its real path so
# tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.sql.functions import split, col, explode, regexp_replace, round,when, collect_list, sum, count, array, row_number, avg
from pyspark.ml.linalg import Vectors
import json

# Obtain a session (an already-running one is returned by getOrCreate) using a
# default SparkConf and the application name "test".
conf = SparkConf()

session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

In [3]:
# Tab-separated training dataset on HDFS (read by the CSV reader below).
input_dir = 'hdfs:///labs/slaba04/gender_age_dataset.txt'
# Output location; not referenced by the other cells visible here.
output_dir = 'lab04_train/train'

In [4]:
# Schema of a single visit record: a long timestamp and a URL string,
# both nullable.
sh = StructType([
    StructField("timestamp", LongType(), True),
    StructField("url", StringType(), True),
])

In [5]:

# Wrapper schema holding only the nullable array of visit records.
schema1 = StructType([StructField("visits", ArrayType(sh), True)])

In [6]:
# Schema of a message with a string uid and a nullable array of visit records.
schema_Kafka = StructType([
    StructField("uid", StringType(), True),
    StructField("visits", ArrayType(sh), True),
])

In [7]:
# Display the schema object to check its structure.
schema_Kafka

StructType(List(StructField(uid,StringType,true),StructField(visits,ArrayType(StructType(List(StructField(timestamp,LongType,true),StructField(url,StringType,true))),true),true)))

In [8]:

# Schema of the training file: four nullable string columns, in file order.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("gender", "age", "uid", "user_json")
])

In [9]:
 
# Load the tab-delimited training file (with a header row) using the
# explicit schema.
df = (
    spark.read
    .option("delimiter", "\t")
    .option("header", True)
    .schema(schema)
    .csv(input_dir)
)

In [10]:
# Pull the "visits" field out of the raw JSON string, then parse it into an
# array of visit structs.
visits_json = F.json_tuple("user_json", "visits")
df2 = (
    df
    .withColumn("visits", visits_json)
    .withColumn("visits", F.from_json("visits", ArrayType(sh)))
)

In [11]:
# One row per visited URL, reduced step by step to a bare domain:
#   src0   raw url of the visit
#   src1   URL-encoded "://" and "3A" decoded
#   src2   http/https scheme removed
#   url_   leading "www." removed
#   url    text before the first "%2", trimmed
#   domain text before the first "/"
# NOTE(review): the "3A" pattern matches that substring anywhere, and the "."
# in "^www." matches any character. Left unchanged here because the streaming
# input cell applies the same steps and the two must stay in sync.
id_cols = ["gender", "age", "uid"]
df3 = (
    df2
    .select(*id_cols, explode("visits").alias("visits"))
    .select(*id_cols, col("visits")["url"].alias("src0"))
    .select(*id_cols, regexp_replace(regexp_replace("src0", "%3A%2F%2F", "://"), "3A", ":").alias("src1"))
    .select(*id_cols, regexp_replace("src1", "((http)|(https))(://)", "").alias("src2"))
    .select(*id_cols, regexp_replace("src2", "^www.", "").alias("url_"))
    .select(*id_cols, F.trim(split("url_", "%2")[0]).alias("url"))
    .select(*id_cols, split("url", "/")[0].alias("domain"))
    .select("uid", F.concat_ws(":", "gender", "age").alias("gender_age"), "domain")
)

In [12]:
from pyspark.ml.feature import CountVectorizer

# Turns each user's list of domains into a count vector in "features".
cv = CountVectorizer(
    inputCol="domains",
    outputCol="features",
)

In [13]:
from pyspark.ml.feature import StringIndexer, IndexToString

# Encodes the "gender:age" class string as a numeric "label" column.
st = StringIndexer(
    inputCol="gender_age",
    outputCol="label",
)

In [17]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the default "features"/"label" columns produced by the
# CountVectorizer and StringIndexer stages.
# The seed is fixed so tree bootstrapping and feature sub-sampling are
# repeatable across kernel restarts. Without it, PySpark derives a default
# seed that is not stable between Python processes.
rf = RandomForestClassifier(numTrees=500, maxDepth=10, subsamplingRate=0.1, seed=42)

In [18]:
from pyspark.ml import Pipeline

# Stage order: vectorise the domains, index the class label, then classify.
pipeline_stages = [cv, st, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [19]:
# Collect every domain a user visited into one list per (uid, gender_age),
# then drop the rows whose class is the placeholder "-:-".
domains_per_user = df3.groupBy("uid", "gender_age").agg(
    collect_list("domain").alias("domains")
)
training = domains_per_user.filter(col("gender_age") != "-:-")




In [20]:
# Fit the CountVectorizer -> StringIndexer -> RandomForest pipeline on the
# labelled per-user training rows.
model = pipeline.fit(training)

In [19]:
# Options for the Kafka source: broker address, the topic to subscribe to,
# and the starting offset policy.
read_kafka_params = dict([
    ("kafka.bootstrap.servers", "spark-master-1.newprolab.com:6667"),
    ("subscribe", "input_yuriy.severyukhin"),
    ("startingOffsets", "latest"),
])


In [21]:
# Open a streaming read from Kafka using the options defined in
# read_kafka_params. The cell previously referenced an undefined name,
# `read_kafka_params1`, which raises NameError on a fresh kernel.
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()


In [37]:
# Options for the Kafka sink: broker address and the output topic.
write_kafka_params = dict([
    ("kafka.bootstrap.servers", "spark-master-1.newprolab.com:6667"),
    ("topic", "yuriy.severyukhin"),
])


In [23]:
# Remove the contents of the HDFS checkpoint directory that the streaming sink
# uses as its checkpointLocation.
# NOTE(review): presumably so a new run does not resume from old offsets - confirm.
!hdfs dfs -rm -R streaming/chk/chk_kafka/*

22/10/26 11:27:49 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/streaming/chk/chk_kafka/commits' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/.Trash/Current/user/yuriy.severyukhin/streaming/chk/chk_kafka/commits
22/10/26 11:27:49 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/streaming/chk/chk_kafka/metadata' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/.Trash/Current/user/yuriy.severyukhin/streaming/chk/chk_kafka/metadata
22/10/26 11:27:49 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/streaming/chk/chk_kafka/offsets' to trash at: hdfs://spark-master-1.newprolab.com:8020/user/yuriy.severyukhin/.Trash/Current/user/yuriy.severyukhin/streaming/chk/chk_kafka/offsets
22/10/26 11:27:49 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-master-1.newprolab.com:8020/user/yuri

In [26]:
# Decode the Kafka message value as a string, extract the "uid" and "visits"
# JSON fields (json_tuple names them c0 and c1), and parse the visits into an
# array of visit structs.
raw_json = col("value").cast("string")
in_stream = (
    kafka_sdf
    .select(F.json_tuple(raw_json, "uid", "visits"))
    .select(col("c0").alias("uid"), col("c1").alias("visits"))
    .withColumn("visits", F.from_json("visits", ArrayType(sh)))
)

In [27]:
# One row per visited URL for the incoming users, reduced to a bare domain
# with the same steps used for the training data:
#   src0 -> src1 (decode "://" and "3A") -> src2 (strip scheme)
#        -> url_ (strip leading "www.") -> url (cut at "%2", trim)
#        -> domain (text before the first "/")
# NOTE(review): the "3A" and "^www." patterns are loose; they are kept as-is
# so inference features match the training features.
exploded_visits = in_stream.select("uid", explode("visits").alias("visits"))
dfInput = (
    exploded_visits
    .select("uid", col("visits")["url"].alias("src0"))
    .select("uid", regexp_replace(regexp_replace("src0", "%3A%2F%2F", "://"), "3A", ":").alias("src1"))
    .select("uid", regexp_replace("src1", "((http)|(https))(://)", "").alias("src2"))
    .select("uid", regexp_replace("src2", "^www.", "").alias("url_"))
    .select("uid", F.trim(split("url_", "%2")[0]).alias("url"))
    .select("uid", split("url", "/")[0].alias("domain"))
)
        

In [28]:
# Gather each incoming user's domains into a single "domains" list, the input
# column the CountVectorizer stage expects.
domains_list = collect_list("domain").alias("domains")
testing = dfInput.groupBy("uid").agg(domains_list)

In [29]:
# Apply the fitted pipeline to the per-uid domain lists built from the Kafka input.
predicts = model.transform(testing)

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'


In [30]:
# Maps the numeric "prediction" column back to a string column "gender_age".
converter = IndexToString(inputCol="prediction", outputCol="gender_age")



In [31]:
from pyspark.ml.feature import StringIndexerModel

# Reuse the StringIndexer stage that was fitted as part of `model` instead of
# fitting `st` a second time. This guarantees the label order is the one the
# classifier was trained with and avoids another pass over the training data.
labelIndexerModel = next(
    stage for stage in model.stages if isinstance(stage, StringIndexerModel)
)

In [32]:
# Give the converter the label strings learned by the indexer so prediction
# indices can be mapped back to their "gender:age" strings.
converter.setLabels(labelIndexerModel.labels)

IndexToString_62108b729d23

In [34]:
# Convert predicted indices back to "gender:age" strings and split them into
# separate gender and age columns, one row per uid.
# Changes from the previous version of this cell:
#   * distinct() removed: `testing` is already grouped by uid, so rows are
#     unique per uid, and distinct is documented as unsupported on streaming
#     DataFrames.
#   * cache() removed: the frame is consumed once by the sink.
#     NOTE(review): persisting a streaming plan may also be rejected - confirm.
labelled_predictions = converter.transform(predicts)
gender_age_parts = split("gender_age", ":")
out_ = (
    labelled_predictions
    .select("uid", gender_age_parts.alias("ge"))
    .select("uid", col("ge")[0].alias("gender"), col("ge")[1].alias("age"))
    .coalesce(1)
)

In [36]:
# Pack uid/age/gender into a struct, then serialise it to a JSON string in a
# single "value" column (the column the Kafka sink writes).
payload = F.struct(col("uid"), col("age"), col("gender")).alias("s")
structed = out_.select(col("uid").alias("key"), payload)
json_ds = structed.select(F.to_json("s").alias("value"))

In [38]:
# One-off batch write of the JSON payloads to Kafka.
# DataFrame.write cannot be used on a streaming DataFrame (it raises
# AnalysisException), which would abort a top-to-bottom run when the input
# comes from readStream. In that case skip this cell and rely on the
# writeStream sink instead.
if json_ds.isStreaming:
    print("json_ds is a streaming DataFrame; skipping batch write, use the writeStream sink")
else:
    json_ds.selectExpr("CAST(value AS STRING)").write.format("kafka").options(**write_kafka_params).save()
    

In [33]:
# Configure (but do not start) the streaming Kafka writer: sink options,
# checkpoint directory on HDFS, and "update" output mode.
stream_writer = json_ds.writeStream.format("kafka").options(**write_kafka_params)
sink = (
    stream_writer
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("update")
)

In [None]:
# Start the streaming query, then block this cell until the query terminates.
sQuery = sink.start()
sQuery.awaitTermination()

In [129]:
# Shut down the Spark session.
spark.stop()