In [1]:
import os
import sys
# Tell PySpark which Python interpreter to use, where Spark is installed, and
# which arguments to pass to spark-submit when the JVM is launched.
# NOTE(review): "--num-executors 3" here disagrees with the
# "spark.executor.instances" = "2" entry set on the SparkConf further down;
# confirm which value is intended.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources at the front of the import path:
# the py4j archive first, then Spark's own "python" directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

from pyspark import SparkContext, SparkConf
import json
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark settings for this notebook, applied to the SparkConf in the order listed.
conf = SparkConf()
for setting_name, setting_value in (
    ("spark.app.name", "Lab3"),
    ("spark.driver.memory", "16g"),
    ("spark.driver.memoryOverhead", "32g"),
    ("spark.executor.memory", "4g"),
    ("spark.executor.instances", "2"),
):
    conf.set(setting_name, setting_value)

# Build the session from the configuration above (or pick up an existing one).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()



In [8]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.classification import LogisticRegression
import json
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
from pyspark.ml.feature import CountVectorizer, StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.types import LongType


# Function to convert a JSON string holding a "visits" array to a list
def parse_json(array_str):
    """Extract ``(url, timestamp)`` pairs from a JSON document.

    The document is expected to be an object with a ``"visits"`` key whose
    value is an array of objects, each having ``"url"`` and ``"timestamp"``.

    Args:
        array_str: JSON text, or ``None``/empty string when there is no payload.

    Returns:
        A list of ``(url, timestamp)`` tuples in document order. An empty list
        is returned when the input is ``None``/empty, when the document is not
        an object, or when ``"visits"`` is missing, null or empty.

    Raises:
        ValueError: if ``array_str`` is not valid JSON (raised by ``json.loads``).
        KeyError: if a visit entry lacks ``"url"`` or ``"timestamp"``.
    """
    # No payload means no visits; avoids json.loads(None) raising TypeError.
    if not array_str:
        return []

    json_obj = json.loads(array_str)
    visits = json_obj.get("visits") if isinstance(json_obj, dict) else None

    # Return a concrete list rather than a lazy generator, so errors surface
    # at call time and the result can be reused or serialized.
    return [(item["url"], item["timestamp"]) for item in (visits or [])]

In [10]:
# --- Load the labelled dataset (tab-separated file with a header row) ---
df = spark.read.option("delimiter", "\t").csv("/labs/slaba04/gender_age_dataset.txt", header = True)


# Schema of the value returned by parse_json: an array of (url, timestamp) structs.
json_schema = ArrayType(StructType([
    StructField('url', StringType(), nullable=False),
    StructField('timestamp', StringType(), nullable=False),
]))
# parse_json is passed directly; wrapping it in `lambda str: ...` added nothing
# and shadowed the builtin `str`.
udf_parse_json = udf(parse_json, json_schema)


# --- Build the target label and drop rows labelled "-:-" ---
# The filter only looks at the label, so it is applied before the Python UDF
# and the explode; the surviving rows are the same as filtering after grouping.
df = df.withColumn("gender_age", F.concat(F.col("gender"), F.lit(":"), F.col("age")))
df = df.filter(F.col("gender_age") != F.lit("-:-"))

# --- One row per visited URL ---
df = df.withColumn("visits", udf_parse_json(F.col("user_json")))
df = df.select(F.col("gender_age"), F.col("uid"), F.col("visits"))
df = df.withColumn("visits", F.explode(F.col("visits")))
df = df.withColumn("url", F.col("visits.url"))
df = df.drop("visits")

# --- Normalise each URL to a domain token: lower-cased host, "www." removed, dots -> dashes ---
# NOTE(review): "www." is a regular expression, so the dot matches any character
# and the pattern is not anchored to the start of the host. It is left unchanged
# here because the streaming cell applies the same normalisation and the two
# must produce identical tokens; change both together (e.g. to r"^www\.").
df = (
    df.withColumn("url", F.lower(F.expr("parse_url(url, 'HOST')")))
    .withColumn("url", F.regexp_replace(F.col("url"), "www.", ""))
    .withColumn("url", F.regexp_replace(F.col("url"), "[.]", "-"))
)
df = df.filter(F.col("url").isNotNull())

# --- One row per user: the list of visited domains ---
df = df.groupBy(F.col("gender_age"), F.col("uid")).agg(F.collect_list(F.col("url")).alias("domains"))

# The frame is scanned several times below (indexer fit, vectorizer fit, forest
# training); cache it so the JSON parsing UDF is not re-run for every pass.
df = df.cache()


# --- Pipeline: domain counts -> label index -> random forest -> label string ---
cv = CountVectorizer(inputCol="domains", outputCol="features")

indexer = StringIndexer(inputCol="gender_age", outputCol="label")
# Fit the indexer once and reuse the fitted stage in the pipeline, so the labels
# handed to IndexToString are exactly the ones used to encode the training
# labels (a second, independent fit inside the pipeline is avoided).
indexer_model = indexer.fit(df)
labels = indexer_model.labels

rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=20,
    maxBins=32,
    maxDepth=15,
    seed=37,
)

# Maps the numeric prediction back to the original "gender:age" string.
converter = IndexToString(inputCol="prediction", labels=labels, outputCol="res")

pipeline = Pipeline(stages=[cv, indexer_model, rf, converter])

model = pipeline.fit(df)
model.write().overwrite().save("Lab_4_model")

# Training is finished; release the cached training frame.
df.unpersist()

In [None]:
# Persist the fitted pipeline, replacing any earlier copy at the same path.
# NOTE(review): the training cell already ends with this same save, so this
# cell looks redundant — confirm whether it is still needed.
model_writer = model.write().overwrite()
model_writer.save("Lab_4_model")

In [None]:

# --- Kafka source ---
# maxOffsetsPerTrigger is an option of the Kafka *source* (it caps how many
# offsets are read per micro-batch). It was previously set on the sink writer,
# where it has no effect, so it now lives with the reader options.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-2.newprolab.com:6667',
    "subscribe": "input_vitaly.monastyrev",
    "startingOffsets": "latest",
    "maxOffsetsPerTrigger": "200",
}
dfInput = spark.readStream.format("kafka").options(**read_kafka_params).load()

# --- Decode the message value: a JSON object with "uid" and "visits" ---
df = dfInput.selectExpr("CAST(value AS STRING)")
schema = StructType([
  StructField("uid", StringType(), True),
  StructField("visits", StringType(), True),
])

df = df.withColumn("jsonData", F.from_json(F.col("value"), schema)).select("jsonData.*")

# Messages that could not be parsed, or that carry no "visits" field, yield a
# null here; drop them so one bad record cannot fail the whole streaming query
# inside the Python UDF.
df = df.filter(F.col("visits").isNotNull())

# "visits" was read as a string; wrap it in an object with a "visits" key,
# which is the document shape parse_json reads.
df = df.withColumn("visits", F.concat(F.lit("{\"visits\": "), F.col("visits"), F.lit("}")))

# Schema of the value returned by parse_json: an array of (url, timestamp) structs.
json_schema = ArrayType(StructType([
    StructField('url', StringType(), nullable=False),
    StructField('timestamp', StringType(), nullable=False),
]))
# parse_json is passed directly; the `lambda str: ...` wrapper added nothing and
# shadowed the builtin `str`.
udf_parse_json = udf(parse_json, json_schema)


# --- One row per visited URL ---
df = df.withColumn("visits", udf_parse_json(F.col("visits")))

df = df.withColumn("visits", F.explode(F.col("visits")))
df = df.withColumn("url", F.col("visits.url"))
df = df.drop("visits")


# --- Normalise each URL to a domain token, exactly as in the training cell ---
# NOTE(review): "www." is a regular expression, so the dot matches any character
# and the pattern is not anchored to the start of the host. It is left unchanged
# because the saved model was trained on tokens produced by this same
# expression; change it only together with the training cell, then retrain.
df = (
    df.withColumn("url", F.lower(F.expr("parse_url(url, 'HOST')")))
    .withColumn("url", F.regexp_replace(F.col("url"), "www.", ""))
    .withColumn("url", F.regexp_replace(F.col("url"), "[.]", "-"))
)

# --- One row per user, then score with the saved pipeline ---
df = df.groupBy(F.col("uid")).agg(F.collect_list(F.col("url")).alias("domains"))
model = PipelineModel.load("Lab_4_model")
df = model.transform(df)
df = df.select(F.col("uid"), F.col("res").alias("gender_age"))

# The predicted label has the form "<gender>:<age>"; split it into two columns.
split_col = F.split(F.col("gender_age"), ':')
df = df.withColumn('gender', split_col.getItem(0))
df = df.withColumn('age', split_col.getItem(1))
df = df.select(F.col("uid"), F.col("gender"), F.col("age"))


# --- Kafka sink: key = uid, value = JSON of (uid, gender, age) ---
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-2.newprolab.com:6667',
   "topic": "vitaly.monastyrev"
}
query = (
    df.selectExpr("CAST(uid AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("update")
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()
