In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("Tatiana Gavrikova")
         .getOrCreate())

In [3]:
spark

In [4]:
sc = spark.sparkContext
sc

# Читаем данные для обучения

In [None]:
! hdfs dfs -ls /labs/slaba04/

In [None]:
! hdfs dfs -head /labs/slaba04/gender_age_dataset.txt | sed -n '1,2p'

In [5]:
visitSchema = StructType([
    StructField("url", StringType())
])

In [6]:
jsonSchema = StructType([
    StructField("visits", ArrayType(visitSchema))
])

In [7]:
schema = StructType([
    StructField("gender", StringType()),
    StructField("age", StringType()),
    StructField("uid", StringType()),
    StructField("user_json", StringType())
])

In [127]:
df_train = spark.read.schema(schema).format("csv").option("delimiter", "\t").option("header", "true").load("/labs/slaba04/gender_age_dataset.txt").filter(~ F.col("gender").like("-"))

In [128]:
df_train.show(2, True, True)

-RECORD 0-------------------------
 gender    | F                    
 age       | 18-24                
 uid       | d50192e5-c44e-4ae... 
 user_json | {"visits": [{"url... 
-RECORD 1-------------------------
 gender    | M                    
 age       | 25-34                
 uid       | d502331d-621e-472... 
 user_json | {"visits": [{"url... 
only showing top 2 rows



In [129]:
df_train_1 = df_train.withColumn("visits", F.from_json(df_train.user_json, jsonSchema).visits)

In [None]:
df_train_1.take(1)

In [130]:
df_train_2 = df_train_1.withColumn("visit", F.explode(df_train_1.visits))

In [None]:
df_train_2.take(1)

In [131]:
df_train_3 = df_train_2.withColumn("url", df_train_2.visit.url)

In [None]:
df_train_3.take(1)[0].url

In [132]:
df_train_4 = df_train_3.withColumn("site", F.regexp_extract(df_train_3.url, r'http(s)?:\/\/(www\.)?(([\w-]+[\.]?)+)', 3))

In [133]:
df_train_5 = df_train_4.groupBy(df_train_4.uid, df_train_4.gender, df_train_4.age).agg(F.collect_list(df_train_4.site).alias("sites"))

In [None]:
df_train_5.printSchema()

# Обучение

In [14]:
from pyspark.ml.feature import HashingTF

In [83]:
htf = HashingTF(numFeatures=10000, inputCol="sites", outputCol="features", binary=True)

## Подготовим label колонки

In [16]:
from pyspark.ml.feature import StringIndexer

In [143]:
si_gender = StringIndexer(inputCol="gender", outputCol="gender_i")

In [144]:
si_age = StringIndexer(inputCol="age", outputCol="age_i")

## Две модели, на возраст и пол

In [22]:
from pyspark.ml.classification import RandomForestClassifier

In [164]:
rfc_gender = RandomForestClassifier(featuresCol='features', labelCol='gender_i', maxDepth=30, maxBins=100, numTrees=500)

In [165]:
rfc_age = RandomForestClassifier(featuresCol='features', labelCol="age_i", maxDepth=30, maxBins=100, numTrees=500)

# Pipelines

In [72]:
from pyspark.ml import Pipeline

In [166]:
pipeline_gender = Pipeline().setStages([htf, si_gender, rfc_gender])

In [167]:
pipeline_age = Pipeline().setStages([htf, si_age, rfc_age])

In [168]:
pipeline_gender_model = pipeline_gender.fit(df_train_5)

KeyboardInterrupt: 

In [None]:
pipeline_age_model = pipeline_age.fit(df_train_5)

In [153]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator_gender = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="gender_i")
evaluator_age = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="age_i")

In [None]:
evaluator_gender.evaluate(pipeline_gender_model.transform(df_train_5))

In [None]:
evaluator_age.evaluate(pipeline_age_model.transform(df_train_5))

# CrossValidation

In [158]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [160]:
rf_gender_paramGrid = (ParamGridBuilder()
             .addGrid(rfc_gender.maxDepth, [10, 20, 30])
             .addGrid(rfc_gender.maxBins, [40, 80, 100])
             .addGrid(rfc_gender.numTrees, [100, 500])
             .build())

In [163]:
crossval_gender = CrossValidator(estimator=pipeline_gender_model,
                          estimatorParamMaps=rf_gender_paramGrid,
                          evaluator=evaluator_gender,
                          numFolds=3)

cv_gender_model = crossval_gender.fit(df_train_5)

AttributeError: 'PipelineModel' object has no attribute 'fitMultiple'

In [90]:
pipeline_gender_model.write().overwrite().save("pipeline_gender_model.model")

In [91]:
pipeline_age_model.write().overwrite().save("pipeline_age_model.model")

In [88]:
! hdfs dfs -ls

Found 7 items
drwx------   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 21:54 .Trash
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 20:43 .sparkStaging
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 21:14 lab04_debug
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 21:14 lab04_debug_checkpoint
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-04 23:40 lab05.csv
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 22:01 pipeline_age_model.model
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 22:01 pipeline_gender_model.model


# Читаем данные из кафки

In [40]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_tatiana.gavrikova",
    "startingOffsets": "latest"
}
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [44]:
sinc = kafka_sdf.writeStream.format("parquet").option("path", "lab04_debug").outputMode("append").option("checkpointLocation", "lab04_debug_checkpoint")

In [45]:
sq = sinc.start()

In [48]:
sq.status

{'message': 'Getting offsets from KafkaV2[Subscribe[input_tatiana.gavrikova]]',
 'isDataAvailable': False,
 'isTriggerActive': True}

In [49]:
sq.stop()

# Читаем дебаг данные из паркета, которые прочитали из кафки

In [50]:
! hdfs dfs -ls

Found 6 items
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 20:43 .sparkStaging
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 21:14 lab04_debug
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 21:14 lab04_debug_checkpoint
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-04 23:40 lab05.csv
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 20:56 rfc_age_model.model
drwxr-xr-x   - tatiana.gavrikova tatiana.gavrikova          0 2022-11-06 20:56 rfc_gender_model.model


In [51]:
df_test = spark.read.format("parquet").load("lab04_debug")

In [183]:
df_test = kafka_sdf

In [52]:
df_test.count()

5000

In [53]:
df_test.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [61]:
kafkaValueSchema = StructType([
    StructField("uid", StringType()),
    StructField("visits", StringType())
])

In [184]:
df_test_1 = df_test.select(F.from_json(df_test.value.cast(StringType()), schema=kafkaValueSchema).alias("json_struct"))

In [67]:
df_test_1.printSchema()

root
 |-- json_struct: struct (nullable = true)
 |    |-- uid: string (nullable = true)
 |    |-- visits: string (nullable = true)



In [185]:
df_test_2 = df_test_1.select(df_test_1.json_struct.uid.alias("uid"), F.from_json(df_test_1.json_struct.visits, ArrayType(visitSchema)).alias("urls"))

In [None]:
import re
@F.pandas_udf(ArrayType(StringType()))
def extract_site_name(urls_sr):
    return urls_sr.apply(lambda urls: list(set([re.search(r'http(s)?:\/\/(www\.)?(([\w-]+[\.]?)+)', url).group(3) for url in urls])))

In [None]:
df_test_3 = df_test_2.withColumn('sites', extract_site_name(df_test_2.urls))

In [None]:
preds_gender = pipeline_gender_model.transform(df_test_3)

In [None]:
preds_age = pipeline_age_model.transform(df_test_3)

## IndexToString

In [92]:
from pyspark.ml.feature import IndexToString

In [156]:
is_gender = IndexToString(inputCol="prediction", outputCol="gender", labels=pipeline_gender_model.stages[1].labels)

In [157]:
is_age = IndexToString(inputCol="prediction", outputCol="age", labels=pipeline_age_model.stages[1].labels)

In [None]:
df_gender = is_gender.transform(preds_gender)

In [None]:
df_age = is_age.transform(preds_age)

In [None]:
df_res = df_gender.join(df_age, on="uid").select(F.col('uid'), F.col('gender'), F.col('age'))

In [None]:
df_res.printSchema()

In [None]:
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "tatiana.gavrikova"
}
write_sinc = df_res.select(df_res.uid, df_res.gender, df_res.age)\
    .select(F.to_json(F.struct(F.col("*"))).alias("value"))\
    .writeStream\
    .format("kafka")\
    .options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append")

In [None]:
sq = write_sinc.start()

In [None]:
spark.stop()