In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark installation,
# and request three executors for the shell session.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the bundled pyspark and py4j packages importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run the PySpark shell bootstrap in this namespace. The file is closed
# deterministically, and compiling with the real path makes tracebacks point
# at shell.py instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [10]:
import re
import json

In [11]:
# List the lab directory on HDFS to see which input files are available.
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


In [12]:
# Every column of the tab-separated dump is read as a plain string;
# the JSON payload in `user_json` is parsed in a later step.
schema = StructType([
    StructField(column_name, StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])
data = spark.read.csv(
    '/labs/slaba04/gender_age_dataset.txt',
    sep='\t',
    header=True,
    schema=schema,
)
data.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [14]:
# Schema of the `user_json` payload: {"visits": [{"url": ..., "timestamp": ...}, ...]}
json_schema = StructType([
    StructField('visits', ArrayType(StructType([
        StructField('url', StringType()),
        StructField('timestamp', StringType()),
    ]))),
])

# Parse the JSON string into a struct column and spread the rows over 9 partitions.
data = (
    data
    .withColumn('parsed_json', f.from_json('user_json', json_schema))
    .repartition(9)
)

data.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)
 |-- parsed_json: struct (nullable = true)
 |    |-- visits: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- timestamp: string (nullable = true)



In [15]:
# One row per (user, visit): explode the parsed visits array, then flatten
# the visit struct into `url` and a numeric `timestamp`.
# The cast is applied before the alias so the output column name is set
# explicitly instead of relying on Spark carrying an alias through a cast.
tmp_df = (
    data
    .select('uid', 'gender', 'age',
            f.explode(f.col('parsed_json').visits).alias('visits'))
    .select('uid', 'gender', 'age',
            f.col('visits').url.alias('url'),
            f.col('visits').timestamp.cast(LongType()).alias('timestamp'))
)

In [16]:
# Collapse the visit rows back to one row per user: keep the user's labels,
# join all visited URLs into a single '|'-separated string and collect the
# visit timestamps. The result is cached because several later cells reuse it.
clean_df = (
    tmp_df
    .groupBy('uid')
    .agg(
        f.max('gender').alias('gender'),
        f.max('age').alias('age'),
        f.concat_ws('|', f.collect_list('url')).alias('urls'),
        f.collect_list('timestamp').alias('timestamps'),
    )
    .cache()
)

In [17]:
# Inspect the per-user schema produced by the aggregation.
clean_df.printSchema()

root
 |-- uid: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- urls: string (nullable = false)
 |-- timestamps: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [18]:
# Keep only users whose age and gender are both known ('-' marks a missing label).
has_age_label = f.col('age') != '-'
has_gender_label = f.col('gender') != '-'
clean_df = clean_df.filter(has_age_label & has_gender_label)

In [19]:
# Combined "age|gender" label, used below to inspect the class distribution.
clean_df = clean_df.withColumn(
    'age_gender',
    f.concat('age', f.lit('|'), 'gender'),
)

In [20]:
# Number of users in each combined age/gender class.
class_counts = clean_df.groupBy('age_gender').count()
class_counts.show()

+----------+-----+
|age_gender|count|
+----------+-----+
|    >=55|M|  784|
|   45-54|F| 2597|
|   18-24|F| 2886|
|   25-34|F| 6791|
|   45-54|M| 2147|
|   35-44|M| 5089|
|   18-24|M| 2012|
|   25-34|M| 8666|
|   35-44|F| 4271|
|    >=55|F|  895|
+----------+-----+



In [21]:
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark import keyword_only

from functools import partial

In [22]:
def clean_url(tokens):
    """Keep only the informative word tokens extracted from URLs.

    A token is kept when it is purely alphabetic, longer than two characters
    and not one of the generic URL fragments listed below.

    Args:
        tokens: Iterable of string tokens, or ``None`` (what a Python UDF
            receives for a null array cell).

    Returns:
        A new list with the surviving tokens in their original order; an
        empty list when ``tokens`` is ``None`` or empty.
    """
    if tokens is None:
        # A null cell would otherwise raise TypeError inside the UDF and fail the job.
        return []
    noise_tokens = ("http", "www", "https", "html", "ru", "com")
    return [t for t in tokens if t.isalpha() and len(t) > 2 and t not in noise_tokens]

class UrlTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that filters an array-of-tokens column with ``clean_url``.

    Reads the token array from ``inputCol`` and writes the filtered array to
    ``outputCol``.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer, setting only the column params that were given."""
        super(UrlTransformer, self).__init__()
        for value, setter in ((inputCol, self.setInputCol),
                              (outputCol, self.setOutputCol)):
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        """Return ``dataset`` with the cleaned token array added as the output column."""
        source_col = f.col(self.getInputCol())
        tokenize_udf = f.udf(clean_url, returnType=ArrayType(StringType()))
        cleaned_tokens = tokenize_udf(source_col)
        return dataset.withColumn(self.getOutputCol(), cleaned_tokens)

In [55]:
from pyspark.ml.feature import StringIndexer, IDF, RegexTokenizer, StopWordsRemover, HashingTF, IndexToString
from pyspark.ml import Pipeline

In [94]:
# Label encoders: map the string targets to numeric class indices.
indexer_age = StringIndexer(inputCol="age", outputCol="age_target")
indexer_gender = StringIndexer(inputCol="gender", outputCol="gender_target")

# Text features: split the URL string on punctuation/whitespace, drop noise
# tokens, hash term frequencies into 15000 buckets and re-weight with IDF.
tokenizer = RegexTokenizer(pattern="[\\p{Punct}\\s]+", inputCol='urls', outputCol='tokens')
cleaner = UrlTransformer(inputCol=tokenizer.getOutputCol(), outputCol='clean_tokens')
vect = HashingTF(numFeatures=15000, inputCol=cleaner.getOutputCol(), outputCol='features')
idf = IDF(inputCol=vect.getOutputCol(), outputCol='idf')

# NOTE: `pipeline` holds the fitted PipelineModel, not the Pipeline estimator.
pipeline = Pipeline(stages=[
    indexer_age,
    indexer_gender,
    tokenizer,
    cleaner,
    vect,
    idf,
]).fit(clean_df)

In [95]:
# Apply the fitted pipeline to the training users; adds the target indices
# and the 'idf' feature vector used by both classifiers below.
features = pipeline.transform(clean_df)

In [26]:
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [96]:
# Gradient-boosted trees for the gender target, trained on the IDF features.
gbt_gender = GBTClassifier(
    featuresCol="idf",
    labelCol="gender_target",
    predictionCol='gender_pred',
    maxIter=10,
)

gender_training_set = features.select('idf', 'gender_target')
model_gender = gbt_gender.fit(gender_training_set)

In [98]:
# Elastic-net regularised logistic regression for the age target.
# The raw-prediction and probability columns get an 'age_' prefix so they do
# not clash with the default column names of another classifier on the same frame.
lr_age = LogisticRegression(
    featuresCol="idf",
    labelCol="age_target",
    predictionCol='age_pred',
    probabilityCol="age_probability",
    rawPredictionCol="age_rawPrediction",
    regParam=0.01,
    elasticNetParam=0.2,
)

age_training_set = features.select('idf', 'age_target')
model_age = lr_age.fit(age_training_set)

In [99]:
! hdfs dfs -rm -R gender_gbt

21/03/23 17:02:51 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/gender_gbt' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/.Trash/Current/user/maria.godgildieva/gender_gbt1616508171347


In [100]:
! hdfs dfs -rm -R age_lr

21/03/23 17:02:54 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/age_lr' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/.Trash/Current/user/maria.godgildieva/age_lr1616508174786


In [101]:
# Persist both fitted models. overwrite() makes the cell re-runnable:
# a plain save() raises when the target path already exists.
model_gender.write().overwrite().save('gender_gbt')
model_age.write().overwrite().save('age_lr')

In [None]:
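# To reuse the saved models instead of retraining, load them with the fitted-model
# classes (requires: from pyspark.ml.classification import GBTClassificationModel, LogisticRegressionModel):
# model_gender = GBTClassificationModel.load('gender_gbt')
# model_age = LogisticRegressionModel.load('age_lr')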

In [102]:
!hdfs dfs -rm -R streaming/chk/chk_kafka

21/03/23 17:03:00 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/streaming/chk/chk_kafka' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/maria.godgildieva/.Trash/Current/user/maria.godgildieva/streaming/chk/chk_kafka1616508180121


In [103]:
def foreach_batch_function(batch_df, epoch_id):
    """Score one micro-batch of Kafka messages and publish the predictions to Kafka.

    Each row of ``batch_df`` carries a JSON string in ``value`` holding a
    ``uid`` and a JSON-encoded ``visits`` array of ``{url, timestamp}``
    objects. The visited URLs are joined per ``uid``, run through the fitted
    feature ``pipeline`` and both classifiers, and the decoded ``age`` and
    ``gender`` labels are written as JSON messages to the output topic.

    Args:
        batch_df: DataFrame for the current micro-batch; must contain a
            string column ``value``.
        epoch_id: Batch identifier passed by ``foreachBatch`` (unused).
    """
    message_schema = StructType([StructField('uid', StringType()),
                                 StructField('visits', StringType())])
    # `visits` is itself a JSON string inside the message, so it is parsed in a second pass.
    visits_schema = ArrayType(StructType([StructField('url', StringType()),
                                          StructField('timestamp', StringType())]))

    parsed = batch_df.withColumn('tmp', f.from_json('value', message_schema)) \
        .select(f.col('tmp.uid').alias('uid'),
                f.from_json('tmp.visits', visits_schema).alias('visits'))

    # One row per visit, then one '|'-joined URL string per user. Only the URLs
    # feed the model, so the visit timestamps are not extracted or collected.
    user_urls = parsed.select('uid', f.explode(f.col('visits')).alias('visits')) \
        .select('uid', f.col('visits').url.alias('url')) \
        .groupby('uid') \
        .agg(f.concat_ws("|", f.collect_list('url')).alias('urls'))

    # NOTE(review): the batch has no 'age'/'gender' columns, so this relies on the
    # StringIndexer stages of `pipeline` tolerating a missing input column - confirm
    # for the Spark version in use.
    batch_features = pipeline.transform(user_urls)

    pred = model_age.transform(model_gender.transform(batch_features))

    # Stages 0 and 1 of the fitted pipeline are the age and gender indexers; their
    # labels map the predicted class indices back to the original strings.
    age_labels = pipeline.stages[0].labels
    gender_labels = pipeline.stages[1].labels
    age_to_label = IndexToString(inputCol="age_pred", outputCol="age", labels=age_labels)
    gender_to_label = IndexToString(inputCol="gender_pred", outputCol="gender", labels=gender_labels)

    # Kafka expects the payload in a column named 'value'.
    res = gender_to_label.transform(age_to_label.transform(pred)) \
        .select('uid', 'age', 'gender') \
        .select(f.to_json(f.struct(f.col('*'))).alias('value'))

    write_kafka_params = {
        "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
        "topic": "maria.godgildieva"
    }
    res.write \
        .format('kafka') \
        .options(**write_kafka_params) \
        .mode('append') \
        .save()

In [104]:
def create_console_sink(df):
    """Build (but do not start) the streaming writer for ``df``.

    Despite the name, nothing is printed to the console: every micro-batch is
    handed to ``foreach_batch_function``, and progress is tracked in the
    ``streaming/chk/chk_kafka`` checkpoint directory.

    Args:
        df: Streaming DataFrame to consume.

    Returns:
        The configured stream writer; call ``.start()`` on it to launch the query.
    """
    batch_writer = df.writeStream.foreachBatch(foreach_batch_function)
    return batch_writer.option('checkpointLocation', 'streaming/chk/chk_kafka')

In [105]:
# Subscribe to the input topic from its earliest available offset.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_maria.godgildieva",
    "startingOffsets": "earliest",
}
kafka_reader = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
)
# Keep only the message value, cast to a string, for JSON parsing downstream.
kafka_df = kafka_reader.load().selectExpr("CAST(value AS STRING)")

In [106]:
# Sanity check: the Kafka source should be a streaming DataFrame.
kafka_df.isStreaming

True

In [107]:
# Configure the writer for the Kafka stream, then start the streaming query;
# `sq` is the handle to the running query.
sink = create_console_sink(kafka_df)
sq = sink.start()

In [108]:
# Check whether the streaming query is still running.
sq.isActive

True

In [109]:
# Run this cell only once the stream should end.
# Stop the streaming query first so it shuts down cleanly instead of failing
# when the SparkContext is torn down underneath it.
if sq.isActive:
    sq.stop()
sc.stop()