In [148]:
import os
import sys
# Environment for the PySpark driver. These are set before pyspark's shell.py
# is executed at the bottom of this cell.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and its bundled py4j importable.
# NOTE(review): the py4j version is hard-coded; presumably it must match the zip
# shipped under SPARK_HOME/python/lib - confirm when the Spark client is upgraded.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute pyspark's shell bootstrap in this namespace (the cell output shows it
# provides `spark`). The file is opened in a `with` block so the handle is closed
# instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [149]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row

from pyspark.ml.feature import HashingTF, StandardScaler, OneHotEncoder, RegexTokenizer, VectorAssembler, IndexToString, StringIndexer, CountVectorizer, IDF
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.window import Window
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import json

# A SparkConf with no settings overridden in this cell.
conf = SparkConf()

# Obtain the session (getOrCreate) under the application name "test".
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .appName("test") \
    .getOrCreate()

In [150]:
# Partition count constant (not referenced by the other visible cells).
PARTITION_NUM = 4

# Kafka broker address and the topics this notebook reads from / writes to.
KAFKA_CONNECT = 'spark-master-1.newprolab.com:6667'
INPUT_TOPIC = 'input_konstantin.kulushev'
OUTPUT_TOPIC = 'konstantin.kulushev'

# Options passed to the Kafka reader via .options(**read_kafka_params).
read_kafka_params = dict([
    ("kafka.bootstrap.servers", KAFKA_CONNECT),
    ("subscribe", INPUT_TOPIC),
    ("startingOffsets", "latest"),
    ("failOnDataLoss", False),
])

# Options passed to the Kafka writer via .options(**write_kafka_params).
write_kafka_params = dict([
    ("kafka.bootstrap.servers", KAFKA_CONNECT),
    ("topic", OUTPUT_TOPIC),
])

Prepare

In [177]:
# Array-of-strings type (not referenced by the other visible cells).
schema = ArrayType(StringType())

# Regex handed to the RegexTokenizer: a match starts at "//" and runs up to the
# next "/", ":" or newline.
pattern = r'\/\/[^@\/\n]+(?:www\.)?(?:[^:\/\n]+)'

# Tab-separated training file with a header row; rows whose gender is '-' are dropped.
train = (
    spark.read
    .csv('/labs/slaba04/gender_age_dataset.txt', header=True, sep='\t')
    .filter(F.col('gender') != '-')
)
#.withColumn('visits', F.get_json_object(F.col('user_json'), '$.visits')) \
#.drop('user_json')

In [172]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCol, HasInputCol, HasOutputCols
# self.getInputCol()

class CountArrayLenTransformer(Transformer, HasInputCol, HasOutputCol):
    """Append a column holding ``F.size`` of an array column.

    Params:
        inputCol: name of the column whose size is taken.
        outputCol: name of the column that receives the size.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(CountArrayLenTransformer, self).__init__()
        # Apply only the params the caller actually supplied.
        supplied = ((inputCol, self.setInputCol), (outputCol, self.setOutputCol))
        for value, setter in supplied:
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        """Return ``dataset`` with the size of the input column added as the output column."""
        target_name = self.getOutputCol()
        element_count = F.size(F.col(self.getInputCol()))
        return dataset.withColumn(target_name, element_count)
    
class SearchFreqTransformer(Transformer, HasInputCol, HasOutputCol):
    """Flag rows whose array column contains any of the URLs listed in ``site``.

    The output column is 1 when at least one entry of ``site`` is an element of
    the input array, and 0 otherwise (including when the array is null).

    Previously only the first entry of ``site`` was checked, through a duplicated
    string literal, so the remaining entries of the list had no effect.

    Params:
        inputCol: name of the array column to search.
        outputCol: name of the 0/1 flag column to add.

    NOTE(review): matching is exact string equality via ``array_contains``. If the
    input tokens come from a regex that starts matching at "//", they will not
    carry the "http:" prefix used in ``site`` - verify against the real tokens.
    """
    # Not read by this transformer; kept because it is part of the class's public attributes.
    search_engines = ['yandex', 'google', 'yahoo']
    # URLs whose presence in the input array sets the flag to 1.
    site = ['http://www.pornhub.com', 'http://www.xvideos.com', 'http://ruxvideos.ru']

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(SearchFreqTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the 0/1 flag added as the output column."""
        visited = F.col(self.getInputCol())
        # OR together one array_contains test per listed site; starting from a
        # literal False keeps the expression valid even if `site` is empty.
        matched = F.lit(False)
        for url in self.site:
            matched = matched | F.array_contains(visited, url)
        return dataset.withColumn(self.getOutputCol(), F.when(matched, 1).otherwise(0))

    
class CutterArrayLenTransformer(Transformer, HasInputCol, HasOutputCol):
    """Apply ``substring(x, 2, 100)`` to every element of a string-array column.

    The trimmed array is written to the output column; all other columns and all
    rows of the input dataset are kept.

    The previous body could not be parsed (a method chain continued without a line
    continuation and ``withColumn`` was called without a column expression) and
    used an unimported ``col``. Its explode / groupBy / collect_list shape would
    also have discarded every column except the rebuilt array. The element-wise
    SQL ``transform`` function (Spark 2.4+) does the same trimming in place.

    Params:
        inputCol: name of the array column to trim.
        outputCol: name of the column that receives the trimmed array.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(CutterArrayLenTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the trimmed array added as the output column."""
        # Quote the column name for SQL; a backtick inside the name is escaped by doubling it.
        quoted_input = '`{}`'.format(self.getInputCol().replace('`', '``'))
        # substring is 1-based: start at the 2nd character, keep at most 100 characters.
        trimmed = F.expr('transform({}, x -> substring(x, 2, 100))'.format(quoted_input))
        return dataset.withColumn(self.getOutputCol(), trimmed)

In [173]:
# Tokenise the raw JSON string: with gaps=False the regex matches themselves are the tokens.
regex_tokenizer = RegexTokenizer(
    inputCol='user_json',
    outputCol='domains',
    gaps=False,
    pattern=pattern,
)

# Hashed term frequencies over the tokens, then IDF weighting into 'features'.
tf = HashingTF(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol='tf_features',
    numFeatures=15000,
)
idf = IDF(
    inputCol=tf.getOutputCol(),
    outputCol='features',
)

# Custom transformers over the token array.
mytr1 = CountArrayLenTransformer(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol='sites_cnt',
)
mytr2 = SearchFreqTransformer(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol='porn_sites_cnt',
)

# Count-based alternative to the hashed features.
cv = CountVectorizer(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol='cv_features',
    binary=False,
)
#scaler = StandardScaler(inputCol=cv.getOutputCol(), outputCol='scaled_features', withStd=True, withMean=True)

# Index the string labels in ascending alphabetical order.
# StringIndexer's default order is frequency-descending, so index 0 would be the
# most common label. The IndexToString converters in this cell decode predictions
# with hard-coded, alphabetically ordered label lists
# (['18-24', '25-34', '35-44', '45-54', '>=55'] and ['F', 'M']), so the indices
# must follow the same order or predictions are decoded to the wrong labels.
# NOTE(review): assumes `age` only takes those five values once rows with
# gender '-' are filtered out - confirm against the training data.
indexer_age = StringIndexer(inputCol="age", outputCol="age_label", stringOrderType="alphabetAsc")
indexer_gender = StringIndexer(inputCol="gender", outputCol="gender_label", stringOrderType="alphabetAsc")

# Wraps the count-vectorizer output into a 'features' vector.
assembler = VectorAssembler(inputCols=['cv_features'], outputCol='features')

# Logistic-regression candidates, one per target.
lr_age = LogisticRegression(
    labelCol=indexer_age.getOutputCol(),
    featuresCol='features',
    predictionCol='pred_age',
    probabilityCol='prob_age',
    rawPredictionCol='rawPred_age',
    regParam=0.4,
    elasticNetParam=0.1,
)
lr_gender = LogisticRegression(
    labelCol=indexer_gender.getOutputCol(),
    featuresCol='features',
    predictionCol='pred_gender',
    probabilityCol='prob_gender',
    rawPredictionCol='rawPred_gender',
    regParam=0.2,
    elasticNetParam=0.1,
)

# Gradient-boosted trees candidate for gender.
gbt = GBTClassifier(
    labelCol=indexer_gender.getOutputCol(),
    featuresCol="features",
    maxDepth=7,
)

# Random forests, one per target; each writes its own prediction/probability columns
# so both can run over the same DataFrame.
rfc_age = RandomForestClassifier(
    labelCol=indexer_age.getOutputCol(),
    featuresCol='features',
    predictionCol='pred_age',
    probabilityCol='prob_age',
    rawPredictionCol='rawPred_age',
    maxDepth=7,
)

rfc_gender = RandomForestClassifier(
    labelCol=indexer_gender.getOutputCol(),
    featuresCol='features',
    predictionCol='pred_gender',
    probabilityCol='prob_gender',
    rawPredictionCol='rawPred_gender',
    maxDepth=7,
)

# Decode the random-forest prediction indices back to label strings.
# NOTE(review): `labels` is positional - entry i is emitted for prediction index i,
# so each list must be in the same order as the indices of the matching StringIndexer.
converter_age = IndexToString(
    inputCol=rfc_age.getPredictionCol(),
    outputCol='age_pred',
    labels=['18-24', '25-34', '35-44', '45-54', '>=55'],
)
converter_gender = IndexToString(
    inputCol=rfc_gender.getPredictionCol(),
    outputCol='gender_pred',
    labels=['F', 'M'],
)




In [154]:
# Text -> tokens -> hashed TF -> IDF 'features'.
pipe_prepare = Pipeline(stages=[
    regex_tokenizer,
    tf,
    idf,
])

# String labels -> numeric label indices.
pipe_index = Pipeline(stages=[
    indexer_age,
    indexer_gender,
])

# One classifier + label decoder per target.
pipe_age = Pipeline(stages=[
    rfc_age,
    converter_age,
])
pipe_gender = Pipeline(stages=[
    rfc_gender,
    converter_gender,
])

# Full pipeline, fitted on the training DataFrame.
pipeline = Pipeline(stages=[
    pipe_prepare,
    pipe_index,
    pipe_age,
    pipe_gender,
])
model = pipeline.fit(train)

In [None]:
#model_age.write().overwrite().save('models/model_age')
#model_gender.write().overwrite().save('models/model_gender')

Streams

In [107]:
def foreach_batch_function(df, epoch_id):
    """Score one micro-batch with ``model`` and publish the result to Kafka.

    Intended as a ``DataStreamWriter.foreachBatch`` callback. Spark ignores the
    value such a callback returns, so the previous version (which only returned
    the frame) never published anything; the batch is now written here.

    Args:
        df: DataFrame with the columns the fitted ``model`` expects plus ``uid``.
        epoch_id: micro-batch identifier supplied by Spark (unused).

    Returns:
        A single-column (``value``) DataFrame of JSON strings with the keys
        ``uid``, ``gender`` and ``age`` - the same frame the function returned
        before.
    """
    predictions = model.transform(df)
    results = (
        predictions
        .select(F.col('uid'), F.col('gender_pred').alias('gender'), F.col('age_pred').alias('age'))
        .withColumn('value', F.to_json(F.struct(['uid', 'gender', 'age'])))
        .select('value')
    )

    # A batch write is only valid on a non-streaming frame, which is what
    # foreachBatch passes in. When called directly with a streaming frame, keep
    # the old behaviour and just hand the frame back.
    if not df.isStreaming:
        results.write.format('kafka').options(**write_kafka_params).save()

    return results

In [155]:
# Streaming source over the input topic.
kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)

#test = spark.read.format('kafka').options(**read_kafka_params).load()
# Cast the Kafka `value` to a string (the raw user JSON) and extract the uid from it.
test = (
    kafka_sdf
    .select(F.col('value').cast('string').alias('user_json'))
    .withColumn('uid', F.get_json_object(F.col('user_json'), '$.uid'))
)

# Score the stream with the fitted pipeline.
predictions = model.transform(test)

# Keep uid plus the decoded predictions and serialise them into a JSON `value` column.
results = (
    predictions
    .select(F.col('uid'), F.col('gender_pred').alias('gender'), F.col('age_pred').alias('age'))
    .withColumn('value', F.to_json(F.struct(['uid', 'gender', 'age'])))
)

# Start the streaming query that publishes `value` to the output topic.
(
    results.select('value')
    .writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "kkv/chk_lab04")
    .outputMode("append")
    .start()
)  #.awaitTermination()

# test.writeStream \
#     .foreachBatch(foreach_batch_function) \
#     .format("kafka") \
#     .options(**write_kafka_params) \
#     .option("checkpointLocation", "kkv/chk_lab04") \
#     .outputMode("append").start()


<pyspark.sql.streaming.StreamingQuery at 0x7f44f8ff1908>

In [158]:
# Show the streaming queries that are currently active on the session.
(
    SparkSession.builder
    .getOrCreate()
    .streams
    .active
)

[]

In [157]:
def kill_all(session=None):
    """Stop every active streaming query and print one line per stopped query.

    Args:
        session: object exposing ``streams.active`` (a SparkSession). When
            omitted, ``SparkSession.builder.getOrCreate()`` is used, which is
            what the function always did before.

    The printed label is the description of the query's first source when the
    query has reported progress; otherwise the query name, then the query id.
    Previously a query with no progress yet (``lastProgress`` is ``None``) raised
    ``TypeError`` before ``stop()`` was reached, leaving it running.
    """
    if session is None:
        session = SparkSession.builder.getOrCreate()

    # Copy the list: stopping a query changes the set of active queries.
    for query in list(session.streams.active or []):
        progress = query.lastProgress or {}
        sources = progress.get("sources") or [{}]
        desc = sources[0].get("description") or query.name or query.id
        query.stop()
        print("Stopped {s}".format(s=desc))
            
# Stop all streaming queries that are still active on the session.
kill_all()

Stopped KafkaV2[Subscribe[input_konstantin.kulushev]]


In [178]:
# Shut down the Spark session; later cells need a new session to run.
spark.stop()