In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
# Tell PySpark which interpreter, Spark distribution and submit arguments to use.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's bundled Python sources; the py4j archive ends up first,
# immediately followed by the Spark `python` directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application through an explicit SparkConf, then reuse an existing
# session if one is already running or create a new one otherwise.
conf = SparkConf().set("spark.app.name", "lab004")

session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [4]:
import pandas as pd
import numpy as np

import urllib.parse

import pyspark.sql.functions as f

from datetime import datetime
from scipy import stats
from pyspark.sql import types as t
from pyspark.ml.feature import VectorAssembler, StringIndexer, CountVectorizer, Normalizer, StandardScaler, IndexToString

In [5]:
# Load the tab-separated training file (first line is the header).
df_train = spark.read.csv(
    '/labs/slaba04/gender_age_dataset.txt',
    sep='\t',
    header=True,
)

# Infer the structure of the `user_json` strings by letting Spark's JSON reader
# scan them, then reuse that schema to turn the text column into a struct.
user_json_strings = df_train.rdd.map(lambda row: row.user_json)
user_schema = spark.read.json(user_json_strings).schema

df_train = df_train.withColumn('user_json', f.from_json(f.col('user_json'), user_schema))

Filter out missing data

In [6]:
# Keep only rows where both labels are known ('-' marks a missing value).
df_train = (
    df_train
    .where(f.col('gender') != '-')
    .where(f.col('age') != '-')
)

Construct target

In [7]:
# Single multi-class label of the form "<gender>_<age>".
target_expr = f.concat(f.col('gender'), f.lit('_'), f.col('age'))
df_train = df_train.withColumn('target', target_expr)

## Feature engineering

In [8]:
@f.udf(t.FloatType())
def get_hour_avg(row):
    """Return the mean hour-of-day over a user's visit timestamps.

    `row` is an iterable of integer timestamps; each one is divided by 1000
    before conversion (presumably epoch milliseconds -- confirm against the data).
    Hours are taken in the local timezone of the Python worker, because
    `datetime.fromtimestamp` is called without a tz argument.
    """
    hours = []
    for millis in row:
        hours.append(datetime.fromtimestamp(millis // 1000).hour)
    return float(np.mean(hours))

@f.udf(t.IntegerType())
def get_hour_mode(row):
    """Return the most frequent hour-of-day among a user's visit timestamps.

    `row` is an iterable of integer timestamps; each one is divided by 1000
    before conversion (presumably epoch milliseconds -- confirm against the data).
    Hours are taken in the local timezone of the Python worker.

    Returns a Python `int` so the value matches the declared IntegerType
    (a float result would be stored as null by Spark), or None when there
    are no timestamps.
    """
    if not row:
        return None
    hours = [datetime.fromtimestamp(ts // 1000).hour for ts in row]
    # bincount + argmax yields the smallest hour among equally frequent ones,
    # i.e. the same tie-break scipy.stats.mode applies.
    return int(np.bincount(hours, minlength=24).argmax())

@f.udf(t.ArrayType(t.IntegerType()))
def get_hours(row):
    """Map each visit timestamp to its hour-of-day (0-23), preserving order.

    Each timestamp is divided by 1000 before conversion (presumably epoch
    milliseconds -- confirm against the data); hours are in the local
    timezone of the Python worker.
    """
    hours = []
    for millis in row:
        hours.append(datetime.fromtimestamp(millis // 1000).hour)
    return hours

@f.udf(t.ArrayType(t.StringType()))
def get_netloc(row):
    """Return the network-location (host) part of every URL in `row`, in order."""
    hosts = []
    for url in row:
        hosts.append(urllib.parse.urlparse(url).netloc)
    return hosts

@f.udf(t.IntegerType())
def get_morning(row):
    """Count the hours in `row` that fall in the morning window, 7 to 11 inclusive."""
    return sum(1 for hour in row if 7 <= hour <= 11)

@f.udf(t.IntegerType())
def get_day(row):
    """Count the hours in `row` that fall in the daytime window, 12 to 18 inclusive."""
    return sum(1 for hour in row if 12 <= hour <= 18)

@f.udf(t.IntegerType())
def get_evening(row):
    """Count the hours in `row` that fall in the evening window, 19 to 23 inclusive."""
    return sum(1 for hour in row if 19 <= hour <= 23)

@f.udf(t.IntegerType())
def get_night(row):
    """Count the hours in `row` that fall in the night window, 0 to 6 inclusive."""
    return sum(1 for hour in row if 0 <= hour <= 6)

In [9]:
# Column expressions for the nested visit fields, reused by several features below.
visit_timestamps = f.col('user_json.visits.timestamp')
visit_urls = f.col('user_json.visits.url')

# Per-user features: hour-of-day statistics, visit count, visited hosts and
# the number of visits in each part of the day (derived from `hour`).
df_train = (
    df_train
    .withColumn('hour', get_hours(visit_timestamps))
    .withColumn('hour_avg', get_hour_avg(visit_timestamps))
    .withColumn('hour_mode', get_hour_mode(visit_timestamps))
    .withColumn('visit_cnt', f.size(f.col('user_json.visits')))
    .withColumn('site', get_netloc(visit_urls))
    .withColumn('morning', get_morning(f.col('hour')))
    .withColumn('day', get_day(f.col('hour')))
    .withColumn('evening', get_evening(f.col('hour')))
    .withColumn('night', get_night(f.col('hour')))
)

## Train test split

In [10]:
# Per-class sampling fractions: rows per target divided by a fixed denominator.
# NOTE(review): 36138 is hard-coded -- presumably the total number of labelled
# rows; confirm. With this formula each class is later sampled with a probability
# equal to its own share of the data, which may not be the intended split ratio.
target_share = (
    df_train
    .groupby('target')
    .count()
    .withColumn('target_ratio', f.col('count') / 36138)
    .toPandas()
)

# {target label: fraction}, the mapping shape consumed by sampleBy.
tmp = dict(zip(target_share['target'].tolist(), target_share['target_ratio'].tolist()))

In [12]:
# Stratified sample by target using the per-class fractions in `tmp`; the
# result is cached because it is reused for the anti-join and for fitting.
train = df_train.stat.sampleBy("target", fractions=tmp, seed=5757).persist()

# Validation set = every user (by uid) that did not land in the training sample.
valid = df_train.join(train, on=["uid"], how="leftanti")

# Replace missing numeric values with 0 in both splits.
train, valid = train.fillna(0), valid.fillna(0)

In [13]:
# Bag-of-hosts representation: vocabulary is learnt on the training split only
# and then applied unchanged to both splits.
site_cnt = CountVectorizer(inputCol='site', outputCol='site_cnt')
site_cnt_model = site_cnt.fit(train)

train, valid = (site_cnt_model.transform(split) for split in (train, valid))

In [32]:
# Columns that are concatenated into the single `features` vector.
cols = [
    'visit_cnt',
    'site_cnt',
    'hour_avg',
    'hour_mode',
    'morning',
    'day',
    'evening',
    'night',
]
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# The assembled training frame is cached; the validation frame is not.
train_data = assembler.transform(train).persist()
valid_data = assembler.transform(valid)

In [33]:
# Encode the string target as a numeric `label`; the mapping is fitted on the
# training data and reused for validation (and later to decode predictions).
target_indxr = StringIndexer(inputCol="target", outputCol="label")
target2num = target_indxr.fit(train_data)

train_data, valid_data = (
    target2num.transform(split) for split in (train_data, valid_data)
)

In [34]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier

# Train

In [35]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [37]:
# Scaler that writes a centred/scaled copy of `features` to `features_scaled`.
std_scaler = StandardScaler(inputCol='features', outputCol='features_scaled', withMean=True)

# Candidate classifiers; both read the numeric `label` column.
rf = RandomForestClassifier(labelCol='label')
logreg = LogisticRegression(featuresCol='features', labelCol='label')

# NOTE(review): `logreg` reads `features`, not the scaler's `features_scaled`
# output, so the scaling stage does not influence the model in this pipeline.
# Confirm whether that is intended before switching the column: `withMean=True`
# produces dense vectors, which may be costly for the `site_cnt` counts.
pipeline_a = Pipeline(stages=[std_scaler, logreg])

evaluator_a = MulticlassClassificationEvaluator(labelCol="label", metricName='accuracy')

# 2 x 2 grid over regularisation strength and elastic-net mixing.
grid_builder_a = ParamGridBuilder()
grid_builder_a.addGrid(logreg.regParam, [0.3, 0.04])
grid_builder_a.addGrid(logreg.elasticNetParam, [0.8, 0.1])
paramGrid_a = grid_builder_a.build()

crossval_a = CrossValidator(
    estimator=pipeline_a,
    estimatorParamMaps=paramGrid_a,
    evaluator=evaluator_a,
    numFolds=3,
    parallelism=3,
)

In [42]:
# cv_model_a = crossval_a.fit(train_data)

# cv_model_a.avgMetrics

# cv_model_a.getEstimatorParamMaps()[np.argmax(cv_model_a.avgMetrics)]

Logistic regression

In [45]:
# std_model = std_scaler.fit(train_data)
# train_data = std_model.transform(train_data)

# valid_data = std_model.transform(valid_data)

In [None]:
# Fit the logistic regression directly on the assembled `features` column
# (no scaling stage is applied here), then score the validation split.
logreg_model = logreg.fit(train_data)
predictions_valid = logreg_model.transform(valid_data)

In [None]:
# Accuracy (the metric configured on evaluator_a) of the predictions currently
# held in predictions_valid.
evaluator_a.evaluate(predictions_valid)

In [None]:
# Persist the fitted logistic regression with Spark's ML writer, replacing any
# previous save at this path. NOTE(review): despite the `.pkl` suffix this is
# Spark's own model format, not a Python pickle.
logreg_model.write().overwrite().save('logreg_4.pkl')

Random forest

In [None]:
# Fit the random forest on the training frame and score the validation split.
# This overwrites predictions_valid from the logistic-regression cell.
rf_model = rf.fit(train_data)
predictions_valid = rf_model.transform(valid_data)

In [None]:
# Accuracy (the metric configured on evaluator_a) of the predictions currently
# held in predictions_valid.
evaluator_a.evaluate(predictions_valid)

In [None]:
# Persist the fitted random forest from this section, replacing any previous
# save at this path. No `cv_model_g` is created anywhere in this notebook, so
# referring to it here raised a NameError.
# NOTE(review): despite the `.pkl` suffix this is Spark's own model format.
rf_model.write().overwrite().save('model_4.pkl')

# Kafka

In [None]:
# Inverse of the target StringIndexer: turns the numeric `prediction` back into
# the original "<gender>_<age>" string using the labels learnt by target2num.
num2target = IndexToString(
    inputCol="prediction",
    outputCol="prediction_labels",
    labels=target2num.labels,
)

In [48]:
# Options for the Kafka source: read only new messages from the input topic
# and do not fail the query when data is reported lost.
read_kafka_params = {
    "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667",
    "subscribe": "input_tatiana.inkhireeva",
    "startingOffsets": "latest",
    "failOnDataLoss": "False",
}

kafka_reader = spark.readStream.format("kafka").options(**read_kafka_params)
kafka_sdf = kafka_reader.load()

In [49]:
def foreach_batch_function(df, epoch_id):
    """Score one micro-batch of Kafka messages and publish the predictions.

    Steps: decode the Kafka `value`, extract `uid` and `visits`, rebuild the
    same features used for training, apply the fitted CountVectorizer,
    VectorAssembler and logistic-regression model, decode the predicted label
    into `gender` / `age`, and write `{"uid", "gender", "age"}` JSON to the
    output topic.

    Args:
        df: the micro-batch DataFrame read from the Kafka source.
        epoch_id: batch identifier supplied by `foreachBatch`; not used.

    Relies on notebook globals: user_schema, the get_* UDFs, site_cnt_model,
    assembler, logreg_model and num2target.

    NOTE(review): assumes every message is a JSON object with `uid` and
    `visits`, where `visits` has the same element structure as
    `user_json.visits` in the training file -- confirm against the topic.
    NOTE(review): later cells define `foreach_batch_function` again; whichever
    definition was executed last is the one the sink picks up.
    """
    write_kafka_params = {
        "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667",
        "topic": "tatiana.inkhireeva",
    }

    # Kafka delivers `value` as bytes: decode it and pull the two top-level
    # JSON fields out as strings.
    df_parsed = (
        df
        .withColumn('value', f.col("value").cast("string"))
        .select(
            'timestamp',
            f.json_tuple(f.col("value"), "uid", "visits").alias("uid", "visits"),
        )
        .withColumn('uid', f.col('uid').cast(t.StringType()))
    )

    # json_tuple returns `visits` as JSON text; parse it with the type of the
    # `visits` field from the schema inferred on the training data. (The batch
    # has no `user_json` column, so the field is parsed directly.)
    visits_type = user_schema['visits'].dataType
    df_parsed = df_parsed.withColumn('visits', f.from_json('visits', visits_type))

    # Same feature set as in training, built from the parsed visits.
    visit_ts = f.col('visits.timestamp')
    df_parsed = (
        df_parsed
        .withColumn('hour', get_hours(visit_ts))
        .withColumn('hour_avg', get_hour_avg(visit_ts))
        .withColumn('hour_mode', get_hour_mode(visit_ts))
        .withColumn('visit_cnt', f.size(f.col('visits')))
        .withColumn('site', get_netloc(f.col('visits.url')))
        .withColumn('morning', get_morning(f.col('hour')))
        .withColumn('day', get_day(f.col('hour')))
        .withColumn('evening', get_evening(f.col('hour')))
        .withColumn('night', get_night(f.col('hour')))
    )

    df_parsed = df_parsed.na.fill(0)
    df_parsed = site_cnt_model.transform(df_parsed)
    df_parsed = assembler.transform(df_parsed)

    # The target StringIndexer is deliberately not applied: incoming messages
    # carry no `target` column, only the model's output needs decoding.
    df_parsed = logreg_model.transform(df_parsed)
    df_parsed = num2target.transform(df_parsed)

    # Predicted label has the form "<gender>_<age>".
    split_col = f.split(df_parsed['prediction_labels'], '_')
    df_parsed = (
        df_parsed
        .withColumn('gender', split_col.getItem(0))
        .withColumn('age', split_col.getItem(1))
        .select('uid', 'gender', 'age')
    )

    # The Kafka sink expects a `value` column; send the prediction as JSON.
    out = df_parsed.select(
        f.to_json(f.struct(['uid', 'gender', 'age'])).alias("value")
    )

    out.write \
        .format("kafka") \
        .options(**write_kafka_params) \
        .save()

In [247]:
def foreach_batch_function(df, epoch_id):
    """Pass-through variant: write the micro-batch to the output topic unchanged.

    Args:
        df: the micro-batch DataFrame read from the Kafka source.
        epoch_id: batch identifier supplied by `foreachBatch`; not used.

    NOTE(review): this shares its name with the scoring function defined in an
    earlier cell and replaces it once this cell is executed.
    """
    kafka_sink_options = {
        "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667",
        "topic": "tatiana.inkhireeva",
    }

    batch_writer = df.write.format("kafka").options(**kafka_sink_options)
    batch_writer.save()

In [None]:
def foreach_batch_function(df, epoch_id):
    """Work-in-progress variant: parse the batch, but still publish it unchanged.

    Args:
        df: the micro-batch DataFrame read from the Kafka source.
        epoch_id: batch identifier supplied by `foreachBatch`; not used.

    NOTE(review): this shares its name with the earlier definitions and
    replaces them once this cell is executed.
    """
    write_kafka_params = {
        "kafka.bootstrap.servers": "spark-de-node-1.newprolab.com:6667",
        "topic": "tatiana.inkhireeva",
    }

    # Decode the Kafka payload and extract `uid` / `visits` as strings.
    # The expression is parenthesised instead of using line continuations.
    # NOTE(review): df_parsed is built but not used yet; the raw batch is what
    # gets written below.
    df_parsed = (
        df.withColumn('value', f.col("value").cast("string").alias("value"))
        .select('timestamp', f.json_tuple(f.col("value"), "uid", "visits").alias("uid", "visits"))
        .withColumn('uid', f.col('uid').cast(t.StringType()))
    )

    df.write \
        .format("kafka") \
        .options(**write_kafka_params) \
        .save()

In [50]:
def create_sink(df):
    """Build (but do not start) the streaming writer for `df`.

    Every micro-batch is handed to `foreach_batch_function`, in append output
    mode, with progress tracked under a fixed checkpoint location.

    Returns:
        The configured stream writer; call `.start()` on it to launch the query.
    """
    writer = df.writeStream.foreachBatch(foreach_batch_function)
    writer = writer.outputMode('append')
    return writer.option("checkpointLocation", "streaming/chk/chk_kafka_inkhireeva_tatiana_lab04")

# Configure the writer for the Kafka input stream.
sink = create_sink(kafka_sdf)

# Launch the streaming query. The returned query handle is only displayed, not
# stored; running queries can still be reached via the session's active streams.
sink.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fcebc326668>

In [51]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query, using the description of its
    first source when available and falling back to the query name or id.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        # lastProgress is None until the query has reported a first progress
        # update; the description is only cosmetic and must not prevent the stop.
        progress = s.lastProgress or {}
        sources = progress.get("sources") or []
        if sources:
            desc = sources[0].get("description", s.name or s.id)
        else:
            desc = s.name or s.id
        s.stop()
        print("Stopped {s}".format(s=desc))

In [52]:
# Shut down all streaming queries that are still running in this session.
kill_all()

Stopped KafkaV2[Subscribe[input_tatiana.inkhireeva]]
