In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation used by this
# notebook, and set the resources passed to spark-submit when the shell
# script below creates the session.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the bundled pyspark and py4j packages importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap in the notebook namespace (it defines `spark`).
# The context manager closes the script file once it has been read.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import Row
import json
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as f
import re
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, rank, lit, split,  udf, desc, when, concat_ws, lower

# Build the notebook's SparkSession from a default SparkConf; getOrCreate()
# returns an already-running session if one exists.
conf = SparkConf()
spark = SparkSession.builder.config(conf=conf).appName("rez_test").getOrCreate()

In [3]:
# Load the tab-delimited dataset, taking column names from the header line.
logs = (
    spark.read
    .options(header="true", delimiter="\t")
    .csv("/labs/slaba04/gender_age_dataset.txt")
)

# Preview five rows: no truncation, one field per line.
logs.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
# Alias the loaded frame as `df`; the feature-preparation cell below reads it
# under this name.
df = logs

In [5]:
# List the HDFS directory that holds the input dataset.
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [6]:
def splitter2(object):
    """Return the host of every http(s) URL found in a raw visits string.

    The text is cut with fixed offsets (12 leading and 1 trailing character)
    and split into one chunk per visit. This presumably matches a
    `{"visits": [{"url": ..., "timestamp": ...}, ...]}` layout -- TODO confirm
    against the dataset.

    Args:
        object: Raw visits text. ``None`` or an empty string yields ``[]``.

    Returns:
        list of str: one host per visit chunk that contains an ``http://`` or
        ``https://`` URL, in input order. Chunks without such a URL are
        skipped.
    """
    if not object:
        return []

    visit_chunks = object[12:-1].replace('}', '}]')[:-2].split('], ')

    hosts = []
    for chunk in visit_chunks:
        # The leftmost scheme wins, so an https URL that carries "http://"
        # inside its query string resolves to its own host. The host ends at
        # the first slash, double quote or backslash, so a URL without a path
        # does not drag the rest of the JSON record into the token.
        found = re.search(r'https?://([^/"\\]*)', chunk)
        if found:
            hosts.append(found.group(1))
    return hosts
# Wrap the parser as a Spark UDF returning array<string>. From here on the
# name `splitter2` refers to the UDF, not the plain Python function.
splitter2 = udf(splitter2, ArrayType(StringType()))

In [7]:
# Drop rows whose age is '-' and replace the raw visits text with the list of
# visited hosts.
df2 = (
    df
    .filter(f.col('age') != '-')
    .select(
        'gender',
        'age',
        'uid',
        splitter2('user_json').alias('user_json'),
    )
)

In [8]:
# Hash each host list into a 10000-dimensional binary (presence/absence)
# feature vector stored in column `url`.
hasher_freq = HashingTF(
    inputCol='user_json',
    outputCol="url",
    numFeatures=10000,
    binary=True,
)
dataset2_freq = hasher_freq.transform(df2)

In [9]:
# Gender string -> integer class label.
dic1 = dict(F=1, M=0)

In [10]:
@f.pandas_udf(IntegerType())
def gender_find(object):
    """Vectorized UDF: map each gender string to its integer code from `dic1`.

    A value that is not a key of `dic1` raises KeyError.
    """
    def encode(gender):
        return dic1[gender]

    return object.apply(encode)

In [11]:
# Age bucket -> integer class label (0 for the oldest bucket, 4 for the youngest).
dic2 = {
    bucket: code
    for code, bucket in enumerate(['>=55', '45-54', '35-44', '25-34', '18-24'])
}

In [12]:
@f.pandas_udf(IntegerType())
def age_find(object):
    """Vectorized UDF: map each age-bucket string to its integer code from `dic2`.

    A value that is not a key of `dic2` raises KeyError.
    """
    def encode(age_bucket):
        return dic2[age_bucket]

    return object.apply(encode)

In [13]:
# Add the integer-encoded targets next to the original string columns.
dataset3 = (
    dataset2_freq
    .withColumn('gender2', gender_find('gender'))
    .withColumn('age2', age_find('age'))
)

In [14]:
# Preview the encoded dataset (original columns, hashed features, integer labels).
dataset3.show()

+------+-----+--------------------+--------------------+--------------------+-------+----+
|gender|  age|                 uid|           user_json|                 url|gender2|age2|
+------+-----+--------------------+--------------------+--------------------+-------+----+
|     F|18-24|d50192e5-c44e-4ae...|[zebra-zoya.ru, n...|(10000,[1497,8072...|      1|   4|
|     M|25-34|d502331d-621e-472...|[sweetrading.ru, ...|(10000,[38,741,79...|      0|   3|
|     F|25-34|d50237ea-747e-48a...|[ru.oriflame.com,...|(10000,[1026,4702...|      1|   3|
|     F|25-34|d502f29f-d57a-46b...|[translate-tattoo...|(10000,[2622,7172...|      1|   3|
|     M| >=55|d503c3b2-a0c2-4f4...|[mail.rambler.ru,...|(10000,[184,225,3...|      0|   0|
|     F|25-34|d5090ddf-5648-487...|[cfire.mail.ru, p...|(10000,[392,444],...|      1|   3|
|     F|25-34|d50bcef8-16ff-4e8...|[www.msn.com, www...|(10000,[2094,2951...|      1|   3|
|     F|18-24|d50e23dc-0cbd-488...|[www.gazprom.ru, ...|(10000,[172,651,1...|      1|   4|

In [15]:
from pyspark.ml.classification import GBTClassifier

In [16]:
# Gradient-boosted trees estimator; it is fitted on the gender labels in a
# later cell.
gb = GBTClassifier(
    labelCol='label',
    featuresCol='features',
    maxIter=20,
    maxDepth=4,
    maxBins=32,
)

In [17]:
# Training frame for the age model: integer age class as `label`, hashed
# hosts as `features`.
dataset_age = dataset3.select(
    col('age2').alias('label'),
    col('url').alias('features'),
)

In [24]:
# Class balance of the age labels.
dataset_age.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4744|
|    3|15457|
|    4| 4898|
|    2| 9360|
|    0| 1679|
+-----+-----+



In [30]:
# Build a class-balanced subset: up to 1000 rows for each of the five age
# classes, unioned in label order 0..4.
dataset_age2 = dataset_age.filter("label=0").limit(1000)
for age_class in (1, 2, 3, 4):
    dataset_age2 = dataset_age2.union(
        dataset_age.filter("label={}".format(age_class)).limit(1000)
    )

# Confirm the per-class counts of the subset.
dataset_age2.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 1000|
|    3| 1000|
|    4| 1000|
|    2| 1000|
|    0| 1000|
+-----+-----+



In [18]:
# Training frame for the gender model: integer gender code as `label`, hashed
# hosts as `features`.
dataset_gender = dataset3.select(
    col('gender2').alias('label'),
    col('url').alias('features'),
)

In [19]:
# Preview the gender training frame.
dataset_gender.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(10000,[1497,8072...|
|    0|(10000,[38,741,79...|
|    1|(10000,[1026,4702...|
|    1|(10000,[2622,7172...|
|    0|(10000,[184,225,3...|
|    1|(10000,[392,444],...|
|    1|(10000,[2094,2951...|
|    1|(10000,[172,651,1...|
|    1|(10000,[3252,5553...|
|    1|(10000,[585,2521,...|
|    1|(10000,[450,1068,...|
|    1|(10000,[8726,9157...|
|    0|(10000,[452,489,4...|
|    1|(10000,[428,3582,...|
|    1|(10000,[1239,2523...|
|    0| (10000,[876],[1.0])|
|    0|(10000,[4602],[1.0])|
|    1|(10000,[111,1108,...|
|    1|(10000,[5482],[1.0])|
|    1|(10000,[7525],[1.0])|
+-----+--------------------+
only showing top 20 rows



In [20]:
%%time
# Fit the gradient-boosted trees estimator on the full gender-labelled frame.
gbt_model = gb.fit(dataset_gender)

CPU times: user 124 ms, sys: 52.5 ms, total: 176 ms
Wall time: 3min 19s


In [21]:
# Score the gender model on the gender-labelled frame so that the carried
# `label` column is the target this model predicts. `dataset_age` holds the
# same `url` feature vectors but age classes in `label`, which would make any
# evaluation of these predictions against `label` meaningless.
gbpredictions = gbt_model.transform(dataset_gender)

In [23]:
# Distribution of the predicted gender classes.
gbpredictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|11097|
|       1.0|25041|
+----------+-----+



In [None]:
# print('Accuracy:', gbevaluator.evaluate(gbpredictions))

In [25]:
from pyspark.ml.classification import RandomForestClassifier

In [31]:
# Random-forest estimator for the age classes, with a fixed seed.
forest_age = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    maxDepth=4,
    maxBins=32,
    seed=44,
)

In [33]:
%%time
# Fit the random forest on the class-balanced age subset.
# NOTE(review): this rebinds `forest_age` from the estimator to the fitted
# model, so the cell is not idempotent -- re-running it without first
# re-running the cell that builds the estimator calls `.fit` on the model.
# Later cells use `forest_age` as the fitted model.
forest_age = forest_age.fit(dataset_age2)

CPU times: user 24.9 ms, sys: 12.4 ms, total: 37.3 ms
Wall time: 41.6 s


In [34]:
# Score the fitted age model on the full (unbalanced) age frame.
prediction = forest_age.transform(dataset_age)

In [35]:
# Distribution of the predicted age classes.
prediction.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 4083|
|       1.0| 2934|
|       4.0| 8917|
|       3.0|17204|
|       2.0| 3000|
+----------+-----+



In [36]:
# Kafka connection settings.
# NOTE(review): these constants are not referenced by the Kafka parameter
# dicts later in this notebook, which hardcode
# 'spark-master-1.newprolab.com:6667' as the bootstrap server rather than the
# 'spark-node-1' host below -- confirm which broker address is intended.
KAFKA_BOOTSTRAP_SERVER = 'spark-node-1.newprolab.com:6667'
KAFKA_INPUT_TOPIC = 'input_elena.pavlyuk'
KAFKA_OUTPUT_TOPIC = 'elena.pavlyuk'

In [37]:
@f.pandas_udf(StringType())
def uid_find(object):
    """Vectorized UDF: cut the uid out of each raw message string.

    Drops the first 9 characters and keeps the text before the first
    occurrence of '", "vi'. Presumably the message starts with `{"uid": "`
    followed by the uid and a "visits" key -- TODO confirm the message layout.
    """
    def extract_uid(message):
        return message[9:].split('", "vi')[0]

    return object.apply(extract_uid)

In [38]:
def splitter3(object):
    """Return the host of every http(s) URL found in a raw Kafka message.

    The text is cut with fixed offsets (12 leading characters, then 1 and 4
    trailing characters) and split into one chunk per visit. The offsets
    presumably match the layout of the messages on the input topic -- TODO
    confirm against a real message.

    Args:
        object: Raw message text. ``None`` or an empty string yields ``[]``.

    Returns:
        list of str: one host per visit chunk that contains an ``http://`` or
        ``https://`` URL, in input order. Chunks without such a URL are
        skipped.
    """
    if not object:
        return []

    visit_chunks = object[12:-1].replace('}', '}]')[:-4].split('], ')

    hosts = []
    for chunk in visit_chunks:
        # The leftmost scheme wins, so an https URL that carries "http://"
        # inside its query string resolves to its own host. The host ends at
        # the first slash, double quote or backslash, so a URL without a path
        # does not drag the rest of the record into the token.
        found = re.search(r'https?://([^/"\\]*)', chunk)
        if found:
            hosts.append(found.group(1))
    return hosts
# Wrap the parser as a Spark UDF returning array<string>. From here on the
# name `splitter3` refers to the UDF, not the plain Python function.
splitter3 = udf(splitter3, ArrayType(StringType()))

In [39]:
# Parameters for reading the "elena.pavlyuk" topic from its latest offsets.
# NOTE(review): the read that would use this dict is commented out below, and
# `read_kafka_params` is reassigned in the next cell, so this definition has
# no effect on the pipeline as written.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "elena.pavlyuk",
    "startingOffsets": "latest"
}
#kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

In [40]:
# Streaming inference: read raw messages from the input topic, score them with
# both fitted models, and publish one JSON record {"uid", "gender", "age"} per
# message to the output topic.
# NOTE(review): the bootstrap server is hardcoded here and differs from the
# KAFKA_BOOTSTRAP_SERVER constant defined earlier -- confirm which is intended.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_elena.pavlyuk",
    "startingOffsets": "latest"
}
# A dedicated name keeps the batch `df` used by the training cells intact, so
# those cells can still be re-run after this one.
kafka_raw = spark.readStream.format("kafka").options(**read_kafka_params).load()
df_kafka = kafka_raw.selectExpr("CAST(value AS STRING)")

# Derive the model input (visited hosts) and the record key (uid) from the
# message text.
df_kafka = (df_kafka
            .withColumn('user_json', splitter3(col('value')))
            .withColumn('uid', uid_find(col('value'))))

kafka_features = hasher_freq.transform(df_kafka).withColumnRenamed('url', 'features')

# Score both models on the SAME stream. Joining two separately scored copies
# of the stream on `uid` is a stream-stream join: it keeps unbounded state
# without a watermark and multiplies rows whenever a uid repeats. Each model's
# output is narrowed before the next transform so its `prediction` (and other
# output) columns do not collide.
kafka_scored = (forest_age.transform(kafka_features)
                .select('uid', 'features', col('prediction').alias('age2')))
kafka_scored = (gbt_model.transform(kafka_scored)
                .select('uid', 'age2', col('prediction').alias('gender2')))

# Inverse lookups: integer class code -> original string label. Predictions
# arrive as doubles; 1.0 == 1 with equal hashes in Python, so the
# integer-keyed dicts resolve them.
dic3 = {v: k for k, v in dic1.items()}
dic4 = {v: k for k, v in dic2.items()}


# The decoders get their own names so they do not shadow the `gender_find` /
# `age_find` encoders defined earlier in the notebook.
@f.pandas_udf(StringType())
def gender_label(object):
    """Vectorized UDF: map predicted gender codes back to 'F' / 'M'."""
    return object.apply(lambda x: dic3[x])


@f.pandas_udf(StringType())
def age_label(object):
    """Vectorized UDF: map predicted age codes back to age-bucket strings."""
    return object.apply(lambda x: dic4[x])


# Resulting column order is uid, gender, age.
kafka_out = (kafka_scored
             .withColumn('gender', gender_label(col('gender2')))
             .withColumn('age', age_label(col('age2')))
             .drop('age2', 'gender2'))

kafka_out = kafka_out.select(f.to_json(f.struct(*kafka_out.columns)).alias('value'))

write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "elena.pavlyuk"
}
# NOTE(review): if "streaming/chk/chk_kafka" already holds a checkpoint written
# by a differently shaped query (e.g. one containing a stream-stream join),
# clear it or use a fresh location before starting this query.
kafka_query = (kafka_out.writeStream.format("kafka").options(**write_kafka_params)
               .option("checkpointLocation", "streaming/chk/chk_kafka")
               .outputMode("append").start())
kafka_query

<pyspark.sql.streaming.StreamingQuery at 0x7f1abcfbd9e8>

In [None]:
#elena.pavlyuk@spark-master-4:-$ /usr/hdp/current/kafka-broker/bin/kafka-topics.sh --zookeeper spark-master-1 --delete --topic input_elena.pavlyuk

In [41]:
def kill_all():
    """Stop every active streaming query of the current SparkSession.

    For each query, the description of its first source is read from the most
    recent progress report and included in the log line. When that is not
    available, a notice is printed and an empty description is used. The
    query is stopped either way.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        description = ""
        try:
            description = s.lastProgress["sources"][0]["description"]
        except Exception:
            # Best effort only: the progress report may be missing or lack
            # source information. Catching Exception (not a bare except)
            # still lets KeyboardInterrupt / SystemExit propagate.
            print(f"{s} data not available")
        s.stop()
        print(f"Stopped {description} at {s}")

In [42]:
# Stop all streaming queries that are still active in this session.
kill_all()