In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("dmitriy.sokolov lab04").getOrCreate()

In [50]:
spark.stop()

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pyspark.ml.regression import GBTRegressor, RandomForestRegressor
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import *

spark.sparkContext.setCheckpointDir('checkpoint/')

In [61]:
# import pickle

In [3]:
import pyspark.sql.functions as f


In [4]:
! hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [5]:
spark.read \
     .format("csv") \
     .option("header", True) \
     .option("sep",  "\t") \
     .load("/labs/slaba04/gender_age_dataset.txt").show(2, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", sep="\t",header=True,  inferSchema="true").cache()
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [7]:
df.groupBy("gender").count().show()

+------+-----+
|gender|count|
+------+-----+
|     F|17440|
|     M|18698|
|     -| 5000|
+------+-----+



In [8]:
df.select("gender").distinct().collect()

[Row(gender='F'), Row(gender='M'), Row(gender='-')]

In [9]:
df.filter(df.gender.isNull()).show()

+------+---+---+---------+
|gender|age|uid|user_json|
+------+---+---+---------+
+------+---+---+---------+



In [10]:
df.filter("gender != 'M' AND gender != 'F'").show()

+------+---+--------------------+--------------------+
|gender|age|                 uid|           user_json|
+------+---+--------------------+--------------------+
|     -|  -|bd7a30e1-a25d-4cb...|{"visits": [{"url...|
|     -|  -|bd7a6f52-45db-49b...|{"visits": [{"url...|
|     -|  -|bd7a7fd9-ab06-42f...|{"visits": [{"url...|
|     -|  -|bd7c5d7a-0def-41d...|{"visits": [{"url...|
|     -|  -|bd7e54a2-0215-45c...|{"visits": [{"url...|
|     -|  -|bd7e9797-4cdb-46e...|{"visits": [{"url...|
|     -|  -|bd7e9ec7-fb67-45e...|{"visits": [{"url...|
|     -|  -|bd8056df-cc25-4b6...|{"visits": [{"url...|
|     -|  -|bd818690-73d2-445...|{"visits": [{"url...|
|     -|  -|bd81e006-f059-4cd...|{"visits": [{"url...|
|     -|  -|bd81e64a-bfa3-414...|{"visits": [{"url...|
|     -|  -|bd82fee4-afb3-408...|{"visits": [{"url...|
|     -|  -|bd83400b-abe2-42f...|{"visits": [{"url...|
|     -|  -|bd843c8c-dbba-4ec...|{"visits": [{"url...|
|     -|  -|bd86d250-a6ee-41f...|{"visits": [{"url...|
|     -|  

In [11]:
df.summary().show()

+-------+------+-----+--------------------+--------------------+
|summary|gender|  age|                 uid|           user_json|
+-------+------+-----+--------------------+--------------------+
|  count| 41138|41138|               41138|               41138|
|   mean|  null| null|                null|                null|
| stddev|  null| null|                null|                null|
|    min|     -|    -|0000e7ca-32e6-4be...|{"visits": [{"url...|
|    25%|  null| null|                null|                null|
|    50%|  null| null|                null|                null|
|    75%|  null| null|                null|                null|
|    max|     M| >=55|ffc8d1e1-c2ef-47a...|{"visits": [{"url...|
+-------+------+-----+--------------------+--------------------+



In [12]:
df = df.filter("gender == 'M' OR gender == 'F'")

In [13]:
from urllib.parse import urlparse

In [14]:
schema = StructType([
    StructField('visits', ArrayType(StructType(
    [
        StructField('url', StringType()),
        StructField('timestamp', LongType())
    ])
))
])

In [15]:
mod_df = df.withColumn('pars_json', f.from_json(df.user_json, schema))

mod_df = mod_df.withColumn('visits', f.col('pars_json').visits)
mod_df = mod_df.withColumn('url', f.col('visits').url)
mod_df.select('uid','gender','age', 'url').show(1, truncate=True)


+--------------------+------+-----+--------------------+
|                 uid|gender|  age|                 url|
+--------------------+------+-----+--------------------+
|d50192e5-c44e-4ae...|     F|18-24|[http://zebra-zoy...|
+--------------------+------+-----+--------------------+
only showing top 1 row



In [16]:
mod_df.show(5)

+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|           pars_json|              visits|                 url|
+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[[[http://zebra-z...|[[http://zebra-zo...|[http://zebra-zoy...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[[[http://sweetra...|[[http://sweetrad...|[http://sweetradi...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[[[http://ru.orif...|[[http://ru.orifl...|[http://ru.orifla...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[[[http://transla...|[[http://translat...|[http://translate...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[[[https://mail.r...|[[https://mail.ra...|[https://mail.ram...|
+------+-----+--------------------+-----

In [17]:
def get_domain(list_url):
    res = ''
    for l in list_url:  
        res = res+' '+urlparse(l).netloc
    res = res.replace('.', '')
    res = res.replace('www', '')
    res = res.replace('-', '')
    res = res.replace('_', '')
    return res

get_domain_udf = f.udf(get_domain, StringType())


In [18]:
def get_gender(f):
    if f == 'F':
        return 0
    if f == 'M':
        return 1
    
get_gender_udf = f.udf(get_gender, IntegerType())    

In [19]:
def get_age(f):
    
    if f == '<18':
        return 0
    if f == '18-24':
        return 1
    if f == '25-34':
        return 2
    if f == '35-44':
        return 3
    if f == '45-54':
        return 4
    if f == '>=55':
        return 5
    
get_age_udf = f.udf(get_age, IntegerType())  

In [20]:
def out_gender(f):
    if f == 0:
        return 'F'
    if f == 1:
        return 'M'
    
out_gender_udf = f.udf(out_gender, StringType())    

In [21]:
def out_age(f):
    
    if f == 0:
        return '<18'
    if f == 1 :
        return '18-24'
    if f == 2:
        return '25-34'
    if f == 3:
        return '35-44'
    if f == 4:
        return '45-54'
    if f == 5:
        return  '>=55'
    
out_age_udf = f.udf(out_age, StringType())  

In [22]:
type(out_age_udf)

function

In [23]:
mod_df = mod_df.withColumn('new',get_domain_udf(mod_df.url))

In [24]:
mod_df = mod_df.withColumn('gender_int',get_gender_udf(mod_df.gender))

In [25]:
mod_df = mod_df.withColumn('age_int',get_age_udf(mod_df.age))

In [26]:
mod_df.show(2, vertical = False, truncate=False)

+------+-----+------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer

In [28]:
%%time
tokenizer = Tokenizer(inputCol="new", outputCol="words")
wordsData = tokenizer.transform(mod_df)

hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)
    
# idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)

df = featurizedData
df.cache()

# df.select("id", "lang", "filtered_words").show(3, truncate = True) 

CPU times: user 2.78 ms, sys: 4.53 ms, total: 7.31 ms
Wall time: 279 ms


In [29]:
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)
 |-- pars_json: struct (nullable = true)
 |    |-- visits: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- timestamp: long (nullable = true)
 |-- visits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- new: string (nullable = true)
 |-- gender_int: integer (nullable = true)
 |-- age_int: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [30]:
df.select('gender_int','features').show(5)

+----------+--------------------+
|gender_int|            features|
+----------+--------------------+
|         0|(10000,[3372,5100...|
|         1|(10000,[1487,1720...|
|         0|(10000,[108,2962,...|
|         0|(10000,[1265,3350...|
|         1|(10000,[857,1092,...|
+----------+--------------------+
only showing top 5 rows



In [31]:
df.filter(f.isnull("gender_int")).count()

0

In [32]:
df.groupby('gender_int').count().collect()

[Row(gender_int=1, count=18698), Row(gender_int=0, count=17440)]

In [33]:
%%time
gender_model = GBTClassifier(
    labelCol="gender_int", featuresCol="features", maxDepth=5
).fit(df.select('gender_int','features'))

CPU times: user 156 ms, sys: 35.1 ms, total: 191 ms
Wall time: 7min 8s


In [34]:
%%time
age_model = GBTRegressor(labelCol='age_int', maxDepth=5).fit(df.select('age_int','features'))

CPU times: user 138 ms, sys: 48.4 ms, total: 187 ms
Wall time: 5min 33s


In [62]:
with open("gender_model.pkl", "wb") as f:
    pickle.dump(gender_model, f)

# with open("gender_model.pkl", "rb") as f:
#     gender_model = pickle.load(f)

Py4JError: An error occurred while calling o459.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)



### READ KAFKA

In [35]:
from pyspark.sql.functions import *


In [36]:
for i in spark.streams.active:
    if "KafkaV2" in i.lastProgress["sources"][0]["description"]:
        i.stop()

In [38]:
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_dmitriy.sokolov",
    "startingOffsets": "latest"
}
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

static_from_kafka = kafka_sdf
# Deserialize binary value to string
deserialized = \
    static_from_kafka \
        .select(col("value").cast("string").alias("value")) \

parsed = \
    deserialized \
        .select(
            get_json_object(col("value"), "$.uid").alias("uid"),
            get_json_object(col("value"), "$.visits").alias("visits"),
        )

schema_test =ArrayType(StructType(
    [
        StructField('url', StringType()),
    ])
                      )

mod_test = parsed.withColumn('pars_json', f.from_json(parsed.visits, schema_test))
mod_test = mod_test.withColumn('url', f.col('pars_json').url)
mod_test = mod_test.select('uid','url')

mod_test = mod_test.withColumn('new',get_domain_udf(mod_test.url))

tokenizer = Tokenizer(inputCol="new", outputCol="words")
wordsData = tokenizer.transform(mod_test)

hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

mod_test = featurizedData

pred=gender_model.transform(mod_test.select('uid','features'))
pred = pred.withColumnRenamed("prediction", "gender")

pred = age_model.transform(pred.select('uid','gender','features'))
pred = pred.withColumn('age',round(pred.prediction))

pred = pred.select('uid','gender','age')

pred = pred.withColumn('gender_char',out_gender_udf(pred.gender))

pred = pred.withColumn('age_char',out_age_udf(pred.age))

pred = pred.select('uid','gender_char','age_char')
pred = pred.withColumnRenamed("gender_char", "gender")
pred = pred.withColumnRenamed("age_char", "age")

submission_json = pred.withColumn("jsonCol", to_json(struct([when(col(x)!="  ",pred[x]).otherwise(None).alias(x) for x in pred.columns])))
submission_json = submission_json.select("jsonCol")
submission_json = submission_json.withColumnRenamed("jsonCol", "value")

submission_json = submission_json.withColumn("topic", lit("dmitriy.sokolov"))


write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "dmitriy.sokolov"
}
submission_json.writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append").start()

<pyspark.sql.streaming.StreamingQuery at 0x7f0d16222550>

In [45]:
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7f0d160ccb70>]

In [48]:
from pprint import pprint

# for i in spark.streams.active:
#     if "KafkaV2" in i.lastProgress["sources"][0]["description"]:
#         i.stop()
        
for i in spark.streams.active:
    pprint(i.status)

{'isDataAvailable': False,
 'isTriggerActive': False,
 'message': 'Waiting for data to arrive'}


In [49]:
# Stop all running streaming queries
for s in spark.streams.active:
    s.stop()
print("All streams has been stopped!")

All streams has been stopped!
