In [2]:
import os
import sys
import json
# Tell PySpark which Python interpreter and Spark installation to use, and
# pass the executor count through the submit arguments.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 10 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Make the Spark-bundled Python packages importable. The py4j archive
# (pinned to 0.10.7) ends up first on sys.path, followed by Spark's python dir.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is set on the conf object and again on the builder,
# both times to the same value.
conf = SparkConf().set("spark.app.name", "baryshev konstantin")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("baryshev konstantin")
    .getOrCreate()
)

### Чтение данных

In [39]:
#!hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head -n 2

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
cat: Unable to write to output stream.
cat: `-cat': No such file or directory


In [None]:
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import udf
import pyspark.sql.functions as f
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [4]:
from pyspark.sql.types import (
    StringType,
    StructField,
    StructType,
    ArrayType,
    LongType,
    IntegerType)

In [6]:
from pyspark.sql.functions import from_json
import pyspark.sql.functions as f

In [7]:
from urllib.parse import urlparse
from pyspark.sql.functions import udf
from pyspark.sql.functions import pandas_udf

In [8]:
# Every column of the tab-separated dataset is read as a plain string;
# user_json carries the raw JSON document for each user.
schema = StructType(fields=[
    StructField(column_name, StringType())
    for column_name in ("gender", "age", "uid", "user_json")
])
train = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("sep", "\t")
    .csv('/labs/slaba04/gender_age_dataset.txt')
)
#train.show(2)

In [9]:
# Pull the "visits" element out of the raw JSON; it is kept as a JSON string.
visits_json = f.get_json_object(f.col("user_json"), "$.visits")
train = train.withColumn('visits', visits_json)
train.show(5)

+------+-----+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|              visits|
+------+-----+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[{"url":"http://z...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[{"url":"http://s...|
|     F|25-34|d50237ea-747e-48a...|{"visits": [{"url...|[{"url":"http://r...|
|     F|25-34|d502f29f-d57a-46b...|{"visits": [{"url...|[{"url":"http://t...|
|     M| >=55|d503c3b2-a0c2-4f4...|{"visits": [{"url...|[{"url":"https://...|
+------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Print column names and types; 'visits' is still a string column at this point.
train.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)
 |-- visits: string (nullable = true)



In [11]:
from urllib.parse import urlparse
def get_urls(x):
    """Return the hosts of all visited URLs as one space-separated string.

    Args:
        x: JSON text of the visits array (a list of objects, each with a
            "url" key), or None/empty when the record carries no visits.

    Returns:
        The network-location part of every visit URL, in input order, joined
        by single spaces. An empty string when ``x`` is None or empty.

    Raises:
        ValueError: if ``x`` is not valid JSON.
        KeyError: if a visit object has no "url" key.
    """
    if not x:
        return ''
    # json.loads rather than eval: the text comes from external data, and
    # eval would both execute arbitrary expressions and fail on the JSON
    # literals true/false/null.
    visits = json.loads(x)
    return ' '.join(urlparse(visit['url']).netloc for visit in visits)
# Spark UDF wrapper around get_urls; the string return type is spelled out
# explicitly (it is also what udf() uses when no type is given).
geturls = udf(get_urls, StringType())

def get_times(x):
    """Return the timestamps of all visits, in input order.

    Args:
        x: JSON text of the visits array (a list of objects, each with a
            "timestamp" key), or None/empty when the record carries no visits.

    Returns:
        A list with the "timestamp" value of every visit. An empty list when
        ``x`` is None or empty.

    Raises:
        ValueError: if ``x`` is not valid JSON.
        KeyError: if a visit object has no "timestamp" key.
    """
    if not x:
        return []
    # json.loads rather than eval: the text comes from external data, and
    # eval would both execute arbitrary expressions and fail on the JSON
    # literals true/false/null.
    return [visit["timestamp"] for visit in json.loads(x)]
# get_times returns a list of integer timestamps, so declare the matching
# array type; with no returnType the UDF would produce a string column instead.
gettimes = udf(get_times, returnType=ArrayType(LongType()))

# Derive the host string and the visit timestamps from 'visits', then keep
# only the labels, the user id and the two derived columns.
train = (
    train
    .withColumn('urls', geturls('visits'))
    .withColumn('dates', gettimes('visits'))
    .select('gender', 'age', 'uid', 'urls', 'dates')
)

In [14]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

In [15]:
# Stage 1: tokenize the 'urls' string into the 'words' column.
tokenizer = Tokenizer(inputCol="urls", outputCol="words")

# Stage 2: hash the tokens into a 10000-slot term-frequency vector.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="url_vector",
    numFeatures=10000,
    binary=False,
)
train2 = hashingTF.transform(tokenizer.transform(train))

# Stage 3: `idf` holds the *fitted* IDF model (fit on the training vectors),
# which rescales 'url_vector' into the final 'features' column.
idf = IDF(inputCol='url_vector', outputCol='features').fit(train2)
train_vect = idf.transform(train2)

#train_vect.show(2, vertical = True)

In [35]:
from pyspark.sql.functions import countDistinct
# Count the distinct users in each age group.
distinct_users = countDistinct("uid")
grouped = train_vect.groupBy("age").agg(distinct_users)
grouped.show()

+-----+-------------------+
|  age|count(DISTINCT uid)|
+-----+-------------------+
| >=55|               1679|
|45-54|               4744|
|35-44|               9360|
|25-34|              15457|
|18-24|               4898|
+-----+-------------------+



In [34]:
# Clean the data: drop rows whose age equals '-'.
# NOTE(review): gender is not filtered here; encode_gender raises KeyError for
# any value outside gen_dict -- confirm no such rows remain.
has_known_age = f.col("age") != '-'
train_vect = train_vect.filter(has_known_age)

In [36]:
# Label -> integer class lookups used to build the training targets.
gen_dict = dict(F=0, M=1)
age_dict = {
    bucket: code
    for code, bucket in enumerate(('18-24', '25-34', '35-44', '45-54', '>=55'))
}


def encode_gender(x):
    """Return the integer class for gender label ``x``; KeyError if unknown."""
    return gen_dict[x]


def encode_age(x):
    """Return the integer class for age bucket ``x``; KeyError if unknown."""
    return age_dict[x]


# Spark UDF wrappers producing integer columns.
encodegender = udf(encode_gender, IntegerType())
encodeage = udf(encode_age, IntegerType())

In [37]:
# Attach the integer-encoded targets next to the original label columns.
train_vect = (
    train_vect
    .withColumn('gender_enc', encodegender(f.col('gender')))
    .withColumn('age_enc', encodeage(f.col('age')))
)

In [38]:
# Train on the whole encoded frame; no split is made here, cv_train is just
# another name for train_vect.
cv_train = train_vect

In [39]:
# Gradient-boosted trees predicting the encoded gender label.
gbt_gender = GBTClassifier(
    labelCol="gender_enc",
    featuresCol="features",
    maxDepth=9,
    maxIter=20,
)
model_gen = gbt_gender.fit(cv_train)

In [40]:
# Random forest predicting the encoded age bucket.
# The seed is fixed so that repeated runs build the same forest; left unset,
# the default seed may differ from one Python session to the next.
rfc_age = RandomForestClassifier(
    numTrees=100,
    featuresCol="features",
    labelCol="age_enc",
    seed=42,
)
model_age = rfc_age.fit(cv_train)

In [41]:
# Kafka source settings: broker address, input topic, and start reading
# from the latest offsets.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-node-1.newprolab.com:6667',
    "subscribe": "input_konstantin.baryshev",
    "startingOffsets": "latest",
}

kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .load()
)
# Batch (non-streaming) variant of the same read:
#kafka_sdf = spark.read.format("kafka").options(**read_kafka_params).load()

In [42]:
# Cast the Kafka 'value' column to a string so it can be read as JSON text.
message_text = f.col("value").cast("string").alias("value")
deserialized = kafka_sdf.select(message_text)

# Extract the user id and the visits array (as JSON text) from each message.
parsed = deserialized.select(*[
    f.get_json_object(f.col("value"), "$." + field_name).alias(field_name)
    for field_name in ("uid", "visits")
])

In [43]:
# Same derived columns as for the training data, minus the labels.
test = (
    parsed
    .withColumn('urls', geturls('visits'))
    .withColumn('dates', gettimes('visits'))
    .select('uid', 'urls', 'dates')
)
# test.show(2)

In [44]:
# Run the stream through the same tokenizer, hashing stage and fitted IDF
# model that were used on the training data.
test2 = hashingTF.transform(tokenizer.transform(test))
test_vect = idf.transform(test2)
# test_vect.show(5)

In [45]:
# Gender model first.
res_gen = model_gen.transform(test_vect)

# Keep only uid and features, and carry the gender prediction under the name
# 'gender_num' before the age model is applied to the same frame.
gender_prediction = f.col("prediction").alias("gender_num")
res_gen_2 = res_gen.select("uid", "features", gender_prediction)

# Age model second.
res_age = model_age.transform(res_gen_2)

In [46]:
# Reverse lookups: integer class -> original label.
inv_age = dict(zip(age_dict.values(), age_dict.keys()))
inv_gen = dict(zip(gen_dict.values(), gen_dict.keys()))


def code_gender(x):
    """Return the gender label for numeric class ``x``; KeyError if unknown."""
    return inv_gen[x]


def code_age(x):
    """Return the age bucket for numeric class ``x``; KeyError if unknown."""
    return inv_age[x]


# Spark UDF wrappers producing string columns.
codegender = udf(code_gender, returnType=StringType())
codeage = udf(code_age, returnType=StringType())

In [47]:
# Turn both numeric predictions back into their labels; the helper column
# 'gender_num' is dropped once 'gender' has been derived from it.
age_prediction = f.col("prediction").alias("age")
batch_df = (
    res_age
    .select("uid", age_prediction, "gender_num")
    .withColumn("gender", codegender("gender_num"))
    .withColumn("age", codeage("age"))
    .drop("gender_num")
)
# batch_df.show(5)

In [60]:
# Kafka sink settings: broker address and output topic.
write_kafka_params = {
    "kafka.bootstrap.servers": 'spark-node-1.newprolab.com:6667',
    "topic": "konstantin.baryshev",
}

# Each row is serialised as a JSON object with uid, gender and age and sent
# as the message value.
payload = f.to_json(f.struct('uid', 'gender', 'age')).cast("string").alias("value")

to_kafka = (
    batch_df.select(payload)
    .writeStream
    .format("kafka")
    .options(**write_kafka_params)
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .outputMode("append")
    .start()
)

to_kafka

<pyspark.sql.streaming.StreamingQuery at 0x7fb57e3377f0>

In [61]:
# Stop every streaming query that is still running in this session.
for active_query in spark.streams.active:
    active_query.stop()

In [62]:
# Show the streaming queries that are still active; an empty list means all
# of them have been stopped.
spark.streams.active

[]

In [63]:
# Shut down the Spark session.
spark.stop()