In [1]:
# List the lab's input directory on HDFS.
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


In [130]:
import os
import sys
# Export the Python interpreter, Spark home and submit arguments that PySpark
# reads from the environment.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and the bundled py4j archive to the import
# path; py4j ends up first, exactly as two successive insert(0, ...) calls
# would leave it.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [131]:
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import CountVectorizer, IndexToString, StringIndexer
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [132]:
# Build the configuration (only the application name is set) and then create,
# or reuse, the Spark session with it.
conf = SparkConf().set("spark.app.name", "ZK Lab04 app")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [133]:
# All four columns are read as plain strings; user_json is parsed separately.
schema_test = StructType(fields=[
    StructField(column_name, StringType())
    for column_name in ('gender', 'age', 'uid', 'user_json')
])

# Tab-separated file with a header row, read with the explicit schema above.
tdf = (
    spark.read
    .format('csv')
    .schema(schema_test)
    .option("header", "true")
    .option("delimiter", "\t")
    .load("/labs/slaba04/gender_age_dataset.txt")
)

tdf

DataFrame[gender: string, age: string, uid: string, user_json: string]

In [134]:
from pyspark.sql.functions import *

# Schema of the user_json payload: {"visits": [{"url": ..., "timestamp": ...}]}.
visits_schema = StructType().add(
    "visits",
    ArrayType(
        StructType()
        .add("url", StringType())
        .add("timestamp", LongType())
    ),
)

# Parse the raw JSON string into a typed struct column called "visits".
tdf2 = tdf.withColumn("visits", from_json(col("user_json"), visits_schema))

tdf2.show(n=2)

+------+-----+--------------------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|              visits|
+------+-----+--------------------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[[[http://zebra-z...|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[[[http://sweetra...|
+------+-----+--------------------+--------------------+--------------------+
only showing top 2 rows



In [135]:
# Number of partitions of the parsed frame before any repartitioning.
tdf2.rdd.getNumPartitions()

5

In [136]:
# Redistribute the data over 6 partitions and confirm the new partition count.
# NOTE(review): this rebinds tdf2 to a repartitioned version of itself.
tdf2 = tdf2.repartition(6)
tdf2.rdd.getNumPartitions()

6

In [137]:
from pyspark.sql import functions as F
# Add "visited_pages": the array of URLs pulled out of the parsed visits struct.
train_data = tdf2.select('*', F.col('visits.visits.url').alias('visited_pages'))
train_data.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [138]:
from pyspark.sql.functions import concat, col, lit
# Build the "gender:age" class label and keep only fully labelled users.
# A row is kept only when BOTH gender and age differ from the '-' placeholder
# (presumably "unknown" — confirm against the dataset description). The earlier
# `|` condition let through rows where just one of the two was known, which
# would produce a partial label such as "M:-".
train_df = (
    train_data
    .filter((F.col('gender') != '-') & (F.col('age') != '-'))
    .withColumn('concat', concat("gender", lit(":"), "age"))
    .select('uid', 'visited_pages', 'concat')
)
train_df.show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [139]:
# Step 1: strip the scheme — keep what follows "://" in every visited URL.
train_df2 = train_df.withColumn(
    "new_column",
    F.expr('transform(visited_pages,x -> split(x,"://")[1])'),
)

# Step 2: keep only the host — everything before the first "/".
train_df3 = (
    train_df2
    .withColumn("hosts", F.expr('transform(new_column,x -> split(x,"/")[0])'))
    .select('uid', 'visited_pages', 'hosts', 'concat')
)

# Step 3: number of visits and number of distinct hosts per user.
train_df4 = (
    train_df3
    .withColumn("count", F.size(F.col("hosts")))
    .withColumn("count_distinct", F.size(F.array_distinct(F.col("hosts"))))
)

train_df4.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------
 uid            | 0d52ad4d-27dc-4211-95fe-0174abe1c5f5 
 visited_pages  | [http://www.dez-shchukino.ru/]       
 hosts          | [www.dez-shchukino.ru]               
 concat         | M:35-44                              
 count          | 1                                    
 count_distinct | 1                                    
only showing top 1 row



In [140]:
# fr_h = total visits divided by the number of distinct hosts.
train_df5 = train_df4.withColumn(
    'fr_h',
    train_df4['count'] / train_df4['count_distinct'],
)
train_df5.show(n=2, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 uid            | 5fa9da5a-0d75-4a29-aaa1-7475732f12ee                                                                                                                                                                                                                                                                                                                                                                          
 visited_pages  | [http://govoritmoskva.ru/interviews/499/, http://news.smi2.ru/newdata/news?ad=706303&bl=80357&st=16&ct=adpreview&out=1, http://www.womanhit.ru/stars

RandomForestClassifier

In [301]:
# The classifier only needs the label string and the list of visited hosts.
dataset_forRF = train_df5.select(['concat', 'hosts'])
dataset_forRF.show(n=2)

+-------+--------------------+
| concat|               hosts|
+-------+--------------------+
|F:25-34|[love.mail.ru, go...|
| M:>=55|   [worldoftanks.ru]|
+-------+--------------------+
only showing top 2 rows



In [305]:
# Turns the "gender:age" string into a numeric "label" column.
indexer = StringIndexer(outputCol="label", inputCol="concat")

# Preprocessing: count-vectorise the host lists into "features", then index the label.
pipeline_pre = Pipeline().setStages([
    CountVectorizer(outputCol="features", inputCol="hosts"),
    indexer,
])

# Fit once on the labelled data; the fitted pipeline is reused for scoring later.
dataset_rfFit = pipeline_pre.fit(dataset_forRF)
dataset_rf = dataset_rfFit.transform(dataset_forRF)
dataset_rf.show(n=1)

+-------+--------------------+--------------------+-----+
| concat|               hosts|            features|label|
+-------+--------------------+--------------------+-----+
|M:35-44|[www.dez-shchukin...|(115071,[96919],[...|  2.0|
+-------+--------------------+--------------------+-----+
only showing top 1 row



In [306]:
# Map every distinct label to the same sampling fraction (0.8).
fractions = (
    dataset_rf
    .select("label")
    .distinct()
    .withColumn("fraction", F.lit(0.8))
    .rdd
    .collectAsMap()
)
print(fractions)

# Stratified sample by label with a fixed seed; used as the training set.
sampled_df_rf = dataset_rf.sampleBy("label", fractions=fractions, seed=36138)
sampled_df_rf.show(n=5)

{8.0: 0.8, 0.0: 0.8, 7.0: 0.8, 1.0: 0.8, 4.0: 0.8, 3.0: 0.8, 2.0: 0.8, 6.0: 0.8, 5.0: 0.8, 9.0: 0.8}
+-------+--------------------+--------------------+-----+
| concat|               hosts|            features|label|
+-------+--------------------+--------------------+-----+
| M:>=55|   [worldoftanks.ru]|(115071,[204],[1.0])|  9.0|
| F:>=55|[plirt.ru, plirt....|(115071,[11,36,38...|  8.0|
|F:25-34|[www.1001eda.com,...|(115071,[341,570,...|  1.0|
|M:35-44|[bigcinema.tv, bi...|(115071,[42,68404...|  2.0|
|M:35-44|[loveplanet.ru, l...|(115071,[5,40],[2...|  2.0|
+-------+--------------------+--------------------+-----+
only showing top 5 rows



In [307]:
# Hold-out set: the rows of dataset_rf that are not in the training sample.
# NOTE(review): sampled_df_rf is not cached, so this relies on the sample being
# recomputed identically here — confirm, or cache the sample before splitting.
testData_rf = dataset_rf.exceptAll(sampled_df_rf)
testData_rf.count()

7228

In [308]:
# Random forest over the count-vectorised hosts, with a fixed seed.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=20,
    maxDepth=15,
    maxBins=32,
    seed=9001,
)

# Train on the stratified sample, then score the held-out rows.
rfModel = rf.fit(sampled_df_rf)
predictions_rf = rfModel.transform(testData_rf)

In [312]:
# Show five scored hold-out rows, untruncated, one field per line.
predictions_rf.show(5,False,True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [310]:
# Accuracy on the hold-out set = rows where prediction equals label / all rows.
correct_predictions2 = predictions_rf.filter("label == prediction").count()
all_predictions2 = predictions_rf.count()

for caption, value in (
    ("correct_predictions is {}", correct_predictions2),
    ("all_predictions is {}", all_predictions2),
):
    print(caption.format(value))

print("Accuracy is {}".format(correct_predictions2 / all_predictions2))

correct_predictions is 1761
all_predictions is 7228
Accuracy is 0.24363586054233535


## Streaming
Each Kafka micro-batch is scored by `foreach_batch_function`.

In [315]:
def foreach_batch_function(batch_df, batch_id):
    """Score one Kafka micro-batch and write the predictions back to Kafka.

    Parameters
    ----------
    batch_df : DataFrame
        Micro-batch with a string column ``value`` holding a JSON document with
        the fields ``uid`` and ``visits``. ``visits`` is parsed as a JSON-encoded
        array of ``{"url", "timestamp"}`` objects.
    batch_id : int
        Identifier passed in by ``foreachBatch``; not used.

    Notes
    -----
    Reads these notebook globals: ``dataset_rfFit`` (fitted preprocessing
    pipeline), ``rfModel`` (fitted classifier) and ``write_kafka_params``
    (options for the Kafka sink).
    NOTE(review): ``write_kafka_params`` is not defined in any visible cell of
    this notebook; it must be defined before the stream is started.
    """
    value_schema = StructType(fields=[
        StructField('uid', StringType()),
        StructField('visits', StringType()),
    ])
    visits_schema = ArrayType(StructType(fields=[
        StructField('url', StringType()),
        StructField('timestamp', StringType()),
    ]))

    # Two-level parse: the message envelope first, then the nested visits array.
    parsed = batch_df.withColumn("temp", from_json("value", value_schema))
    visits = parsed.select(
        F.col('temp.uid').alias('uid'),
        F.from_json('temp.visits', visits_schema).alias('visits'),
    )

    # Same host extraction as for the training data: drop everything up to
    # "://", then keep what precedes the first "/". The visit-count columns
    # built here previously were never used by the model and are omitted.
    hosts = (
        visits
        .withColumn('visited_pages', F.col('visits.url'))
        .withColumn("new_column", expr('transform(visited_pages,x -> split(x,"://")[1])'))
        .withColumn("hosts", expr('transform(new_column,x -> split(x,"/")[0])'))
        .select('uid', 'hosts')
    )

    kafka_preprocessed_dataset = dataset_rfFit.transform(hosts).select('uid', 'features')
    kafka_predictions = rfModel.transform(kafka_preprocessed_dataset)

    # Decode the predicted index with the labels learned by the fitted
    # StringIndexer (the last pipeline stage) rather than a hand-maintained
    # index -> gender/age table, which goes stale whenever the pipeline is refit
    # and the label ordering changes.
    label_decoder = IndexToString(
        inputCol="prediction",
        outputCol="predicted_label",
        labels=dataset_rfFit.stages[-1].labels,
    )
    # Labels were built as "gender:age", so splitting on ":" recovers both parts.
    label_parts = F.split(F.col("predicted_label"), ":")
    kafka_df_submit = (
        label_decoder.transform(kafka_predictions)
        .withColumn('gender', label_parts.getItem(0))
        .withColumn('age', label_parts.getItem(1))
        .select('uid', 'gender', 'age')
    )

    # One JSON document per user in the "value" column, as the Kafka sink expects.
    kafka_submit_js = kafka_df_submit.select(to_json(struct("uid","gender","age")).alias("value"))

    kafka_submit_js.write\
    .format('kafka')\
    .options(**write_kafka_params)\
    .mode('append')\
    .save()

In [316]:
def create_console_sink(df):
    """Return an unstarted stream writer that sends each micro-batch of ``df``
    to ``foreach_batch_function``, checkpointing under streaming/chk/chk_kafka.
    """
    batch_writer = df.writeStream.foreachBatch(foreach_batch_function)
    checkpointed_writer = batch_writer.option('checkpointLocation', 'streaming/chk/chk_kafka')
    return checkpointed_writer

# Source options: broker address, input topic, and start from the newest offsets.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_zarina.kamytbaeva",
    "startingOffsets": "latest",
}

# Streaming read from Kafka, keeping only the message payload cast to string.
kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**read_kafka_params)
    .option("failOnDataLoss", 'False')
    .load()
    .select(F.col("value").cast("string"))
)

In [319]:
# Build the writer for the Kafka stream and start the streaming query.
sink = create_console_sink(kafka_sdf)
sq = sink.start()

In [217]:
# Check that the Kafka frame is a streaming DataFrame.
kafka_sdf.isStreaming

True

In [320]:
# Check that the streaming query started above is still running.
sq.isActive

True

In [210]:
# List the contents of the streaming checkpoint directory on HDFS.
!hdfs dfs -ls streaming/chk/chk_kafka

Found 4 items
drwxr-xr-x   - zarina.kamytbaeva zarina.kamytbaeva          0 2021-03-25 11:01 streaming/chk/chk_kafka/commits
-rw-r--r--   3 zarina.kamytbaeva zarina.kamytbaeva         45 2021-03-24 17:52 streaming/chk/chk_kafka/metadata
drwxr-xr-x   - zarina.kamytbaeva zarina.kamytbaeva          0 2021-03-25 11:01 streaming/chk/chk_kafka/offsets
drwxr-xr-x   - zarina.kamytbaeva zarina.kamytbaeva          0 2021-03-24 17:52 streaming/chk/chk_kafka/sources


In [211]:
def kill_all():
    """Stop every active streaming query of the current Spark session.

    Prints one "Stopped ..." line per query. The line shows the description of
    the query's first source when a progress report is available, and falls
    back to the query name or id otherwise.
    """
    streams = SparkSession.builder.getOrCreate().streams.active
    for s in streams:
        # lastProgress is None until the query has reported progress at least
        # once; indexing it blindly would raise before the query is stopped.
        progress = s.lastProgress
        sources = progress.get("sources") if progress else None
        desc = sources[0]["description"] if sources else (s.name or s.id)
        s.stop()
        print("Stopped {s}".format(s=desc))
# 


In [297]:
# Stop all active streaming queries.
kill_all()

In [129]:
# Shut down the Spark session.
spark.stop()