In [1]:
#spark.stop()
#sc.stop()


In [2]:
import os
import sys

# Environment consumed by PySpark; set ahead of the pyspark imports below.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Put the PySpark sources and the bundled py4j archive at the front of sys.path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "PGG-try4")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Run PySpark's interactive shell bootstrap in this namespace. The script is
# read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer,StringIndexer,OneHotEncoder
import pyspark.sql.functions as f_
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType, StringType, ArrayType,TimestampType
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
import json
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.ml.classification import RandomForestClassificationModel


from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Widen the notebook container to the full browser width.
# `display` is imported from IPython.display: importing it from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
from urllib.parse import unquote

In [6]:

@f_.udf(StringType())
def url_cyr(url):
    """Percent-decode a URL string.

    Args:
        url: URL text, or None when the source column is null.

    Returns:
        The URL with %XX escapes decoded, or None for a null input.
    """
    # unquote() raises TypeError on None; a null URL must yield a null result
    # rather than failing the whole Spark job.
    if url is None:
        return None
    return unquote(url)



In [7]:
# Load the tab-separated dataset; the header row supplies the column names.
data = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)
json_schema = "map<string, array<struct<url:string,timestamp:string>>>"

# Reusable column expressions for the per-visit pipeline below.
url_col = f_.col('url')
url_before_query = f_.split(url_col, '\\?').getItem(0)
url_after_query = f_.split(url_col, '\\?').getItem(1)
visit_struct = f_.col('col')
user_visits_window = Window.partitionBy("uid").orderBy(f_.col('timestamp'))

# Replace every character that is neither a letter nor whitespace with a space,
# collapse runs of spaces, then lower-case and trim.
letters_only = f_.regexp_replace('url2', r'[^\pL\p{Space}]', ' ')
url_words_expr = f_.trim(f_.lower(f_.regexp_replace(letters_only, '[ ]+', ' ')))

# One row per visit: parse the JSON map, explode its entries, then explode the
# array of visit structs held in each entry.
visits = (
    data
    .withColumn('json', f_.from_json(f_.col('user_json'), json_schema))
    .select('gender', 'age', 'uid', f_.explode('json'))
    .select('gender', 'age', 'uid', f_.explode('value'))
    .select(
        'gender', 'age', 'uid',
        visit_struct.url.alias('url'),
        visit_struct.timestamp.alias('timestamp'),
    )
    .select(
        'gender', 'age', 'uid',
        f_.col('timestamp').alias('ts_original'),
        # The raw timestamp is divided by 1000 before conversion to a timestamp.
        f_.to_timestamp(f_.from_unixtime(f_.col('timestamp') / 1000)).alias('timestamp'),
        url_cyr(url_col).alias('url'),
    )
)

tabular_data = (
    visits
    .select(
        '*',
        f_.concat(f_.col('gender'), f_.lit('_'), f_.col('age')).alias('label'),
        # Third '/'-separated piece of the URL, lower-cased.
        # NOTE(review): 'www.' is an unanchored regex ('.' matches any character),
        # so it can also match inside a domain. Kept unchanged because the Kafka
        # scoring cell applies the identical expression.
        f_.regexp_replace(f_.lower(f_.split(url_col, '/').getItem(2)), 'www.', '').alias('domain'),
        # Keep the part before '?' when it is longer than the part after it;
        # otherwise keep the whole URL.
        f_.when(
            f_.length(url_before_query) > f_.coalesce(f_.length(url_after_query), f_.lit(0)),
            url_before_query,
        ).otherwise(url_col).alias('url2'),
    )
    .select('*', url_words_expr.alias('url_words'))
    # Previous visit of the same user, ordered by timestamp.
    .withColumn("lag_ts", f_.lag('timestamp').over(user_visits_window))
    .withColumn("lag_domain", f_.lag('domain').over(user_visits_window))
    .select(
        '*',
        (f_.col('timestamp').cast('long') - f_.col('lag_ts').cast('long')).alias('timediff'),
        # 1 when the previous visit exists and was on a different domain.
        f_.when(
            f_.col('lag_domain').isNotNull() & (f_.col('lag_domain') != f_.col('domain')),
            f_.lit(1),
        ).otherwise(f_.lit(0)).alias('change_site_flg'),
    )
)
            

In [8]:
# Per-user aggregates: concatenated URL words and domains, timestamp and
# time-gap statistics, and visit / domain / site-change counts.
user_aggregates = [
    f_.concat_ws(' ', f_.collect_list(f_.col('url_words'))).alias('urls_w'),
    f_.concat_ws(' ', f_.collect_list(f_.col('domain'))).alias('domains'),
    f_.max('timestamp').cast('long').alias('max_ts'),
    f_.min('timestamp').cast('long').alias('min_ts'),
    f_.avg('timestamp').cast('long').alias('avg_ts'),
    f_.max('timediff').alias('max_tdiff'),
    f_.min('timediff').alias('min_tdiff'),
    f_.avg('timediff').alias('avg_tdiff'),
    f_.countDistinct('domain').alias('domain_cnt'),
    f_.sum('change_site_flg').alias('change_site_cnt'),
    f_.count('*').alias('visit_cnt'),
]

grouped_tabular_data = (
    tabular_data
    .groupBy('gender', 'age', 'label', 'uid')
    .agg(*user_aggregates)
)

In [9]:
%%time

# Extra stop words removed from the URL tokens.
stop_words_url = ['www', 'http', 'html', 'htm', 'utm', 'php']

# URL-word branch: tokenize -> drop stop words -> hashed TF (75 buckets) -> IDF.
tokenizer = Tokenizer(inputCol="urls_w", outputCol="words")
swr = StopWordsRemover(inputCol="words", outputCol="words_censored", stopWords=stop_words_url)
tf = HashingTF(numFeatures=75, inputCol="words_censored", outputCol="tf")
tfidf = IDF(inputCol="tf", outputCol="idf")

# Domain branch: tokenize -> hashed TF (100 buckets) -> IDF.
tokenizer2 = Tokenizer(inputCol="domains", outputCol="domains_w")
tf2 = HashingTF(numFeatures=100, inputCol="domains_w", outputCol="domain_tf")
tfidf2 = IDF(inputCol="domain_tf", outputCol="domain_idf")

pipeline = Pipeline(stages=[tokenizer, tokenizer2, swr, tf, tf2, tfidf, tfidf2])

# Fit the text pipeline on the per-user frame and apply it to the same frame.
calc = pipeline.fit(grouped_tabular_data)
tabular_tfidf_data = calc.transform(grouped_tabular_data)

# Encode the string label as a numeric index; the fitted indexer is kept so
# predictions can be mapped back to label strings later.
si = StringIndexer(inputCol="label", outputCol="indexedLabel")
si_label = si.fit(tabular_tfidf_data)
tabular_labled_tfidf_data = si_label.transform(tabular_tfidf_data)

CPU times: user 81.3 ms, sys: 21.7 ms, total: 103 ms
Wall time: 52.9 s


In [10]:
# Inspect one record in vertical layout without truncating column values.
tabular_labled_tfidf_data.show(1,truncate=False,vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gender          | -                                                                                                                                                                                                                                                                                                                                                
 age             | -                                                                                                                                                                                                                                                                          

In [11]:
# List every column present after the TF-IDF pipeline and label indexing.
print(tabular_labled_tfidf_data.columns)

['gender', 'age', 'label', 'uid', 'urls_w', 'domains', 'max_ts', 'min_ts', 'avg_ts', 'max_tdiff', 'min_tdiff', 'avg_tdiff', 'domain_cnt', 'change_site_cnt', 'visit_cnt', 'words', 'domains_w', 'words_censored', 'tf', 'domain_tf', 'idf', 'domain_idf', 'indexedLabel']


In [12]:
# Columns combined into the model's feature vector, in this exact order.
feature_list = [
    'max_ts',
    'min_ts',
    'avg_ts',
    'max_tdiff',
    'min_tdiff',
    'avg_tdiff',
    'visit_cnt',
    'idf',
    'domain_idf',
    'domain_cnt',
    'change_site_cnt',
]
assembler = VectorAssembler(outputCol="features", inputCols=feature_list)

In [13]:
# timediff is null for a user's first visit (lag has no previous row), so the
# time-gap aggregates can be null; replace those nulls with 0 before assembling.
tdiff_defaults = {'min_tdiff': 0, 'max_tdiff': 0, 'avg_tdiff': 0}
labeled_filled = tabular_labled_tfidf_data.fillna(tdiff_defaults)
train_data = (
    assembler
    .transform(labeled_filled)
    .select('uid', 'indexedLabel', 'features')
)

In [14]:
# Remove any previous copy of the training parquet from HDFS; the command only
# reports an error when the path does not exist yet.
!hdfs dfs -rm -r /user/igor.gorchakov/train_data_lab04.parquet

rm: `/user/igor.gorchakov/train_data_lab04.parquet': No such file or directory


In [15]:
# Materialise the assembled features to parquet and read them back. A single
# absolute path is used for both the write and the read; previously the write
# used a relative path, which only matched the read path when the HDFS working
# directory happened to be /user/igor.gorchakov.
TRAIN_DATA_PATH = '/user/igor.gorchakov/train_data_lab04.parquet'
train_data.write.parquet(TRAIN_DATA_PATH, mode='overwrite')
train_data = spark.read.parquet(TRAIN_DATA_PATH)

In [16]:
# Stratified ~70% sample per label index for training; every row whose uid was
# not sampled becomes test data.
# The fractions are derived from the fitted StringIndexer so that every label
# index is covered: sampleBy treats a stratum missing from the mapping as
# fraction 0, which would silently drop that class from training.
TRAIN_FRACTION = 0.7
label_fractions = {
    float(label_index): TRAIN_FRACTION
    for label_index in range(len(si_label.labels))
}
trainingData = train_data.sampleBy("indexedLabel", fractions=label_fractions, seed=713)
testData = train_data.join(trainingData, on=['uid'], how="leftanti")

In [17]:
# Peek at one training row to check the uid / indexedLabel / features layout.
trainingData.limit(1).show()

+--------------------+------------+--------------------+
|                 uid|indexedLabel|            features|
+--------------------+------------+--------------------+
|00c5207a-1bea-453...|         3.0|(184,[0,1,2,6,11,...|
+--------------------+------------+--------------------+



In [18]:
%%time
# Random forest on the assembled features: 100 trees, depth limited to 10.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="indexedLabel",
    maxDepth=10,
    numTrees=100,
)
model_rf = rf.fit(trainingData)

# Score both splits so train and test accuracy can be compared.
predictions_ = model_rf.transform(testData)
predictions_train_ = model_rf.transform(trainingData)



CPU times: user 27.6 ms, sys: 1.04 ms, total: 28.7 ms
Wall time: 1min 12s


In [19]:
%%time
# Accuracy of `prediction` against `indexedLabel`.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="indexedLabel",
)

CPU times: user 1.6 ms, sys: 275 µs, total: 1.87 ms
Wall time: 8.26 ms


In [20]:
%%time
# Accuracy on the held-out split and on the training split.
accuracy_, accuracy_train_ = [
    evaluator.evaluate(scored) for scored in (predictions_, predictions_train_)
]


CPU times: user 6.97 ms, sys: 513 µs, total: 7.48 ms
Wall time: 36.2 s


In [21]:
# Training accuracy first, then test accuracy.
print(accuracy_train_, accuracy_)

0.3495542991918421 0.21946859510847486


In [22]:
# Test accuracy on its own (the same value is also printed by the previous cell).
print(accuracy_)

0.21946859510847486


In [23]:
# List the HDFS working directory to confirm the training parquet is present.
!hdfs dfs -ls

Found 4 items
drwxr-xr-x   - igor.gorchakov igor.gorchakov          0 2022-11-09 11:55 .sparkStaging
drwxr-xr-x   - igor.gorchakov igor.gorchakov          0 2022-11-07 21:02 lab05.csv
drwxr-xr-x   - igor.gorchakov igor.gorchakov          0 2022-10-11 18:08 test_save.txt
drwxr-xr-x   - igor.gorchakov igor.gorchakov          0 2022-11-09 11:57 train_data_lab04.parquet


In [24]:
# Persist the fitted forest. write().overwrite() is used instead of save()
# because save() raises when the target path already exists, which makes the
# notebook fail on any re-run.
model_rf.write().overwrite().save('rf_1g.sav')

In [25]:
# Load the saved model back from storage to check that it can be restored.
model_load = RandomForestClassificationModel.load('rf_1g.sav')

### Kafka

In [26]:
from datetime import datetime

# Wall-clock marker printed before the Kafka section runs.
now = datetime.now()
current_time = "{:%H:%M:%S}".format(now)
print("Current Time is :", current_time)

Current Time is : 11:59:54


In [27]:
# Batch read of the whole input topic, starting from the earliest offsets.
# `checkpointLocation` was dropped: it is a streaming-sink option and has no
# effect on a batch `spark.read`.
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_igor.gorchakov",
    "startingOffsets": "earliest",
}
kafka_sdf = spark.read.format("kafka").options(**read_kafka_params).load()

In [28]:
# Sanity check: number of records read from the input topic.
kafka_sdf.count()

10000

In [29]:
json_schema = "map<string, array<struct<url:string,timestamp:string>>>"

# Kafka message payload as text, plus the two pieces extracted from it. The
# visits array is re-wrapped as {"visits": [...]} so it matches json_schema.
kafka_value = f_.col("value").cast("string")
kafka_uid = f_.get_json_object(kafka_value, "$.uid").alias("uid")
kafka_user_json = f_.concat(
    f_.lit('''{"visits": '''),
    f_.get_json_object(kafka_value, "$.visits"),
    f_.lit('''}'''),
).alias("user_json")

# Reusable column expressions for the per-visit pipeline below.
kafka_url_col = f_.col('url')
kafka_url_before_query = f_.split(kafka_url_col, '\\?').getItem(0)
kafka_url_after_query = f_.split(kafka_url_col, '\\?').getItem(1)
kafka_visit_struct = f_.col('col')
kafka_visits_window = Window.partitionBy("uid").orderBy(f_.col('timestamp'))

# Replace every character that is neither a letter nor whitespace with a space,
# collapse runs of spaces, then lower-case and trim.
kafka_letters_only = f_.regexp_replace('url2', r'[^\pL\p{Space}]', ' ')
kafka_url_words = f_.trim(f_.lower(f_.regexp_replace(kafka_letters_only, '[ ]+', ' ')))

# One row per visit: parse the JSON map, explode its entries, then explode the
# array of visit structs held in each entry.
kafka_visits = (
    kafka_sdf
    .select(kafka_uid, kafka_user_json)
    .withColumn('json', f_.from_json(f_.col('user_json'), json_schema))
    .select('uid', f_.explode('json'))
    .select('uid', f_.explode('value'))
    .select(
        'uid',
        kafka_visit_struct.url.alias('url'),
        kafka_visit_struct.timestamp.alias('timestamp'),
    )
    .select(
        'uid',
        f_.col('timestamp').alias('ts_original'),
        # The raw timestamp is divided by 1000 before conversion to a timestamp.
        f_.to_timestamp(f_.from_unixtime(f_.col('timestamp') / 1000)).alias('timestamp'),
        url_cyr(kafka_url_col).alias('url'),
    )
)

kafka_in_tabular_data = (
    kafka_visits
    .select(
        '*',
        # Third '/'-separated piece of the URL, lower-cased, with matches of the
        # regex 'www.' removed.
        f_.regexp_replace(f_.lower(f_.split(kafka_url_col, '/').getItem(2)), 'www.', '').alias('domain'),
        # Keep the part before '?' when it is longer than the part after it;
        # otherwise keep the whole URL.
        f_.when(
            f_.length(kafka_url_before_query)
            > f_.coalesce(f_.length(kafka_url_after_query), f_.lit(0)),
            kafka_url_before_query,
        ).otherwise(kafka_url_col).alias('url2'),
    )
    .select('*', kafka_url_words.alias('url_words'))
    # Previous visit of the same user, ordered by timestamp.
    .withColumn("lag_ts", f_.lag('timestamp').over(kafka_visits_window))
    .withColumn("lag_domain", f_.lag('domain').over(kafka_visits_window))
    .select(
        '*',
        (f_.col('timestamp').cast('long') - f_.col('lag_ts').cast('long')).alias('timediff'),
        # 1 when the previous visit exists and was on a different domain.
        f_.when(
            f_.col('lag_domain').isNotNull() & (f_.col('lag_domain') != f_.col('domain')),
            f_.lit(1),
        ).otherwise(f_.lit(0)).alias('change_site_flg'),
    )
)
   

In [30]:
# Per-user aggregates for the Kafka data: concatenated URL words and domains,
# timestamp and time-gap statistics, and visit / domain / site-change counts.
kafka_user_aggregates = [
    f_.concat_ws(' ', f_.collect_list(f_.col('url_words'))).alias('urls_w'),
    f_.concat_ws(' ', f_.collect_list(f_.col('domain'))).alias('domains'),
    f_.max('timestamp').cast('long').alias('max_ts'),
    f_.min('timestamp').cast('long').alias('min_ts'),
    f_.avg('timestamp').cast('long').alias('avg_ts'),
    f_.max('timediff').alias('max_tdiff'),
    f_.min('timediff').alias('min_tdiff'),
    f_.avg('timediff').alias('avg_tdiff'),
    f_.countDistinct('domain').alias('domain_cnt'),
    f_.sum('change_site_flg').alias('change_site_cnt'),
    f_.count('*').alias('visit_cnt'),
]

kafka_in_grouped_tabular_data = (
    kafka_in_tabular_data
    .groupBy('uid')
    .agg(*kafka_user_aggregates)
)

In [31]:
# Sanity check: number of distinct users after grouping by uid.
kafka_in_grouped_tabular_data.count()

5000

In [32]:
%%time
# Apply the fitted TF-IDF pipeline to the Kafka users.
kafka_tabular_tfidf_data = calc.transform(kafka_in_grouped_tabular_data)

# Fill null time-gap aggregates with 0 exactly as was done for the training
# data. Without this, a user with a single visit has null *_tdiff values and
# VectorAssembler raises on them when the job executes.
kafka_filled = kafka_tabular_tfidf_data.fillna({'min_tdiff': 0, 'max_tdiff': 0, 'avg_tdiff': 0})
test_data = assembler.transform(kafka_filled).select('uid', 'features')

# Score with the model restored from storage.
model_load = RandomForestClassificationModel.load('rf_1g.sav')
test_predictions = model_load.transform(test_data)

CPU times: user 25.7 ms, sys: 23.3 ms, total: 49 ms
Wall time: 15.2 s


In [33]:
# Lookup table from predicted label index back to (gender, age). Position i in
# si_label.labels is the label string that was indexed as i.
decode_rows = []
for label_index, label_text in enumerate(si_label.labels):
    label_parts = label_text.split('_')
    decode_rows.append([float(label_index), label_parts[0], label_parts[1]])

label_decode = spark.createDataFrame(
    decode_rows,
    schema='prediction:float, gender:string,age:string',
)

# Attach gender/age to each prediction; the left join keeps every scored uid.
result = (
    test_predictions
    .join(label_decode, ['prediction'], 'left')
    .select('uid', 'gender', 'age')
)


In [34]:
def write_kafka(topic, data):
    """Shape a DataFrame into the layout expected by the Kafka sink.

    Despite its name this function does not write anything: it only builds and
    returns the frame, and the caller performs the actual write.

    Args:
        topic: Value placed in the constant `topic` column.
        data: DataFrame whose columns are serialised into one JSON string per row.

    Returns:
        A DataFrame with two columns: `value` (JSON of all input columns) and
        `topic`.
    """
    # The unused local `kafka_params` dict was removed; connection settings are
    # supplied by the caller when writing.
    kafka_doc = f_.to_json(f_.struct(f_.col("*")))
    raw = data \
        .select(kafka_doc.alias("value")) \
        .withColumn("topic", f_.lit(topic))
    return raw

In [35]:
%%time
# Batch write of the predictions to the output topic.
# `checkpointLocation` was dropped: it was passed twice (in the dict and again
# via .option) and is a streaming option with no effect on a batch write.
write_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "topic": "igor.gorchakov",
}
write_kafka('igor.gorchakov', result).write.format("kafka").options(**write_kafka_params).save()

CPU times: user 18 ms, sys: 15.2 ms, total: 33.2 ms
Wall time: 58.7 s


In [36]:
from datetime import datetime

# Wall-clock marker printed after the Kafka write has finished.
now = datetime.now()
current_time = "{:%H:%M:%S}".format(now)
print("Current Time is :", current_time)

Current Time is : 12:01:25


In [37]:
#spark.stop()
#sc.stop()