In [1]:
import os
import sys

# Point the kernel at the cluster's Python/Spark installation and size the job.
# These must be in the environment before anything from pyspark is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Make Spark's bundled Python sources and py4j importable. Each entry is
# inserted at the front of sys.path, so the py4j archive ends up first.
for bundled in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, bundled))

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Create (or reuse) the session under the application name "lab04".
conf = SparkConf().set("spark.app.name", "lab04")
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [2]:
# Run PySpark's interactive-shell bootstrap script inside this kernel's namespace.
# The file is read inside a `with` block so the handle is closed deterministically
# (the original `open(...).read()` never closed it).
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    shell_source = shell_file.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer,StringIndexer,OneHotEncoder
import pyspark.sql.functions as f_
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType, StringType, ArrayType,TimestampType
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
import json
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Обучающая выборка

In [4]:
# List the contents of the lab's HDFS input directory.
!hadoop fs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


### Поле user_json имеет внутри json со следующей схемой: 
- {"visits": [{"url": "url1", "timestamp": "timestamp1"}, {"url": "url2", "timestamp": "timestamp2"}]}

In [5]:
# DDL schema used to parse the `user_json` column: a map from a string key
# (e.g. "visits") to an array of {url, timestamp} structs, both kept as strings.
json_schema = "map<string, array<struct<url:string,timestamp:string>>>"

In [6]:
# Preview the raw tab-separated file with `user_json` parsed and flattened:
# the first explode turns the map into key/value columns, the second turns
# the array of visits into one row per visit (column `col`).
(
    spark.read.csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)
    .select("*", f_.from_json(f_.col('user_json'), json_schema).alias("user_json_"))
    .select("*", f_.explode('user_json_'))
    .select("*", f_.explode('value'))
    .show(5)
)

+------+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|gender|  age|                 uid|           user_json|          user_json_|   key|               value|                 col|
+------+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[visits -> [[http...|visits|[[http://zebra-zo...|[http://zebra-zoy...|
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[visits -> [[http...|visits|[[http://zebra-zo...|[http://news.yand...|
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[visits -> [[http...|visits|[[http://zebra-zo...|[http://www.sotov...|
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[visits -> [[http...|visits|[[http://zebra-zo...|[http://news.yand...|
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[visits -> [[http...|visits|[[http://zebra-zo...|[http:

In [7]:
from urllib.parse import unquote


@f_.udf(StringType())
def url_cyr(url):
    """Percent-decode a URL so that escaped (e.g. Cyrillic) characters become readable text.

    Args:
        url: The raw URL string, or None when the source value is null.

    Returns:
        The decoded URL, or None for a null input. The original passed None
        straight into `unquote`, which raises TypeError and fails the Spark task.
    """
    if url is None:
        return None
    # unquote() already returns str for str input, so no extra str() wrapper is needed.
    return unquote(url)

In [8]:
# Build the per-visit training frame: one row per (user, visited URL) with the
# combined "<gender>_<age>" target label and the percent-decoded URL.
gender_age_dataset = (
    spark.read.csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)
    # Parse the JSON payload and derive the target label.
    .select(
        'gender', 'age', 'uid',
        f_.from_json(f_.col('user_json'), json_schema).alias("user_json"),
        f_.concat(f_.col('gender'), f_.lit('_'), f_.col('age')).alias('label')
    )
    # Map -> (key, value) rows.
    .select('age', 'uid', "label", f_.explode('user_json'))
    # Array of visits -> one row per visit (column `col`).
    .select('uid', "label", f_.explode('value'))
    # Pull the struct fields out into plain columns.
    .select(
        'uid', "label",
        f_.col('col').timestamp.alias('timestamp'),
        url_cyr(f_.col('col').url).alias('url')
    )
)

In [9]:
# Print a single untruncated record vertically to check the flattened columns.
gender_age_dataset.show(1, vertical = True, truncate = False)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------
 uid       | d50192e5-c44e-4ae8-ae7a-7cfe67c8b777                                                                                                                    
 label     | F_18-24                                                                                                                                                 
 timestamp | 1419688144068                                                                                                                                           
 url       | http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun 
only showing top 1 row



### Результирующий DataFrame:
- uid - ID пользователя
- timestamp - временная метка посещения
- domain - домен (сайт)
- url_split - адрес  url, разделенный пробелом на отдельные части
- timediff - периодичность посещения (разница между временем текущего посещения и предыдущего)
- change_site_flg - флаг-индикатор смены пользователем домена
- label - пол_возраст - целевая метка

In [10]:
# One window shared by both lag() columns: each user's visits in chronological
# order. The timestamp is a string column, so it is cast to long to make the
# ordering numeric rather than lexicographic.
user_visits_window = Window.partitionBy("uid").orderBy(f_.col('timestamp').cast('long'))

# Per-visit features: domain, tokenized URL text, time since the user's
# previous visit and a flag marking a change of domain.
clear_src_data = (
    gender_age_dataset
    .select(
        'uid', "label", "timestamp", "url",
        # Host part of the URL (third '/'-separated item), lower-cased, with only a
        # leading "www." stripped. The original pattern 'www.' was unanchored and its
        # dot matched any character, so it also removed "www?" from inside a host name.
        f_.regexp_replace(
            f_.lower(f_.split(f_.col('url'), '/').getItem(2)), r'^www\.', ''
        ).alias('domain'),
        # Split on '?' to separate the path from the query string.
        f_.split(f_.col('url'), '\\?').alias("url_")
    )
    .select(
        'uid', "label", "timestamp", "domain", "url", "url_",
        # Keep only the part before '?' when it is longer than the part after it
        # (a missing second part counts as length 0); otherwise keep the whole URL.
        f_.when(
            f_.length(f_.col("url_").getItem(0))
            > f_.coalesce(f_.length(f_.col("url_").getItem(1)), f_.lit(0)),
            f_.col('url_').getItem(0)
        ).otherwise(f_.col('url')).alias('url_clear')
    )
    .select(
        '*',
        # Replace everything that is not a letter or whitespace with a space,
        # collapse runs of spaces, lower-case and trim.
        f_.trim(
            f_.lower(
                f_.regexp_replace(
                    f_.regexp_replace('url_clear', r'[^\pL\p{Space}]', ' '), '[ ]+', ' '
                )
            )
        ).alias('url_split'),
        f_.lag('timestamp').over(user_visits_window).alias("prev_time"),
        f_.lag('domain').over(user_visits_window).alias("prev_domain")
    )
    .select(
        '*',
        # Null for the user's first visit because there is no previous row.
        (f_.col('timestamp').cast('long') - f_.col('prev_time').cast('long')).alias('timediff'),
        # 1 when the previous visit exists and was on a different domain, else 0.
        f_.when(
            f_.col('prev_domain').isNotNull() & (f_.col('prev_domain') != f_.col('domain')),
            f_.lit(1)
        ).otherwise(f_.lit(0)).alias('change_site_flg')
    )
    .select("uid", "timestamp", "domain", "url_split", "timediff", "change_site_flg", "label")
)

In [11]:
# Preview the first 5 rows of the per-visit feature frame.
clear_src_data.show(5)

+--------------------+-------------+------------+--------------------+--------+---------------+-----+
|                 uid|    timestamp|      domain|           url_split|timediff|change_site_flg|label|
+--------------------+-------------+------------+--------------------+--------+---------------+-----+
|0108d217-e476-493...|1419843601415|kvartblog.ru|http kvartblog ru...|    null|              0|  -_-|
|0108d217-e476-493...|1419843669325|kvartblog.ru|http kvartblog ru...|   67910|              0|  -_-|
|0108d217-e476-493...|1419843739446|kvartblog.ru|http kvartblog ru...|   70121|              0|  -_-|
|0192cc54-559c-4c8...|1426772398000| primorye.ru|    http primorye ru|    null|              0|  -_-|
|0192cc54-559c-4c8...|1426772398001|vestiprim.ru|http vestiprim ru...|       1|              1|  -_-|
+--------------------+-------------+------------+--------------------+--------+---------------+-----+
only showing top 5 rows



### Фичи

#### агрегаты

In [12]:
# `timestamp` is a string column. Cast it to long *before* aggregating so that
# max/min compare numerically; the original took max/min of the strings
# (lexicographic order) and only cast the winner afterwards.
visit_ts = f_.col('timestamp').cast('long')

# Collapse the per-visit rows into one row per (label, uid).
agg_ = clear_src_data.groupBy('label', 'uid').agg(
    # All URL tokens / domains of the user joined into a single space-separated string.
    f_.concat_ws(' ', f_.collect_list(f_.col('url_split'))).alias('urls_words'),
    f_.concat_ws(' ', f_.collect_list(f_.col('domain'))).alias('domains'),
    # Latest, earliest and mean visit timestamp.
    f_.max(visit_ts).alias('time_max'),
    f_.min(visit_ts).alias('time_min'),
    f_.avg(visit_ts).cast('long').alias('time_avg'),
    # Statistics of the gap between consecutive visits (null gaps are ignored by the aggregates).
    f_.max('timediff').alias('times_diff_max'),
    f_.min('timediff').alias('times_diff_min'),
    f_.avg('timediff').alias('times_diff_avg'),
    # Number of distinct domains, number of domain changes, total number of visits.
    f_.countDistinct('domain').alias('domain_cnt'),
    f_.sum('change_site_flg').alias('change_site_cnt'),
    f_.count('*').alias('visit_cnt')
)

In [13]:
# Preview the first 5 rows of the per-user aggregates.
agg_.show(5)

+-----+--------------------+--------------------+--------------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------+---------------+---------+
|label|                 uid|          urls_words|             domains|     time_max|     time_min|     time_avg|times_diff_max|times_diff_min|      times_diff_avg|domain_cnt|change_site_cnt|visit_cnt|
+-----+--------------------+--------------------+--------------------+-------------+-------------+-------------+--------------+--------------+--------------------+----------+---------------+---------+
|  -_-|0108d217-e476-493...|http kvartblog ru...|kvartblog.ru kvar...|1419843739446|1419843601415|1419843670062|         70121|         67910|             69015.5|         1|              0|        3|
|  -_-|0192cc54-559c-4c8...|http deita ru new...|deita.ru deita.ru...|1427206710000|1426772398000|1426883967767|     303620000|             1|1.0340761904761905E7|         7|             12|      

In [14]:
# Number of rows in the aggregate, i.e. distinct (label, uid) groups.
agg_.count()

41138

### PIPELINE
Фичи:
- time_min - раннее время посещения сайтов
- time_max - позднее время посещения сайтов
- time_avg - среднее время посещения сайтов
- times_diff_max - максимальная разница между посещениями сайта
- times_diff_min - минимальная разница между посещениями сайта
- times_diff_avg - средняя разница между посещениями сайта
- visit_cnt - количество посещения сайтов пользователем
- url_idf - TF-IDF по словам из url, посещённых пользователем
- domain_idf - TF-IDF по доменам, посещённым пользователем
- domain_cnt - количество уникальных доменов, посещённых пользователем
- change_site_cnt - количество смен домена между последовательными посещениями пользователя

In [15]:
%%time
# Фичи
stopwords = ['www','http','html','htm','utm','php']

tkn_url = Tokenizer(inputCol = "urls_words", outputCol = "u_words")
tkn_dom = Tokenizer(inputCol = "domains", outputCol = "d_words")
stopwords = StopWordsRemover(inputCol = "u_words", outputCol = "u_words_clear", stopWords = stopwords)
tf_url = HashingTF(numFeatures = 75, inputCol = "u_words_clear", outputCol = "url_tf")
tf_dom = HashingTF(numFeatures = 100, inputCol = "d_words", outputCol = "domain_tf")

tfidf_url = IDF(inputCol = "url_tf",    outputCol = "url_idf")
tfidf_dom = IDF(inputCol = "domain_tf", outputCol = "domain_idf")


pipeline = Pipeline(
    stages =[
                tkn_url,
                tkn_dom,
                stopwords,
                tf_url,
                tf_dom,
                tfidf_url,
                tfidf_dom
                ])

tfidf_pipeline = pipeline.fit(agg_).transform(agg_)
#label
feature_data = StringIndexer(inputCol = "label", outputCol = "indexLabel").fit(tfidf_pipeline)\
                                                                          .transform(tfidf_pipeline)

CPU times: user 94.4 ms, sys: 13.6 ms, total: 108 ms
Wall time: 59.4 s


In [16]:
# Print one untruncated record vertically to inspect every generated column.
feature_data.show(1,truncate = False, vertical = True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 label           | -_-                                                                                                                                                                                                                                                                                                                                              
 uid             | 0108d217-e476-493d-8c81-a9744f12451a                                                                                                                                                                                                                                       

In [17]:
# List the column names available for feature assembly.
print(feature_data.columns)

['label', 'uid', 'urls_words', 'domains', 'time_max', 'time_min', 'time_avg', 'times_diff_max', 'times_diff_min', 'times_diff_avg', 'domain_cnt', 'change_site_cnt', 'visit_cnt', 'u_words', 'd_words', 'u_words_clear', 'url_tf', 'domain_tf', 'url_idf', 'domain_idf', 'indexLabel']


In [18]:
# Columns that make up the model input, grouped by kind. The concatenation
# order fixes the layout of the assembled vector.
_timestamp_cols = ['time_min', 'time_max', 'time_avg']
_gap_cols = ['times_diff_max', 'times_diff_min', 'times_diff_avg']
_tfidf_cols = ['url_idf', 'domain_idf']

feature_list = (
    _timestamp_cols
    + _gap_cols
    + ['visit_cnt']
    + _tfidf_cols
    + ['domain_cnt', 'change_site_cnt']
)

# Concatenates all listed columns into a single vector column `features`.
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

In [19]:
# Gap statistics are null for users with a single visit; replace them with 0
# before assembling the feature vector.
_gap_defaults = {'times_diff_min': 0, 'times_diff_max': 0, 'times_diff_avg': 0}

# Keep only the user id, the numeric label and the assembled feature vector.
train = (
    assembler
    .transform(feature_data.fillna(_gap_defaults))
    .select('uid', 'indexLabel', 'features')
)

In [20]:
# Recursively delete any previous copy of the train_lab04 output, bypassing the trash.
!hdfs dfs -rm -r -skipTrash train_lab04

Deleted train_lab04


In [21]:
# Write the assembled training frame to Parquet (replacing any existing copy)
# and rebind `train` to the stored data so later cells read from that copy.
train.write.mode('overwrite').parquet('train_lab04')
train = spark.read.parquet('train_lab04')

### Обучение DecisionTreeClassifier и RandomForestClassifier

In [22]:
# Stratified split: sample a 0.7 fraction of every indexed label (0..10) with a
# fixed seed so the split is reproducible. The earlier unseeded
# `train.randomSplit([0.7, 0.3])` was dead code (both of its results were
# overwritten immediately) and has been removed.
label_fractions = {label_idx: 0.7 for label_idx in range(11)}
trainData = train.sampleBy("indexLabel", fractions=label_fractions, seed=115)

# Every user that was not sampled into trainData forms the hold-out set.
testData = train.join(trainData, on=['uid'], how="leftanti")

In [23]:
# Show the first row of the training sample.
trainData.show(1)

+--------------------+----------+--------------------+
|                 uid|indexLabel|            features|
+--------------------+----------+--------------------+
|003b4b27-61f1-44e...|       3.0|(184,[0,1,2,3,4,5...|
+--------------------+----------+--------------------+
only showing top 1 row



### DecisionTree

In [24]:
%%time
decisiontreecls = DecisionTreeClassifier(labelCol = "indexLabel", featuresCol = "features")
model = decisiontreecls.fit(trainData)
prediction_train = model.transform(trainData)
prediction_test = model.transform(testData)

CPU times: user 13.3 ms, sys: 7.58 ms, total: 20.9 ms
Wall time: 13.1 s


In [25]:
%%time
evaluator = MulticlassClassificationEvaluator(
                         labelCol = "indexLabel"
                        ,predictionCol = "prediction"
                        ,metricName = "accuracy")
accuracy_train = evaluator.evaluate(prediction_train)
accuracy_test = evaluator.evaluate(prediction_test)

CPU times: user 0 ns, sys: 6.64 ms, total: 6.64 ms
Wall time: 13.4 s


In [26]:
# Decision tree accuracy: training sample first, hold-out set second.
print(accuracy_train, accuracy_test)

0.22518235498436956 0.2193877551020408


### RandomForest

In [27]:
%%time
randomforestcls = RandomForestClassifier(labelCol = "indexLabel", featuresCol = "features"
                                        ,numTrees = 100,maxDepth = 10)
model_randomforest = randomforestcls.fit(trainData)
prediction_rf_train = model_randomforest.transform(trainData)
predictions_rf_test = model_randomforest.transform(testData)

CPU times: user 29.4 ms, sys: 5.22 ms, total: 34.6 ms
Wall time: 2min 21s


In [28]:
%%time
accuracy_rf_test = evaluator.evaluate(predictions_rf_test)
accuracy_rf_train = evaluator.evaluate(prediction_rf_train)

CPU times: user 0 ns, sys: 7.12 ms, total: 7.12 ms
Wall time: 26.8 s


In [29]:
# Random forest accuracy: training sample first, hold-out set second.
print(accuracy_rf_train, accuracy_rf_test)

0.34588398749565824 0.22376093294460642


In [30]:
# Recursively delete any previously saved model at lab04_model.sav, bypassing the trash.
# The command reports an error when the path does not exist yet.
!hadoop fs -rm -r -skipTrash lab04_model.sav

rm: `lab04_model.sav': No such file or directory


In [31]:
# Persist the fitted random forest. Going through the writer with overwrite()
# lets the cell be re-run when the path already exists; a plain save() raises
# in that case.
model_randomforest.write().overwrite().save('lab04_model.sav')

In [32]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()