### Евгений Шенк

## Лабораторная раота №4. Прогнозирование пола и возрастной категории — Spark Streaming
#### Model development

In [41]:
import json
import os
import sys
import re
from urllib.parse import urlparse
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


### Spark Session

In [42]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("ESShenk_spark_session")
         .getOrCreate())

### Data

In [43]:
!hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [44]:
gender_age_df = spark.read.csv("/labs/slaba04/gender_age_dataset.txt", header='true', sep='\t')

In [45]:
gender_age_df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- user_json: string (nullable = true)



In [46]:
gender_age_df.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [47]:
gender_to_label = {"M": 0,
                   "F": 1,}

label_to_gender = {0: "M",
                   1: "F",}

age_to_label = {"18-24": 0,
                "25-34": 1,
                "35-44": 2,
                "45-54": 3,
                ">=55": 4,}

label_to_age = {0: "18-24",
                1: "25-34",
                2: "35-44",
                3: "45-54",
                4: ">=55",}

In [48]:
@F.udf(IntegerType())
def get_label(col_gender, col_age):
#     return params_to_label[str(col_age) + str(col_gender)]
    return age_to_label[str(col_age)]

@F.udf(IntegerType())
def get_label_g(col_gender):
    return gender_to_label[str(col_gender)]

@F.udf(IntegerType())
def get_label_a(col_age):
#     return params_to_label[str(col_age) + str(col_gender)]
    return age_to_label[str(col_age)]

@F.udf(StringType())
def get_age(col_label):
    return label_to_params[col_label][0]

@F.udf(StringType())
def get_gender(col_label):
    return label_to_params[col_label][1]

In [49]:
@F.udf(ArrayType(StringType()))
def get_urls(col_1):
    url_col = json.loads(col_1)
    url_list = url_col["visits"]
    result = []
    for el in url_list:
        parsed = urlparse(el["url"])
#         result = [*result, *[parsed.netloc]]
        result = [*result, *([parsed.netloc] + re.split("-|/", parsed.path.strip("/")))]
#         result.append(el["url"].split("://")[1].split("/")[0])
        
    return result

In [50]:
df = gender_age_df.select("*") \
        .withColumn("urls", get_urls(F.col("user_json")))

In [51]:
df = df.select("*") \
        .withColumn('label_g', get_label_g(F.col("gender"))) \
        .withColumn('label_a', get_label_a(F.col("age")))

In [52]:
# df.show(n=2, truncate=150, vertical=True)

In [53]:
eng_stopwords = StopWordsRemover.loadDefaultStopWords("english") + ['ru']

In [54]:
swRemover = StopWordsRemover(inputCol="urls", outputCol="pure_words", stopWords=eng_stopwords)

In [55]:
# swRemover.write().overwrite().save(path='./swRemover')

In [56]:
pure_document = swRemover.transform(df)

In [57]:
hashingTF = HashingTF(numFeatures=35000, binary=True, inputCol="pure_words", outputCol="tf")

In [58]:
tf = hashingTF.transform(pure_document)

In [59]:
hashingTF.write().overwrite().save(path='./hashingTF')

In [20]:
# idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=500).fit(tf)

In [21]:
# tfidf = idf.transform(tf)

In [22]:
pure_df = tf.select("*") \
    .filter(F.col("age") != '-') \
    .filter(F.col("gender") != '-')

In [23]:
pure_df.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# idf.write().overwrite().save(path='./idf')

## Model

In [25]:
# (train_data, test_data) = tfidf.randomSplit([0.5, 0.5], seed = 2177)

In [26]:
assembler = VectorAssembler(inputCols=["tf"], outputCol='features')

In [27]:
train_data = (assembler.transform(pure_df).select("label_g", "label_a", "features"))
# test_data = (assembler.transform(test_data).select("label_g", "label_a", "features"))
train_data = train_data.repartition(16).cache()
# test_data = train_data.repartition(16).cache()
train_data.count()
# test_data.count()

36138

In [28]:
# assembler.write().overwrite().save(path='./assembler')

In [29]:
# train_data.show(n=1, truncate=False, vertical=True)

In [30]:
# train_data.printSchema()

In [31]:
# gbt = GBTClassifier(maxIter=5, maxDepth=2, seed=27, labelCol="label_g")
# rfc_g = RandomForestClassifier(numTrees=100, maxDepth=15, seed=27, labelCol="label_g")
# rfc_a = RandomForestClassifier(numTrees=50, maxDepth=5, seed=27, labelCol="label_a")
# lr_gender = LogisticRegression(maxIter=5, regParam=0.0, elasticNetParam=0.5, labelCol="label_g")
# lr_age = LogisticRegression(maxIter=5, regParam=0.0, elasticNetParam=0.5, labelCol="label_a")
nb_gender = NaiveBayes(smoothing=30.0, labelCol='label_g')
nb_age = NaiveBayes(smoothing=1.0, labelCol='label_a')
evaluator_gender = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='label_g')
evaluator_age = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='label_a')
paramGrid = ParamGridBuilder().build()

In [32]:
crossval_g = CrossValidator(estimator=nb_gender, estimatorParamMaps=paramGrid,
                            evaluator=evaluator_gender, numFolds=5, parallelism=1)

In [33]:
%%time
model_g = crossval_g.fit(train_data)

CPU times: user 100 ms, sys: 30.3 ms, total: 131 ms
Wall time: 16.4 s


In [34]:
accuracy = model_g.avgMetrics[0]
print("CV accuracy = %g" % accuracy)  0.694192

CV accuracy = 0.694192


In [35]:
model_g.write().overwrite().save("./model_g")
# cvModelRead = CrossValidatorModel.read().load("./model_g")

In [36]:
crossval_a = CrossValidator(estimator=nb_age, estimatorParamMaps=paramGrid,
                            evaluator=evaluator_age, numFolds=5, parallelism=1)

In [37]:
%%time
model_a = crossval_a.fit(train_data)

CPU times: user 112 ms, sys: 29.2 ms, total: 142 ms
Wall time: 13.7 s


In [38]:
accuracy = model_a.avgMetrics[0]
print("CV accuracy = %g" % accuracy)  # 0.3709

CV accuracy = 0.3709


In [39]:
model_a.write().overwrite().save("./model_a")
# cvModelRead = CrossValidatorModel.read().load("./model_a")

### Сохранить и выйти

In [40]:
spark.stop()