In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark distribution, and
# pass the submit arguments (4 executors) before any pyspark module is imported.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 4 pyspark-shell',
})
spark_home = os.environ.get('SPARK_HOME', None)

# Prepend the bundled py4j archive and the pyspark sources so that the import
# below resolves against the distribution under SPARK_HOME.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start one.
spark = (
    SparkSession.builder
    .appName("nikita.mospan lab4")
    .getOrCreate()
)

In [2]:
# Column names present in the raw input file.
UID = "uid"
GENDER = "gender"
AGE = "age"

# Column names produced while preparing features and labels.
GENDER_AGE = "gender_age"
VISITS = "visits"
DOMAIN_COL = "domains"
LABEL = "label"

In [3]:
# Location of the raw gender/age dataset that is loaded with spark.read below.
inputPath = "/labs/slaba04/gender_age_dataset.txt"

In [None]:
# ! hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, expr, udf
from pyspark.sql.types import StructType, StringType, ArrayType, StructField, LongType

# Every raw column is read as a non-nullable string; the JSON payload in
# "user_data" is parsed later, after the file has been loaded.
inputSchema = StructType([
    StructField(columnName, StringType(), False)
    for columnName in (GENDER, AGE, UID, "user_data")
])

In [5]:
# Schema used to parse the JSON held in the "user_data" column; only the
# "visits" field is selected afterwards.
# NOTE(review): the purpose of the "dummy" field is not evident from this
# notebook - confirm before removing it.
userDataSchema = StructType([
    StructField("dummy", StringType()),
    StructField("visits", StringType()),
])

# Load the tab-separated file (failing fast on malformed rows), keep only rows
# where both gender and age are known (not '-'), build the combined
# gender_age class and extract the visits JSON string.
inputDf = (
    spark.read
    .format("csv")
    .schema(inputSchema)
    .option("header", "true")
    .option("sep", "\t")
    .option("mode", "failfast")
    .load(inputPath)
    .where((F.col(GENDER) != '-') & (F.col(AGE) != '-'))
    .withColumn(GENDER_AGE, F.concat(GENDER, AGE))
    .withColumn(VISITS, F.from_json("user_data", userDataSchema))
    .select(UID, GENDER_AGE, F.col("visits.visits").alias(VISITS))
)

In [6]:
from pyspark.ml.feature import StringIndexer
# Encode the gender_age class as a numeric label column.
stringIndexer = StringIndexer(inputCol=GENDER_AGE, outputCol=LABEL)
inputWithLabelDf = stringIndexer.fit(inputDf).transform(inputDf)

# Persist the label -> (age, gender) mapping as a single parquet file.
# Gender is the first character of gender_age; age is the remainder
# (up to 100 characters).
pathToLabelGenderAgeMapping = "lab4_label_to_age_gender"
(
    inputWithLabelDf
    .select(GENDER_AGE, LABEL)
    .distinct()
    .withColumn(GENDER, F.substring(F.col(GENDER_AGE), 1, 1))
    .withColumn(AGE, F.substring(F.col(GENDER_AGE), 2, 100))
    .select(AGE, GENDER, LABEL)
    .repartition(1)
    .write
    .format("parquet")
    .mode("overwrite")
    .save(pathToLabelGenderAgeMapping)
)

In [7]:
# Execute the companion notebook in this kernel; the lines below rely on it
# having defined the VisitsTransformer class.
%run 'VisitsTransformer.ipynb'
# Bind the class as an attribute of the __main__ module.
# NOTE(review): presumably required so that the persisted pipeline can resolve
# the class as __main__.VisitsTransformer when it is loaded - confirm.
m = __import__("__main__")
setattr(m, 'VisitsTransformer', VisitsTransformer)

In [8]:
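# Disabled sanity check: preview one row of the VisitsTransformer output
# (the transformer adds the `domains` column visible in the saved output below).
# visitsTransformer = VisitsTransformer()
# visitsTransformer.transform(inputWithLabelDf).show(1)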

+--------------------+----------+--------------------+-----+--------------------+
|                 uid|gender_age|              visits|label|             domains|
+--------------------+----------+--------------------+-----+--------------------+
|d50192e5-c44e-4ae...|    F18-24|[{"url":"http://z...|  4.0|[zebra-zoya.ru, n...|
+--------------------+----------+--------------------+-----+--------------------+
only showing top 1 row



In [9]:
# Hold out 20% of the labelled rows for the final evaluation; the seed is
# fixed so the split is requested with the same random seed on every run.
train, test = inputWithLabelDf.randomSplit(weights=[0.8, 0.2], seed=41)

In [10]:
from pyspark.ml.feature import CountVectorizer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Pipeline stages:
#   1. VisitsTransformer - custom stage from VisitsTransformer.ipynb; its output
#      is read from the DOMAIN_COL column by the next stage.
#   2. CountVectorizer   - bag-of-domains counts limited to a 1000-term vocabulary.
#   3. RandomForest      - multiclass classifier over the count vectors.
visitsTransformer = VisitsTransformer()
countVectorizer = CountVectorizer(vocabSize=1000, inputCol=DOMAIN_COL, outputCol="features")
# The label column is referenced through the LABEL constant, the same name the
# StringIndexer writes to. The seed is pinned explicitly so that repeated runs
# train the same forest; the train/test split is already seeded the same way.
rf = RandomForestClassifier(labelCol=LABEL, featuresCol="features", seed=41)
pipeline = Pipeline(stages=[visitsTransformer, countVectorizer, rf])

In [11]:
from pyspark.ml.tuning import ParamGridBuilder
# No addGrid() calls are made, so the grid is built empty: no hyper-parameters
# are varied during validation.
params = ParamGridBuilder().build()

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score predictions by plain accuracy against the "label" column.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label")

In [13]:
from pyspark.ml.tuning import TrainValidationSplit
# Fit the pipeline on 85% of the training data and score it on the remaining
# 15% with the accuracy evaluator. The seed is set explicitly so that the
# internal train/validation split is requested with the same seed on every run.
trainValidationSplit = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    trainRatio=0.85,
    seed=41,
)

fittedTrainValidationSplit = trainValidationSplit.fit(train)

In [None]:
# Score the fitted model on the held-out test split; the metric value is the
# cell's displayed result.
testPredictions = fittedTrainValidationSplit.transform(test)
evaluator.evaluate(testPredictions)

In [14]:
# Destination path for the persisted best model.
lab4ModelPath = "lab4_model"

In [15]:
# Persist the best model found by the train/validation split, replacing any
# model previously saved at the same path.
bestModel = fittedTrainValidationSplit.bestModel
bestModel.write().overwrite().save(lab4ModelPath)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()