In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 64 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("nikita.mospan lab3") \
    .getOrCreate()

In [2]:
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, ByteType, TimestampType, DoubleType
from pyspark.sql import functions as F

itemsSchema = StructType([
    StructField("item_id", LongType(), False),
    StructField("channel_id", StringType(), True), 
    StructField("datetime_availability_start", TimestampType(), True),
    StructField("datetime_availability_stop", TimestampType(), True),
    StructField("datetime_show_start", TimestampType(), True),
    StructField("datetime_show_stop", TimestampType(), True),
    StructField("content_type", ByteType(), False),
    StructField("title", StringType(), True),
    # year contains nulls so its type has to be string
    StructField("year", StringType(), True),
    StructField("genres", StringType(), False),
    StructField("region_id", StringType(), True),
])

itemsDf = spark.read.format("csv")\
    .schema(itemsSchema) \
    .option("header", "true") \
    .option("sep", "\t") \
    .option("nullValue", "null") \
    .option("mode", "failfast") \
    .load("/labs/slaba03/laba03_items.csv")

In [3]:
itemsDf = itemsDf.selectExpr("item_id", "genres")

In [4]:
fullTrainSchema = StructType([
    StructField("user_id", LongType(), False),
    StructField("item_id", LongType(), False), 
    StructField("purchase", ByteType(), False),
])

fullTrainDf = spark.read.format("csv")\
    .schema(fullTrainSchema) \
    .option("header", "true") \
    .option("mode", "failfast") \
    .load("/labs/slaba03/laba03_train.csv")

In [None]:
# 5032624
# fullTrainDf.count()

In [5]:
unknownTestSchema = StructType([
    StructField("user_id", LongType(), False),
    StructField("item_id", LongType(), False),
    StructField("purchase", DoubleType(), True),
])

unknownTestDf = spark.read.format("csv")\
    .schema(unknownTestSchema) \
    .option("header", "true") \
    .option("mode", "failfast") \
    .load("/labs/slaba03/laba03_test.csv") \
    .select("user_id", "item_id")

In [None]:
# unknownTestDf.count()

In [6]:
viewItemsSchema = StructType([
    StructField("user_id", LongType(), False),
    StructField("item_id", LongType(), False),
    StructField("ts_start", LongType(), False),
    StructField("ts_end", LongType(), False),
    StructField("item_type", StringType(), False),
])

viewItemsDf = spark.read.format("csv")\
    .schema(viewItemsSchema) \
    .option("header", "true") \
    .option("mode", "failfast") \
    .load("/labs/slaba03/laba03_views_programmes.csv")

In [7]:
fullTrainDf.createOrReplaceTempView("full_train")
itemsDf.createOrReplaceTempView("items")
unknownTestDf.createOrReplaceTempView("unknown_test")
viewItemsDf.createOrReplaceTempView("views_of_items")

In [8]:
itemsGenresDf = spark.sql("""select item_id, genre
from (select item_id, split(genres, ',') as genres_arr from items)
lateral view outer explode(genres_arr) as genre""").groupBy("item_id")\
    .pivot("genre").count().na.fill(0)

In [None]:
# itemsGenresDf.count()

In [9]:
itemsGenresDf.createOrReplaceTempView("items_genres")

In [10]:
boughtGenresCols = ', '.join(["nvl(`bought_" + genreName + "`, 0) as `bought_" + genreName + "`" 
                              for genreName in itemsGenresDf.columns \
     if genreName not in ['item_id']])

boughtGenresAggrCols = ', '.join(["sum(items_genres.`" + genreName + "`) as `bought_" + genreName + "`" 
                            for genreName in itemsGenresDf.columns \
     if genreName not in ['item_id']])

In [11]:
genresBoughtByUserDf = spark.sql("select full_train.user_id, " + boughtGenresAggrCols + 
    """ from full_train join items_genres on full_train.item_id = items_genres.item_id where purchase = 1 
    group by user_id""")
genresBoughtByUserDf.createOrReplaceTempView("genres_bought_by_user")

In [12]:
userItemsBought = spark.sql("""select user_id, count(*) as user_bought_cnt from full_train where purchase = 1
    group by user_id""")
userItemsBought.createOrReplaceTempView("user_items_bought")

In [13]:
itemsBoughtCnt = spark.sql("""select item_id, count(*) as items_bought_cnt from full_train where purchase = 1
    group by item_id""")
itemsBoughtCnt.createOrReplaceTempView("items_bought_cnt")

In [14]:
viewGenresCols = ', '.join(["nvl(`view_" + genreName + "`, 0) as `view_" + genreName + "`" for genreName in itemsGenresDf.columns \
     if genreName not in ['item_id']])

viewGenresAggrCols = ', '.join(["sum(items_genres.`" + genreName + "`) as `view_" + genreName + "`" for genreName in itemsGenresDf.columns \
     if genreName not in ['item_id']])

In [15]:
genresViewedByUserDf = spark.sql("select views_of_items.user_id, " + viewGenresAggrCols + 
    " from views_of_items join items_genres on views_of_items.item_id = items_genres.item_id group by user_id")
genresViewedByUserDf.createOrReplaceTempView("genres_viewed_by_user")

In [16]:
def getFeatureSql(inputTableName, withLabelColumn=True):
    labelColumnExpr = ""
    if withLabelColumn:
        labelColumnExpr = ",full_train.purchase as label"
    return """select {0}.user_id,  
            nvl(items_bought_cnt, 0) as items_bought_cnt,
            nvl(user_bought_cnt, 0) as user_bought_cnt,
            items_genres.* ,
            {2},
            {3}
            {1}
            from {0} 
                join items_genres on {0}.item_id = items_genres.item_id
                left join items_bought_cnt on {0}.item_id = items_bought_cnt.item_id
                left join user_items_bought on {0}.item_id = user_items_bought.user_id
                left join genres_viewed_by_user on {0}.user_id = genres_viewed_by_user.user_id
                left join genres_bought_by_user on {0}.user_id = genres_bought_by_user.user_id"""\
            .format(inputTableName, labelColumnExpr, viewGenresCols, boughtGenresCols)

In [17]:
fullTrainWithFeaturesDf = spark.sql(getFeatureSql("full_train"))

In [None]:
# 5032624
# fullTrainWithFeaturesDf.count()

In [18]:
train = fullTrainWithFeaturesDf.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=41)
test = fullTrainWithFeaturesDf.join(train, on=["user_id", "item_id"], how="leftanti")

In [19]:
features_list = [column for column in fullTrainWithFeaturesDf.columns if column not in ["label", "user_id", "item_id"]]

In [20]:
len(features_list)

257

In [21]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline

vectorAssembler = VectorAssembler()\
    .setInputCols(features_list)\
    .setOutputCol("features")
scaler = StandardScaler().setInputCol("features").setOutputCol("norm_features")
layers = [len(features_list), 64, 2]
# layers = [len(features_list), 22, 2]
# layers = [len(features_list), 20, 5, 2]
mlp = MultilayerPerceptronClassifier(labelCol="label", \
                                     featuresCol="norm_features", \
                                     layers=layers, \
                                         seed=41)
pipelineStages = [vectorAssembler, scaler, mlp]
pipeline = Pipeline().setStages(pipelineStages)

In [22]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
    .addGrid(mlp.maxIter, [100])\
    .addGrid(mlp.blockSize, [128])\
    .build()

In [23]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setLabelCol("label")

In [24]:
from pyspark.ml.tuning import TrainValidationSplit
trainValidationSplit = TrainValidationSplit()\
    .setTrainRatio(0.999)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)

fittedTrainValidationSplit = trainValidationSplit.fit(train)

In [25]:
evaluator.evaluate(fittedTrainValidationSplit.transform(test))

0.9661337195768572

In [26]:
unknownTestWithFeaturesDf = spark.sql(getFeatureSql("unknown_test", withLabelColumn=False))

In [27]:
submissionPredictions = fittedTrainValidationSplit.transform(unknownTestWithFeaturesDf)

In [None]:
# from pyspark.storagelevel import StorageLevel
# submissionPredictions.persist(StorageLevel.MEMORY_AND_DISK)

In [None]:
# submissionPredictions.select("user_id", "item_id", "rawPrediction", "probability", "prediction").show(1, False, True)

In [28]:
from pyspark.sql.window import Window

vectorSecondElementUdf = F.udf(lambda v:float(v[1]),DoubleType())

resultDf = submissionPredictions.select("user_id", "item_id", "probability")\
    .withColumn("purchase", vectorSecondElementUdf(F.col("probability")))\
    .select((F.row_number().over(Window.orderBy(F.col("user_id"), F.col("item_id"))) -1 ).alias("idx"),
            "user_id", "item_id", "purchase") \
    .orderBy(F.asc("user_id"), F.asc("item_id"))

In [29]:
resultDf.repartition(1).write.format("csv").mode("overwrite").option("header", "true").save("lab03")

In [30]:
! hdfs dfs -ls lab03

Found 2 items
-rw-r--r--   3 nikita.mospan nikita.mospan          0 2021-03-17 10:36 lab03/_SUCCESS
-rw-r--r--   3 nikita.mospan nikita.mospan   89958090 2021-03-17 10:36 lab03/part-00000-997f6cce-350f-4e1d-a02a-a67b26d2ff92-c000.csv


In [31]:
! rm lab03.csv

In [32]:
! hdfs dfs -get lab03/part-00000-997f6cce-350f-4e1d-a02a-a67b26d2ff92-c000.csv lab03.csv

In [None]:
# ! ls -al

In [None]:
spark.stop()