In [1]:
import os 
import sys


os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 6 --executor-memory 5g --executor-cores 4 --driver-memory 6g --conf spark.memory.storageFraction=0.3 --conf spark.executor.memoryOverhead=1g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("lab 3")
         .getOrCreate())

In [3]:
sc = spark.sparkContext

In [4]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [5]:
schema = T.StructType([
    StructField("user_id", T.IntegerType(), True),
    StructField("item_id", T.IntegerType(), True),
    StructField("purchase", T.IntegerType(), True),
])   

In [6]:
df_train = spark.read.format("csv"). \
                      schema(schema). \
                      option("header", True). \
                      load("/labs/slaba03/laba03_train.csv")
df_train.cache()

DataFrame[user_id: int, item_id: int, purchase: int]

In [7]:
df_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [8]:
df_train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [9]:
item_purchase_prob = df_train.groupBy("item_id").agg(F.mean("purchase").alias("item_purchase_prob"))

In [10]:
item_purchase_prob = item_purchase_prob.withColumn("item_purchase_prob", F.col("item_purchase_prob") * 100)

In [11]:
item_purchase_prob.summary().show()

+-------+------------------+-------------------+
|summary|           item_id| item_purchase_prob|
+-------+------------------+-------------------+
|  count|              3704|               3704|
|   mean| 66877.31425485961| 0.2165993075229777|
| stddev|35242.702380266725|0.44829320161235064|
|    min|               326|                0.0|
|    25%|             60351|0.07267441860465117|
|    50%|             79853|0.07429420505200594|
|    75%|             93602| 0.1516300227445034|
|    max|            104165|  7.153284671532846|
+-------+------------------+-------------------+



In [12]:
df_user_features = df_train.groupBy("user_id").agg(F.mean("purchase").alias("user_purchase_prob"))

In [13]:
df_user_features = df_user_features.withColumn("user_purchase_prob", F.col("user_purchase_prob") * 100)

In [14]:
df_user_features.summary().show()

+-------+-----------------+--------------------+
|summary|          user_id|  user_purchase_prob|
+-------+-----------------+--------------------+
|  count|             1941|                1941|
|   mean|869672.3745492015|  0.2164949883315816|
| stddev|60648.36081128855|  0.5807849230483886|
|    min|             1654|                 0.0|
|    25%|           846231|0.038387715930902115|
|    50%|           885247|  0.0761904761904762|
|    75%|           908588| 0.19282684149633628|
|    max|           941450|  18.617021276595743|
+-------+-----------------+--------------------+



In [15]:
df_user_features.cache()

DataFrame[user_id: int, user_purchase_prob: double]

In [16]:
df_user_features = F.broadcast(df_user_features)

In [17]:
df_train = df_train.join(F.broadcast(item_purchase_prob), "item_id", "left")

In [18]:
schema = T.StructType([
    StructField("user_id", T.IntegerType(), True),
    StructField("item_id", T.IntegerType(), True),
])

df_test = spark.read.format("csv"). \
                      schema(schema). \
                      option("header", True). \
                      load("/labs/slaba03/laba03_test.csv")
df_test.cache()
df_test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)



In [19]:
df_test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [20]:
df_test.count()

2156840

In [21]:
df_test = df_test.join(F.broadcast(item_purchase_prob), "item_id", "left")

In [22]:
df_test.count()

2156840

In [23]:
df_test.select("user_id").distinct().count()

1941

In [24]:
df_user_features.count()

1941

In [25]:
schema = T.StructType([
    StructField("user_id", T.IntegerType(), True),
    StructField("item_id", T.IntegerType(), True),
    StructField("ts_start", T.IntegerType(), True),
    StructField("ts_end", T.IntegerType(), True),
    StructField("item_type", T.StringType(), True)
])

df_views = spark.read.format("csv"). \
                      schema(schema). \
                      option("header", True). \
                      load("/labs/slaba03/laba03_views_programmes.csv")
df_views.cache()
df_views.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- ts_start: integer (nullable = true)
 |-- ts_end: integer (nullable = true)
 |-- item_type: string (nullable = true)



In [26]:
df_views.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



In [27]:
df_views.select("user_id").distinct().count()

79385

In [28]:
df_views = df_views.withColumn("ts_duration", F.col("ts_end") - F.col("ts_start"))

In [29]:
df_views = df_views.withColumn("is_live", (F.col("item_type") == F.lit("live")).cast(T.IntegerType()))

In [30]:
df_views = df_views.drop("item_type")

In [31]:
df_views = df_views.withColumn('ts_year', F.year(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_month', F.month(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_day', F.dayofmonth(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_hour', F.hour(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_minute', F.minute(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_second', F.second(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_week', F.weekofyear(F.to_timestamp('ts_start')))
df_views = df_views.withColumn('ts_day_of_week', F.dayofweek(F.to_timestamp('ts_start')))

In [32]:
df_views = df_views.withColumn("ts_duration_min", (F.col("ts_duration") / 60).cast(T.IntegerType()))

In [33]:
df_views = df_views.withColumn("ts_duration_hour", (F.col("ts_duration") / 3600).cast(T.IntegerType()))

In [34]:
df_views.select("ts_duration_hour").groupBy("ts_duration_hour").count().sort("ts_duration_hour").show(100)

+----------------+--------+
|ts_duration_hour|   count|
+----------------+--------+
|               0|15283245|
|               1| 2918778|
|               2|  987671|
|               3|  629477|
|               4|  655463|
|               5|  101911|
|               6|   62170|
|               7|   42483|
|               8|   36792|
|               9|   23090|
|              10|   18789|
|              11|   15700|
|              12|   13246|
|              13|    9487|
|              14|    7502|
|              15|    6169|
|              16|    5219|
|              17|    4451|
|              18|    4200|
|              19|    4150|
|              20|    3990|
|              21|    4121|
|              22|    3786|
|              23|    3713|
|              24|       3|
|              26|       1|
+----------------+--------+



In [34]:
df_views = df_views.drop("ts_start", "ts_end")

In [35]:
df_views.select("ts_year").groupBy("ts_year").count().show()

+-------+--------+
|ts_year|   count|
+-------+--------+
|   2017|20845607|
+-------+--------+



In [36]:
df_views.select("is_live").groupBy("is_live").count().show()

+-------+--------+
|is_live|   count|
+-------+--------+
|      1|17704201|
|      0| 3141406|
+-------+--------+



In [37]:
df_views.cache()

DataFrame[user_id: int, item_id: int, ts_duration: int, is_live: int, ts_year: int, ts_month: int, ts_day: int, ts_hour: int, ts_minute: int, ts_second: int, ts_week: int, ts_day_of_week: int, ts_duration_min: int, ts_duration_hour: int]

In [38]:
views_user_live_prob = df_views.groupBy("user_id").agg(F.mean("is_live").alias("views_user_live_prob"))
views_user_live_prob.summary().show()

+-------+-----------------+--------------------+
|summary|          user_id|views_user_live_prob|
+-------+-----------------+--------------------+
|  count|            79385|               79385|
|   mean|856331.6499464634|  0.8907404781862069|
| stddev|67407.01028043882| 0.20262261743898266|
|    min|                0|                 0.0|
|    25%|           817453|  0.8896551724137931|
|    50%|           867425|  0.9919354838709677|
|    75%|           909325|                 1.0|
|    max|           941970|                 1.0|
+-------+-----------------+--------------------+



In [39]:
df_user_features = df_user_features.join(F.broadcast(views_user_live_prob), "user_id", "left")

In [40]:
views_user_mean_sec = df_views.groupBy("user_id").agg(F.mean("ts_duration").alias("views_user_mean_sec"))
df_user_features = df_user_features.join(F.broadcast(views_user_mean_sec), "user_id", "left")
views_user_mean_sec.summary().show()

+-------+-----------------+-------------------+
|summary|          user_id|views_user_mean_sec|
+-------+-----------------+-------------------+
|  count|            79385|              79385|
|   mean|856331.6499464634|  4225.128783183427|
| stddev|67407.01028043882| 3617.5646767763205|
|    min|                0|              300.0|
|    25%|           817462|           2169.252|
|    50%|           867425|  3370.603550295858|
|    75%|           909346|  5132.411764705882|
|    max|           941970|            85916.0|
+-------+-----------------+-------------------+



In [41]:
views_user_mean_sec_live = df_views.filter(F.col("is_live") == 1).groupBy("user_id").agg(F.mean("ts_duration").alias("views_user_mean_sec_live"))
df_user_features = df_user_features.join(F.broadcast(views_user_mean_sec_live), "user_id", "left")
views_user_mean_sec_live.summary().show()

+-------+-----------------+------------------------+
|summary|          user_id|views_user_mean_sec_live|
+-------+-----------------+------------------------+
|  count|            78943|                   78943|
|   mean|856288.1839808469|       4487.200668822623|
| stddev|67386.24789242803|      3895.9746899889105|
|    min|                0|                   300.0|
|    25%|           817432|       2179.160714285714|
|    50%|           867371|       3551.238738738739|
|    75%|           909275|       5567.416666666667|
|    max|           941970|                 85916.0|
+-------+-----------------+------------------------+



In [42]:
views_user_mean_sec_pvr = df_views.filter(F.col("is_live") != 1).groupBy("user_id").agg(F.mean("ts_duration").alias("views_user_mean_sec_pvr"))
df_user_features = df_user_features.join(F.broadcast(views_user_mean_sec_pvr), "user_id", "left")
views_user_mean_sec_pvr.summary().show()

+-------+-----------------+-----------------------+
|summary|          user_id|views_user_mean_sec_pvr|
+-------+-----------------+-----------------------+
|  count|            45777|                  45777|
|   mean|852349.5113048038|       2574.39233133019|
| stddev|66283.71685162785|     1997.4501158390106|
|    min|             1654|                  300.0|
|    25%|           812351|                 1614.0|
|    50%|           861555|               2348.875|
|    75%|           904010|                3121.75|
|    max|           941872|                81744.0|
+-------+-----------------+-----------------------+



In [43]:
df_user_features.cache()

DataFrame[user_id: int, user_purchase_prob: double, views_user_live_prob: double, views_user_mean_sec: double, views_user_mean_sec_live: double, views_user_mean_sec_pvr: double]

In [44]:
for d in range(7):
    df_views = df_views.withColumn(f"ts_day_of_week_{d}", (F.col("ts_day_of_week") == d).cast(T.IntegerType()))
    
    df = df_views.filter(F.col("is_live") == 1).groupBy("user_id"). \
        agg(F.mean(f"ts_day_of_week_{d}").alias(f"views_user_day_{d}_mean_live"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    df = df_views.filter(F.col("is_live") == 1).groupBy("user_id"). \
        agg(F.mean(F.col(f"ts_day_of_week_{d}") * F.col("ts_duration")).alias(f"views_user_day_{d}_mean_sec_live"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    
    df = df_views.filter(F.col("is_live") != 1).groupBy("user_id"). \
        agg(F.mean(f"ts_day_of_week_{d}").alias(f"views_user_day_{d}_mean_pvr"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    df = df_views.filter(F.col("is_live") != 1).groupBy("user_id"). \
        agg(F.mean(F.col(f"ts_day_of_week_{d}") * F.col("ts_duration")).alias(f"views_user_day_{d}_mean_sec_pvr"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    

In [45]:
for h in range(24):
    df_views = df_views.withColumn(f"ts_hour_{h}", (F.col("ts_hour") == h).cast(T.IntegerType()))
    
    df = df_views.filter(F.col("is_live") == 1).groupBy("user_id"). \
        agg(F.mean(f"ts_hour_{h}").alias(f"views_user_hour_{h}_mean_live"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    df = df_views.filter(F.col("is_live") == 1).groupBy("user_id"). \
        agg(F.mean(F.col(f"ts_hour_{h}") * F.col("ts_duration")).alias(f"views_user_hour_{h}_mean_sec_live"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    
    df = df_views.filter(F.col("is_live") != 1).groupBy("user_id"). \
        agg(F.mean(f"ts_hour_{h}").alias(f"views_user_hour_{h}_mean_pvr"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    df = df_views.filter(F.col("is_live") != 1).groupBy("user_id"). \
        agg(F.mean(F.col(f"ts_hour_{h}") * F.col("ts_duration")).alias(f"views_user_hour_{h}_mean_sec_pvr"))
    df_user_features = df_user_features.join(F.broadcast(df), "user_id", "left")
    

In [46]:
schema = T.StructType([
    StructField("item_id", T.IntegerType(), True),
    StructField("channel_id", T.IntegerType(), True),
    StructField("datetime_availability_start", T.TimestampType(), True),
    StructField("datetime_availability_stop", T.TimestampType(), True),
    StructField("datetime_show_start", T.TimestampType(), True),
    StructField("datetime_show_stop", T.TimestampType(), True),
    StructField("content_type", T.IntegerType(), True),
    StructField("title", T.StringType(), True),
    StructField("year", T.FloatType(), True),
    StructField("genres", T.StringType(), True),
    StructField("region_id", T.IntegerType(), True)
])

df_items = spark.read.format("csv"). \
                      schema(schema). \
                      option("header", True). \
                      option("sep", "\t"). \
                      load("/labs/slaba03/laba03_items.csv")
df_items = df_items.withColumn("year", F.col("year").cast(T.IntegerType()))

df_items.cache()
df_items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: integer (nullable = true)
 |-- datetime_availability_start: timestamp (nullable = true)
 |-- datetime_availability_stop: timestamp (nullable = true)
 |-- datetime_show_start: timestamp (nullable = true)
 |-- datetime_show_stop: timestamp (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: integer (nullable = true)



In [47]:
df_items.show(5)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|year| genres|region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+
|  65667|      null|        1970-01-01 03:00:00|       2018-01-01 03:00:00|               null|              null|           1|на пробах только ...|2013|Эротика|     null|
|  65669|      null|        1970-01-01 03:00:00|       2018-01-01 03:00:00|               null|              null|           1|скуби ду: эротиче...|2011|Эротика|     null|
|  65668|      null|        1970-01-01 03:00:00|       2018-01-01 03:00:00|               null|              null|           1|горячие девоч

In [48]:
df_items = df_items.withColumn("genres_list", F.split(F.col("genres"), ","))

In [49]:
genres = [x[0] for x in df_items.select(F.explode("genres_list")).distinct().collect()]

In [50]:
genres

['Ужасы',
 'Анимация',
 'Мелодрама',
 'Исторические',
 'Мистические',
 'Мультфильмы',
 'Триллер',
 'Документальный',
 'Хочу всё знать',
 'Зарубежные',
 'Приключение',
 'Семейный',
 'Союзмультфильм',
 ' сказка',
 'Западные мультфильмы',
 'Фильмы',
 'Юмористические',
 'Детские песни',
 'Развлекательные',
 'Игры',
 'Передачи',
 'Короткометражки',
 'Русские мультфильмы',
 'Боевики',
 'Мелодрамы',
 'Мультфильм',
 'Для детей',
 'Эротика',
 'Аниме',
 'Фантастические',
 'Короткометражные',
 'Наши',
 'Военные',
 'Спортивные',
 'Детские',
 'Семейные',
 'Советское кино',
 'Драмы',
 'Комедия',
 'Криминал',
 'О здоровье',
 'Мюзиклы',
 'Для взрослых',
 'Приключения',
 'Фильмы в 3D',
 'Военный',
 'Романтические',
 'Познавательные',
 'Спорт',
 'Охота и рыбалка',
 'Фильмы-спектакли',
 'Комедии',
 'Полнометражные',
 'Сериалы',
 'Для всей семьи',
 'Мультфильмы в 3D',
 'Экранизации',
 'Документальные',
 'Арт-хаус',
 'Для самых маленьких',
 'Боевик',
 'Развивающие',
 'Фантастика',
 'Биография',
 'Сказки',


In [51]:
t = df_items.select("genres").distinct().collect()

In [52]:
t = [x.genres for x in t if x.genres and "сказка" in x.genres]

In [53]:
t

['Мелодрама,Фэнтези, сказка,Криминал,Триллер,Драма',
 'Ужасы,Боевик,Фэнтези, сказка,Комедия',
 'Ужасы,Фэнтези, сказка,Драма']

In [54]:
genres.remove(" сказка")

In [55]:
df_items = df_items.select("*",
    *[F.array_contains("genres_list", x).alias(f"genre_{x}").cast("integer") for x in genres]
)

In [56]:
df_items = df_items.drop("genres", "genres_list")

In [57]:
df_items = df_items.withColumn("datetime_availability_start", F.to_date("datetime_availability_start"))
df_items = df_items.withColumn("datetime_availability_stop", F.to_date("datetime_availability_stop"))
df_items = df_items.withColumn("datetime_show_start", F.to_date("datetime_show_start"))
df_items = df_items.withColumn("datetime_show_stop", F.to_date("datetime_show_stop"))

In [58]:
df_items.select("datetime_availability_start", "datetime_availability_stop", "datetime_show_start", "datetime_show_stop"). \
where(F.col("datetime_show_stop").isNotNull() | F.col("datetime_show_start").isNotNull()).show(100)

+---------------------------+--------------------------+-------------------+------------------+
|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|
+---------------------------+--------------------------+-------------------+------------------+
+---------------------------+--------------------------+-------------------+------------------+



In [59]:
df_items = df_items.drop("datetime_show_start", "datetime_show_stop")

In [60]:
df_items.select("channel_id").distinct().show()

+----------+
|channel_id|
+----------+
|      null|
+----------+



In [61]:
df_items.select("content_type").groupBy("content_type").count().show()

+------------+------+
|content_type| count|
+------------+------+
|        null|631864|
|           1|  3704|
+------------+------+



In [62]:
df_items.select("region_id").distinct().show()

+---------+
|region_id|
+---------+
|     null|
+---------+



In [63]:
df_items.select("datetime_availability_start").groupBy("datetime_availability_start").count().show()

+---------------------------+------+
|datetime_availability_start| count|
+---------------------------+------+
|                       null|631864|
|                 2017-01-01|     2|
|                 1970-01-01|  3702|
+---------------------------+------+



In [64]:
df_items.select("datetime_availability_stop").groupBy("datetime_availability_stop").count().show()

+--------------------------+------+
|datetime_availability_stop| count|
+--------------------------+------+
|                      null|631864|
|                2017-08-21|     1|
|                2018-12-31|    10|
|                2100-01-01|  3599|
|                2018-01-01|    94|
+--------------------------+------+



In [65]:
df_items = df_items.drop("channel_id", "region_id")

In [66]:
df_items = df_items.withColumn("is_limited", (F.col("datetime_availability_stop") < F.lit("2100-01-01")).cast("int"))

In [67]:
df_items.select("datetime_availability_start", "datetime_availability_stop", "content_type", "is_limited").groupBy("datetime_availability_start", "datetime_availability_stop", "content_type", "is_limited").count().show()

+---------------------------+--------------------------+------------+----------+------+
|datetime_availability_start|datetime_availability_stop|content_type|is_limited| count|
+---------------------------+--------------------------+------------+----------+------+
|                 2017-01-01|                2018-01-01|           1|         1|     2|
|                 1970-01-01|                2018-01-01|           1|         1|    92|
|                       null|                      null|        null|      null|631864|
|                 1970-01-01|                2018-12-31|           1|         1|    10|
|                 1970-01-01|                2017-08-21|           1|         1|     1|
|                 1970-01-01|                2100-01-01|           1|         0|  3599|
+---------------------------+--------------------------+------------+----------+------+



In [68]:
df_items = df_items.filter(F.col("content_type").isNotNull())

In [69]:
df_items.select("genre_Эротика").groupBy("genre_Эротика").count().show()

+-------------+-----+
|genre_Эротика|count|
+-------------+-----+
|         null|   33|
|            1|  124|
|            0| 3547|
+-------------+-----+



In [70]:
df_items = df_items.na.fill(value=0,subset=[col for col in df_items.columns if col.startswith("genre_")])

In [71]:
df_items.filter(F.col("year").isNull()).show()

+-------+---------------------------+--------------------------+------------+--------------------+----+-----------+--------------+---------------+------------------+-----------------+-----------------+-------------+--------------------+--------------------+----------------+-----------------+--------------+--------------------+--------------------------+------------+--------------------+-------------------+---------------------+----------+--------------+---------------------+-------------------------+-------------+---------------+----------------+---------------+-------------+-----------+--------------------+----------------------+----------+-------------+----------------+-------------+--------------+--------------------+-----------+-------------+--------------+----------------+-------------+------------------+-----------------+-----------------+-------------+-------------------+--------------------+-----------+---------------------+----------------------+-------------+-----------------

In [72]:
df_items = df_items.na.fill(value=2000, subset=["year"])

In [73]:
df_items = df_items.drop("title", "datetime_availability_start", "datetime_availability_stop")

In [74]:
df_train = df_train.join(F.broadcast(df_user_features), "user_id", "left")


In [75]:
df_test = df_test.join(F.broadcast(df_user_features), "user_id", "left")

In [76]:
df_test.count()

2156840

In [77]:
df_train_items = df_train.join(df_items, "item_id", "left")
df_train_items.cache()
print(df_train_items.count(), len(df_train_items.columns))

5032624 218


In [78]:
df_test_items = df_test.join(df_items, "item_id", "left")
df_test_items.cache()

DataFrame[item_id: int, user_id: int, item_purchase_prob: double, user_purchase_prob: double, views_user_live_prob: double, views_user_mean_sec: double, views_user_mean_sec_live: double, views_user_mean_sec_pvr: double, views_user_day_0_mean_live: double, views_user_day_0_mean_sec_live: double, views_user_day_0_mean_pvr: double, views_user_day_0_mean_sec_pvr: double, views_user_day_1_mean_live: double, views_user_day_1_mean_sec_live: double, views_user_day_1_mean_pvr: double, views_user_day_1_mean_sec_pvr: double, views_user_day_2_mean_live: double, views_user_day_2_mean_sec_live: double, views_user_day_2_mean_pvr: double, views_user_day_2_mean_sec_pvr: double, views_user_day_3_mean_live: double, views_user_day_3_mean_sec_live: double, views_user_day_3_mean_pvr: double, views_user_day_3_mean_sec_pvr: double, views_user_day_4_mean_live: double, views_user_day_4_mean_sec_live: double, views_user_day_4_mean_pvr: double, views_user_day_4_mean_sec_pvr: double, views_user_day_5_mean_live: do

In [None]:
for col in df_train_items.columns:
    df_train_items = df_train_items.withColumnRenamed(col, col.replace(" ", "_"))
    
for col in df_test_items.columns:
    df_test_items = df_test_items.withColumnRenamed(col, col.replace(" ", "_"))
    
df_train_items.write.parquet("df_train_items", mode="overwrite")
df_test_items.write.parquet("df_test_items", mode="overwrite")

In [90]:
df_train_items = spark.read.parquet("df_train_items")

In [91]:
df_test_items = spark.read.parquet("df_test_items")

In [92]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [99]:
assembler = VectorAssembler(
    inputCols=[x for x in df_train_items.columns if x != "purchase"],
    outputCol='features', handleInvalid="keep")

In [85]:
train_data = assembler.transform(df_train_items).select("purchase", "features")

In [86]:
train_data.cache()

DataFrame[purchase: int, features: vector]

In [94]:
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel

iteration = 50
model = GBTClassificationModel.load(f"model_{iteration}")

In [232]:
from datetime import datetime

st = datetime.now()

iteration = 50
gbt = GBTClassifier(labelCol="purchase",
                    featuresCol="features", maxIter=iteration, seed=56456)

model = gbt.fit(train_data)

print(datetime.now() - st)

4:03:09.831484


In [181]:
from datetime import datetime

st = datetime.now()


iteration = 50
gbt = GBTClassifier(labelCol="purchase",
                    featuresCol="features", maxIter=iteration, seed=56456)

evaluator = BinaryClassificationEvaluator(labelCol="purchase")

# no parameter search
paramGrid = ParamGridBuilder().build()

# 6-fold cross validation
crossval = CrossValidator(
    estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=6)


model = crossval.fit(train_data)

print(datetime.now() - st)

print("trained GBT classifier:%s" % model)

# display CV score
auc_roc = model.avgMetrics[0]
print("AUC ROC = %g" % auc_roc)

Py4JJavaError: An error occurred while calling o2586.fit.
: org.apache.spark.SparkException: Job 1718 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:954)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:952)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:952)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2164)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2077)
	at org.apache.spark.SparkContext$$anonfun$stop$6.apply$mcV$sp(SparkContext.scala:1949)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1948)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:121)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:129)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:330)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:55)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:206)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:156)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:156)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [233]:
model.write().overwrite().save(f"model_{iteration}")

In [None]:
# TODO: fill Null in dt_test_items with for each value

for col in dt_test_items.columns:
    c = dt_test_items.select(col).filter(F.col(col).isNull()).count()
    
    if c > 0:
        print(col, c)

In [None]:
# TODO: fill Null in dt_user_features with for each value

for col in dt_user_features.columns:
    c = dt_user_features.select(col).filter(F.col(col).isNull()).count()
    
    if c > 0:
        print(col, c)

views_user_live_prob 3
views_user_mean_sec 3
views_user_mean_sec_live 5
views_user_mean_sec_pvr 251


In [101]:
predictions = model.transform(assembler.transform(df_test_items))

In [102]:
get_probability = F.udf(lambda x: float(x[1]), T.FloatType())
predictions = predictions.withColumn('purchase', get_probability(F.col('probability')))

In [103]:
predictions = predictions.select("user_id", "item_id", "purchase").orderBy("user_id", "item_id")

In [104]:
submission = predictions.toPandas()
submission.to_csv("lab03.csv", header="true")

In [105]:
submission.shape

(2156840, 3)

In [106]:
submission.head()

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.021804
1,1654,678,0.021804
2,1654,691,0.021804
3,1654,696,0.021893
4,1654,763,0.021804


In [None]:

spark.stop()