In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 5g --executor-cores 3 --driver-memory 5g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "lab03") 
conf.set("spark.sql.shuffle.partitions", 20)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.sql.functions import col, desc, pandas_udf, PandasUDFType, udf, regexp_replace, when, asc, lit, broadcast
from pyspark.sql.types import StructType, IntegerType, StructField, DateType, StringType, TimestampType, FloatType, ArrayType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

In [115]:
train_schema = StructType(fields = [
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('purchase', IntegerType()),
])

In [116]:
data_train = spark.read.option("header","true") \
                        .schema(train_schema) \
                        .csv('/labs/slaba03/laba03_train.csv')

In [117]:
data_train = data_train.na.fill(0)
data_train.show(10)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [118]:
test_schema = StructType(fields = [
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
])

In [119]:
data_test = spark.read.option("header","true") \
                        .schema(test_schema) \
                        .csv('/labs/slaba03/laba03_test.csv')

In [120]:
data_test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [121]:
items_schema = StructType(fields = [
    StructField('item_id', IntegerType()),
    StructField('channel_id', IntegerType()),
    StructField('datetime_availability_start', StringType()),
    StructField('datetime_availability_stop', StringType()),
    StructField('datetime_show_start', StringType()),
    StructField('datetime_show_stop', StringType()),
    StructField('content_type', IntegerType()),
    StructField('title', StringType()),
    StructField('year', FloatType()),
    StructField('genres', StringType()),
    StructField('region_id', IntegerType()),
])

In [122]:
data_items = spark.read.option("header","true") \
                        .option("sep", "\t") \
                        .schema(items_schema) \
                        .csv('/labs/slaba03/laba03_items.csv')

In [123]:
data_items = data_items.na.fill(0)
data_items.show(5)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+-------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|  year| genres|region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+-------+---------+
|  65667|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013.0|Эротика|        0|
|  65669|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|скуби ду: эротиче...|2011.0|Эротика|        0|
|  65668|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|гор

In [124]:
data_items.schema

StructType(List(StructField(item_id,IntegerType,true),StructField(channel_id,IntegerType,true),StructField(datetime_availability_start,StringType,true),StructField(datetime_availability_stop,StringType,true),StructField(datetime_show_start,StringType,true),StructField(datetime_show_stop,StringType,true),StructField(content_type,IntegerType,true),StructField(title,StringType,true),StructField(year,FloatType,false),StructField(genres,StringType,true),StructField(region_id,IntegerType,true)))

In [125]:
views_schema = StructType(fields = [
    StructField('user_id', IntegerType()),
    StructField('item_id', IntegerType()),
    StructField('ts_start', IntegerType()),
    StructField('ts_end', IntegerType()),
    StructField('item_type', StringType()),
])

In [126]:
data_views = spark.read.option("header","true") \
                    .schema(views_schema) \
                    .csv('/labs/slaba03/laba03_views_programmes.csv')

In [127]:
data_views = data_views.na.fill(0)
data_views.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



In [128]:
data_views.summary().show()

+-------+------------------+-----------------+------------------+--------------------+---------+
|summary|           user_id|          item_id|          ts_start|              ts_end|item_type|
+-------+------------------+-----------------+------------------+--------------------+---------+
|  count|          20845607|         20845607|          20845607|            20845607| 20845607|
|   mean| 843101.3175987152| 6814314.24027048|1.49020644555557E9|1.4902099667723315E9|     null|
| stddev|62833.218456075265|753861.9254537829| 2192980.211751445|   2192944.005817249|     null|
|    min|                 0|           -99996|        1485907219|          1485915673|     live|
|    25%|            805172|          6545947|        1488468074|          1488471193|     null|
|    50%|            850748|          6799520|        1490175289|          1490178721|     null|
|    75%|            891235|          7336145|        1492458177|          1492461073|     null|
|    max|            941970|  

In [129]:
data_views.select('item_type').distinct().show()

+---------+
|item_type|
+---------+
|      pvr|
|     live|
+---------+



In [130]:
data_views = data_views.withColumn('item_type', regexp_replace('item_type', 'live', '1'))
data_views = data_views.withColumn('item_type', regexp_replace('item_type', 'pvr', '0').cast(IntegerType()))

data_views.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|        1|
|      0|7101054|1491412481|1491451571|        1|
|      0|7101054|1491411640|1491412481|        1|
|      0|6184414|1486191290|1486191640|        1|
|    257|4436877|1490628499|1490630256|        1|
+-------+-------+----------+----------+---------+
only showing top 5 rows



In [131]:
data_views = data_views.join(data_views.groupBy('user_id').mean('item_type'), on='user_id', how='left')
data_views.show(5)

+-------+-------+----------+----------+---------+--------------+
|user_id|item_id|  ts_start|    ts_end|item_type|avg(item_type)|
+-------+-------+----------+----------+---------+--------------+
| 508880|6688384|1489584900|1489588500|        1|           1.0|
| 508880|6599570|1488911700|1488912294|        1|           1.0|
| 508880|6691613|1489605634|1489606500|        1|           1.0|
| 508880|6651555|1488913200|1488915300|        1|           1.0|
| 508880|6691615|1489607100|1489608000|        1|           1.0|
+-------+-------+----------+----------+---------+--------------+
only showing top 5 rows



In [132]:
data_views = data_views.withColumn('ts_duration', data_views['ts_end'] - data_views['ts_start'])
data_views.show(5)

+-------+-------+----------+----------+---------+--------------+-----------+
|user_id|item_id|  ts_start|    ts_end|item_type|avg(item_type)|ts_duration|
+-------+-------+----------+----------+---------+--------------+-----------+
| 508880|6688384|1489584900|1489588500|        1|           1.0|       3600|
| 508880|6599570|1488911700|1488912294|        1|           1.0|        594|
| 508880|6691613|1489605634|1489606500|        1|           1.0|        866|
| 508880|6651555|1488913200|1488915300|        1|           1.0|       2100|
| 508880|6691615|1489607100|1489608000|        1|           1.0|        900|
+-------+-------+----------+----------+---------+--------------+-----------+
only showing top 5 rows



In [133]:
data_views = data_views.withColumn('ts_duration_norm', data_views['ts_duration']/ \
                                   data_views.agg({'ts_duration': 'max'}).first()['max(ts_duration)'])
data_views.show(5)

+-------+-------+----------+----------+---------+--------------+-----------+--------------------+
|user_id|item_id|  ts_start|    ts_end|item_type|avg(item_type)|ts_duration|    ts_duration_norm|
+-------+-------+----------+----------+---------+--------------+-----------+--------------------+
| 508880|6688384|1489584900|1489588500|        1|           1.0|       3600| 0.03838239527470067|
| 508880|6599570|1488911700|1488912294|        1|           1.0|        594|0.006333095220325611|
| 508880|6691613|1489605634|1489606500|        1|           1.0|        866| 0.00923309841885855|
| 508880|6651555|1488913200|1488915300|        1|           1.0|       2100|0.022389730576908726|
| 508880|6691615|1489607100|1489608000|        1|           1.0|        900|0.009595598818675168|
+-------+-------+----------+----------+---------+--------------+-----------+--------------------+
only showing top 5 rows



In [134]:
data_views = data_views.withColumn('ts_start_norm', data_views['ts_start']/ \
                                   data_views.agg({'ts_start': 'max'}).first()['max(ts_start)'])
data_views = data_views.withColumn('ts_end_norm', data_views['ts_end']/ \
                                   data_views.agg({'ts_end': 'max'}).first()['max(ts_end)'])
data_views.show(5)

+-------+-------+----------+----------+---------+--------------+-----------+--------------------+------------------+------------------+
|user_id|item_id|  ts_start|    ts_end|item_type|avg(item_type)|ts_duration|    ts_duration_norm|     ts_start_norm|       ts_end_norm|
+-------+-------+----------+----------+---------+--------------+-----------+--------------------+------------------+------------------+
| 508880|6688384|1489584900|1489588500|        1|           1.0|       3600| 0.03838239527470067|0.9971387050439648|0.9971408919667917|
| 508880|6599570|1488911700|1488912294|        1|           1.0|        594|0.006333095220325611|0.9966880601856317|0.9966882349719282|
| 508880|6691613|1489605634|1489606500|        1|           1.0|        866| 0.00923309841885855|0.9971525845307335|0.9971529412918606|
| 508880|6651555|1488913200|1488915300|        1|           1.0|       2100|0.022389730576908726|0.9966890642962787|0.9966902472092146|
| 508880|6691615|1489607100|1489608000|        1

In [137]:
data_train = data_train.join(data_views.select('item_id', 'item_type').dropDuplicates(['item_id']), \
                             on='item_id', how='left')
data_test = data_test.join(data_views.select('item_id', 'item_type').dropDuplicates(['item_id']), \
                             on='item_id', how='left')
data_train = data_train.na.fill(0)
data_test = data_test.na.fill(0)

In [139]:
data_train.show(5)

+-------+-------+--------+---------+
|item_id|user_id|purchase|item_type|
+-------+-------+--------+---------+
|    326| 778082|       0|        0|
|    326| 778182|       0|        0|
|    326| 778808|       0|        0|
|    326| 778899|       0|        0|
|    326| 779484|       0|        0|
+-------+-------+--------+---------+
only showing top 5 rows



In [140]:
data_test.show(5)

+-------+-------+---------+
|item_id|user_id|item_type|
+-------+-------+---------+
|    326| 520446|        0|
|    326| 523860|        0|
|    326| 566701|        0|
|    326| 619378|        0|
|    326| 632495|        0|
+-------+-------+---------+
only showing top 5 rows



In [141]:
data_items.summary().show()

+-------+-----------------+----------+---------------------------+--------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------------+---------+
|summary|          item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|        content_type|               title|              year|              genres|region_id|
+-------+-----------------+----------+---------------------------+--------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------------+---------+
|  count|           635568|    635568|                       3704|                      3704|                  0|                 0|              635568|                3704|            635568|                3671|   635568|
|   mean|389.7514852856028|       0.0|                       null|                      null|       

In [142]:
data_items = data_items.withColumn('year', data_items['year'].cast(IntegerType()))

In [143]:
data_items.select('year').distinct().show(5)

+----+
|year|
+----+
|2009|
|1959|
|2011|
|1990|
|2014|
+----+
only showing top 5 rows



In [144]:
data_items.select('datetime_availability_start').distinct().show()

+---------------------------+
|datetime_availability_start|
+---------------------------+
|       1970-01-01T00:00:00Z|
|       2017-01-01T00:00:00Z|
+---------------------------+



In [145]:
data_items.select('datetime_availability_stop').distinct().count()

44071

In [146]:
datetime_index = StringIndexer(inputCol="datetime_availability_stop", outputCol="datetime_stop", handleInvalid='skip')
data_items = datetime_index.fit(data_items).transform(data_items)
data_items.show()

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+--------------------+---------+-------------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|year|              genres|region_id|datetime_stop|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+--------------------+---------+-------------+
|  65667|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013|             Эротика|        0|       1263.0|
|  65669|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|скуби ду: эротиче...|2011|             Эротика|        0|       1263.0|
|  65

In [147]:
data_items = data_items.withColumn('datetime_stop_norm', data_items['datetime_stop']/ \
                                   data_items.agg({'datetime_stop': 'max'}).first()['max(datetime_stop)'])
data_items.show(5)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+-------------+-------------------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|year| genres|region_id|datetime_stop| datetime_stop_norm|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+-------------+-------------------+
|  65667|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013|Эротика|        0|       1263.0|0.03234977716305517|
|  65669|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|скуби ду: эротиче...|2011|Эротика|        0|     

In [148]:
data_items = data_items.select('*', when(col('datetime_availability_start').contains("2017-01-01T00:00:00Z"), 1).otherwise(0).alias('datetime_start'))
data_items.show(5)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+-------------+-------------------+--------------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|year| genres|region_id|datetime_stop| datetime_stop_norm|datetime_start|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+-------------+-------------------+--------------+
|  65667|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013|Эротика|        0|       1263.0|0.03234977716305517|             0|
|  65669|         0|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|  

In [149]:
data_items.select('genres').distinct().count()

1077

In [150]:
genres = data_items.select('genres').rdd.flatMap(lambda x: x).collect()
genres

['Эротика',
 'Эротика',
 'Эротика',
 'Эротика',
 'Эротика',
 'Комедии',
 'Комедии,Мелодрамы',
 'Ужасы,Триллеры,Драмы,Фантастика,Зарубежные',
 'Ужасы,Комедии,Фантастика,Зарубежные',
 'Комедии,Мелодрамы,Наши',
 'Детективы,Триллеры,Драмы,Фантастика,Зарубежные',
 'Фантастика,Боевики,Зарубежные',
 'Детективы,Триллеры,Драмы,Криминал,Зарубежные',
 'Детективы,Триллеры,Драмы,Зарубежные',
 'Ужасы,Детективы,Триллеры,Зарубежные',
 'Комедии,Драмы,Зарубежные',
 'Военные,Боевики,Наши',
 'Приключения,Исторические,Мелодрамы,Боевики,Зарубежные',
 'Военные,Драмы,Исторические,Зарубежные',
 'Комедии,Драмы,Мелодрамы,Зарубежные',
 'Семейные,Документальные,Зарубежные',
 'Триллеры,Драмы,Зарубежные',
 'Ужасы,Детективы,Триллеры,Зарубежные',
 'Ужасы,Боевики,Зарубежные',
 'Зарубежные',
 'Триллеры,Драмы,Криминал,Зарубежные',
 'Приключения,Комедии,Боевики,Зарубежные',
 'Мультфильмы,Мультсериалы,Детские,Наши',
 'Ужасы,Зарубежные',
 'Приключения,Мультсериалы,Детские,Зарубежные',
 'Комедии,Наши',
 'Мультфильмы,Полномет

In [151]:
genres = list(set(genres))
genres.remove(None)
genres

['Драмы,Для взрослых,Наши',
 'Ужасы,Драмы,Зарубежные',
 'Ужасы,Детективы,Триллеры,Мелодрамы,Криминал',
 'Русские мультфильмы,Русские,Сериалы,Для всей семьи,Для детей,Развивающие,Детские песни',
 'Приключения,Фэнтези,Русские',
 'Приключения,Мелодрамы,Наши',
 'Русские мультфильмы,Сказки,Русские,Сериалы,Для детей',
 'Детективы,Триллеры,Криминал,Зарубежные',
 'Приключения,Фантастика',
 'Детективы,Триллеры,Русские,Боевики',
 'Русские,Сериалы,Для всей семьи,Для детей,Развивающие',
 'Комедии,Фантастика,Боевики',
 'Детективы,Боевики,Криминал',
 'Мистические,Приключения,Фэнтези,Боевики,Зарубежные',
 'Драмы,Исторические',
 'Приключения,Драмы,Мелодрамы,Боевики',
 'Детективы,Мелодрамы',
 'Военные,Приключения,Драмы,Наши',
 'Комедии,Триллеры,Наши',
 'Советское кино,Драмы,Исторические,Русские',
 'Западные мультфильмы,Сериалы,Для детей,Развивающие,Хочу всё знать,Зарубежные',
 'Военные,Драмы,Боевики,Наши',
 'Эротика,Драмы,Зарубежные',
 'Артхаус,Эротика',
 'Приключения,Комедии,Советское кино,Фильмы,Фант

In [152]:
genr = set()
for i in genres:
    for j in i.split(','):
        genr.add(j)

In [154]:
data_items.groupBy('genres').count().sort(desc('count')).show(truncate=False)

+------------------------------------------------------------+------+
|genres                                                      |count |
+------------------------------------------------------------+------+
|General                                                     |631864|
|Ужасы,Триллеры,Зарубежные                                   |79    |
|Мультфильмы,Детские,Союзмультфильм,Наши                     |72    |
|Комедии,Зарубежные                                          |66    |
|Эротика,Зарубежные                                          |58    |
|Комедии,Наши                                                |53    |
|Эротика                                                     |52    |
|Комедии,Драмы,Зарубежные                                    |50    |
|Драмы,Зарубежные                                            |48    |
|Триллеры,Драмы,Зарубежные                                   |46    |
|Ужасы,Зарубежные                                            |45    |
|Комедии,Мелодрамы,З

In [155]:
data_items = data_items.select('*', when((col('genres').contains("Сказки")) | (col('genres').contains("сказка")), 1).otherwise(0).alias('fairy_tale'))
data_items = data_items.select('*', when(col('genres').contains("Анимация"), 1).otherwise(0).alias('animation'))
data_items = data_items.select('*', when(col('genres').contains("Арт"), 1).otherwise(0).alias('art_house'))
data_items = data_items.select('*', when(col('genres').contains("Аниме"), 1).otherwise(0).alias('anime'))
data_items = data_items.select('*', when(col('genres').contains("Биография"), 1).otherwise(0).alias('biography'))
data_items = data_items.select('*', when(col('genres').contains("Боевик"), 1).otherwise(0).alias('action_movie'))
data_items = data_items.select('*', when(col('genres').contains("Вестерн"), 1).otherwise(0).alias('western'))
data_items = data_items.select('*', when(col('genres').contains("Видеоигры"), 1).otherwise(0).alias('video_game'))
data_items = data_items.select('*', when(col('genres').contains("Игры"), 1).otherwise(0).alias('games'))
data_items = data_items.select('*', when(col('genres').contains("Военны"), 1).otherwise(0).alias('military'))
data_items = data_items.select('*', when(col('genres').contains("Детективы"), 1).otherwise(0).alias('detective'))
data_items = data_items.select('*', when((col('genres').contains("Семейны")) | (col('genres').contains("Для всей семьи")), 1).otherwise(0).alias('family'))
data_items = data_items.select('*', when(col('genres').contains("Для взрослых"), 1).otherwise(0).alias('adults'))
data_items = data_items.select('*', when(col('genres').contains("Эротика"), 1).otherwise(0).alias('erotica'))
data_items = data_items.select('*', when(col('genres').contains("Развлекательные"), 1).otherwise(0).alias('entertainment'))
data_items = data_items.select('*', when((col('genres').contains("Наши")) | (col('genres').contains("Русские")), 1).otherwise(0).alias('our'))
data_items = data_items.select('*', when(col('genres').contains("Зарубежные"), 1).otherwise(0).alias('foreign'))
data_items = data_items.select('*', when(col('genres').contains("Криминал"), 1).otherwise(0).alias('crime'))
data_items = data_items.select('*', when(col('genres').contains("Комеди"), 1).otherwise(0).alias('comedy'))
data_items = data_items.select('*', when(col('genres').contains("Драм"), 1).otherwise(0).alias('drama'))
data_items = data_items.select('*', when(col('genres').contains("Мультфильм"), 1).otherwise(0).alias('cartoon'))
data_items = data_items.select('*', when(col('genres').contains("Приключени"), 1).otherwise(0).alias('adventure'))
data_items = data_items.select('*', when(col('genres').contains("General"), 1).otherwise(0).alias('general'))
data_items = data_items.select('*', when(col('genres').contains("Исторически"), 1).otherwise(0).alias('history'))
data_items = data_items.select('*', when(col('genres').contains("Советские"), 1).otherwise(0).alias('ussr'))
data_items = data_items.select('*', when(col('genres').contains("Триллер"), 1).otherwise(0).alias('thriller'))
data_items = data_items.select('*', when(col('genres').contains("Ужасы"), 1).otherwise(0).alias('horror'))
data_items = data_items.select('*', when((col('genres').contains("Мистические")), 1).otherwise(0).alias('mystical'))
data_items = data_items.select('*', when(col('genres').contains("Фантасти"), 1).otherwise(0).alias('fantastic'))
data_items = data_items.select('*', when(col('genres').contains("Короткометр"), 1).otherwise(0).alias('short_film'))
data_items = data_items.select('*', when(col('genres').contains("Полнометражные"), 1).otherwise(0).alias('full_length'))
data_items = data_items.select('*', when(col('genres').contains("Музыкальн"), 1).otherwise(0).alias('music'))
data_items = data_items.select('*', when(col('genres').contains("Спорт"), 1).otherwise(0).alias('sports'))
data_items = data_items.select('*', when(col('genres').contains("Мелодрам"), 1).otherwise(0).alias('melodrama'))
data_items = data_items.select('*', when(col('genres').contains("Сериалы"), 1).otherwise(0).alias('serials'))
data_items = data_items.select('*', when(col('genres').contains("Экранизации"), 1).otherwise(0).alias('film_adaptations'))
data_items = data_items.select('*', when(col('genres').contains("Романтические"), 1).otherwise(0).alias('romantic'))
data_items = data_items.select('*', when(col('genres').contains("Развивающие"), 1).otherwise(0).alias('developing'))
data_items = data_items.select('*', when(col('genres').contains("Хочу всё знать"), 1).otherwise(0).alias('want_know'))
data_items = data_items.select('*', when(col('genres').contains("Познавательные"), 1).otherwise(0).alias('educational'))
data_items = data_items.select('*', when(col('genres').contains("Советское кино"), 1).otherwise(0).alias('ussr_movie'))
data_items = data_items.select('*', when(col('genres').contains("Документальны"), 1).otherwise(0).alias('documentary'))
data_items = data_items.select('*', when(col('genres').contains("Кулинария"), 1).otherwise(0).alias('cooking'))
data_items = data_items.select('*', when(col('genres').contains("О здоровье"), 1).otherwise(0).alias('health'))
data_items = data_items.select('*', when(col('genres').contains("Прочие"), 1).otherwise(0).alias('other'))
data_items = data_items.select('*', when(col('genres').contains("Про животных"), 1).otherwise(0).alias('animal'))
data_items = data_items.select('*', when(col('genres').contains("Мюзиклы"), 1).otherwise(0).alias('musical'))
data_items = data_items.select('*', when(col('genres').contains("Охота и рыбалка"), 1).otherwise(0).alias('hunting'))
data_items = data_items.select('*', when(col('genres').contains("Передачи"), 1).otherwise(0).alias('broadcast'))
data_items = data_items.select('*', when(col('genres').contains("Научная фантастика"), 1).otherwise(0).alias('nauchpop'))
data_items = data_items.select('*', when(col('genres').contains("Юмористические"), 1).otherwise(0).alias('humorous'))
data_items = data_items.select('*', when(col('genres').contains("Реалити-шоу"), 1).otherwise(0).alias('reality_show'))
data_items = data_items.select('*', when(col('genres').contains("Западные мультфильмы"), 1).otherwise(0).alias('western cartoons'))
data_items = data_items.select('*', when(col('genres').contains("Союзмультфильм"), 1).otherwise(0).alias('union cartoons'))
data_items = data_items.select('*', when(col('genres').contains("Мультсериалы"), 1).otherwise(0).alias('animated_series'))
data_items = data_items.select('*', when(col('genres').contains("Фэнтези"), 1).otherwise(0).alias('fantasy'))
data_items = data_items.select('*', when(col('genres').contains("Для самых маленьких"), 1).otherwise(0).alias('small'))
data_items = data_items.select('*', when(col('genres').contains("Фильмы"), 1).otherwise(0).alias('movie'))
data_items = data_items.select('*', when(col('genres').contains("Русские мультфильмы"), 1).otherwise(0).alias('russian_cartoon'))
data_items = data_items.select('*', when(col('genres').contains("Фильмы-спектакли"), 1).otherwise(0).alias('films_performances'))
data_items = data_items.select('*', when((col('genres').contains("Детские")) | (col('genres').contains("Для детей")), 1).otherwise(0).alias('childish'))

In [156]:
data_items.show(5)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+----+-------+---------+-------------+-------------------+--------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|year| genres|region_id|datetime_stop| datetime_stop_n

In [157]:
data_items.columns

['item_id',
 'channel_id',
 'datetime_availability_start',
 'datetime_availability_stop',
 'datetime_show_start',
 'datetime_show_stop',
 'content_type',
 'title',
 'year',
 'genres',
 'region_id',
 'datetime_stop',
 'datetime_stop_norm',
 'datetime_start',
 'fairy_tale',
 'animation',
 'art_house',
 'anime',
 'biography',
 'action_movie',
 'western',
 'video_game',
 'games',
 'military',
 'detective',
 'family',
 'adults',
 'erotica',
 'entertainment',
 'our',
 'foreign',
 'crime',
 'comedy',
 'drama',
 'cartoon',
 'adventure',
 'general',
 'history',
 'ussr',
 'thriller',
 'horror',
 'mystical',
 'fantastic',
 'short_film',
 'full_length',
 'music',
 'sports',
 'melodrama',
 'serials',
 'film_adaptations',
 'romantic',
 'developing',
 'want_know',
 'educational',
 'ussr_movie',
 'documentary',
 'cooking',
 'health',
 'other',
 'animal',
 'musical',
 'hunting',
 'broadcast',
 'nauchpop',
 'humorous',
 'reality_show',
 'western cartoons',
 'union cartoons',
 'animated_series',
 'fantas

In [158]:
need_features = data_items.drop('channel_id', 'datetime_availability_start', 'datetime_availability_stop', 'datetime_show_start', \
                        'datetime_show_stop', 'title', 'genres', 'region_id', 'datetime_stop').dropDuplicates(['item_id'])

In [159]:
sample = data_train.sampleBy('purchase', fractions={0: 0.5, 1: 0.5}, seed=42).cache()

In [161]:
data_train = data_train.join(sample.groupBy('user_id').mean('purchase').dropDuplicates(['user_id']).\
                             select('user_id', col('avg(purchase)').alias('avg(purchase)_user')), on='user_id', how='left')
data_train = data_train.join(sample.groupBy('item_id').mean('purchase').dropDuplicates(['item_id']).\
                             select('item_id', col('avg(purchase)').alias('avg(purchase)_item')), on='item_id', how='left')

data_test = data_test.join(sample.groupBy('user_id').mean('purchase').dropDuplicates(['user_id']).\
                           select('user_id', col('avg(purchase)').alias('avg(purchase)_user')), on='user_id', how='left')
data_test = data_test.join(sample.groupBy('item_id').mean('purchase').dropDuplicates(['item_id']).\
                           select('item_id', col('avg(purchase)').alias('avg(purchase)_item')), on='item_id', how='left')

data_train = data_train.na.fill(0)
data_test = data_test.na.fill(0)

In [163]:
data_train.show(5)

+-------+-------+--------+---------+--------------------+--------------------+
|item_id|user_id|purchase|item_type|  avg(purchase)_user|  avg(purchase)_item|
+-------+-------+--------+---------+--------------------+--------------------+
|    326|   1654|       0|        0|0.003276003276003276|0.001490312965722...|
|    326| 625638|       0|        0|0.003861003861003861|0.001490312965722...|
|    326| 746687|       0|        0|                 0.0|0.001490312965722...|
|    326| 751383|       0|        0|                 0.0|0.001490312965722...|
|    326| 766411|       0|        0|0.005507474429583...|0.001490312965722...|
+-------+-------+--------+---------+--------------------+--------------------+
only showing top 5 rows



In [164]:
data_test.show(5)

+-------+-------+---------+--------------------+--------------------+
|item_id|user_id|item_type|  avg(purchase)_user|  avg(purchase)_item|
+-------+-------+---------+--------------------+--------------------+
|    326| 747780|        0|                 0.0|0.001490312965722...|
|    326| 752784|        0|0.001547987616099...|0.001490312965722...|
|    326| 760146|        0|0.001492537313432...|0.001490312965722...|
|    326| 774191|        0|0.002396166134185...|0.001490312965722...|
|    326| 780033|        0|0.001563721657544957|0.001490312965722...|
+-------+-------+---------+--------------------+--------------------+
only showing top 5 rows



In [165]:
data_train = data_train.join(need_features.select('*'), on='item_id', how='left')
data_train = data_train.na.fill(0)

data_test = data_test.join(need_features.select('*'), on='item_id', how='left')
data_test = data_test.na.fill(0)

In [166]:
train = data_train.sampleBy('purchase', fractions={0: 0.8, 1: 0.8}, seed=42).cache()
val = data_train.join(train, on=['user_id', 'item_id'], how='leftanti').cache()

In [167]:
assembler = VectorAssembler(inputCols=data_train.drop('purchase').columns, outputCol="features")

In [168]:
dataset = assembler.transform(train)

In [169]:
dataset.show(5)

+-------+-------+--------+---------+--------------------+--------------------+------------+----+------------------+--------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+--------------------+
|item_id|user_id|purchase|item_type|  avg(purchase)_user|  avg(purchase)_item|content_type|year|datetime_stop_norm|datetime_start|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|military|detective|family|adults|erotica|entertainment|

In [170]:
gbt = GBTClassifier(featuresCol='features', labelCol='purchase', maxIter=100, maxDepth=10)

In [171]:
model = gbt.fit(dataset)

In [172]:
val_vector = assembler.transform(val)

In [173]:
valid = model.transform(val_vector)

In [174]:
valid.show(5)

+-------+-------+--------+---------+--------------------+--------------------+------------+----+------------------+--------------+----------+---------+---------+-----+---------+------------+-------+----------+-----+--------+---------+------+------+-------+-------------+---+-------+-----+------+-----+-------+---------+-------+-------+----+--------+------+--------+---------+----------+-----------+-----+------+---------+-------+----------------+--------+----------+---------+-----------+----------+-----------+-------+------+-----+------+-------+-------+---------+--------+--------+------------+----------------+--------------+---------------+-------+-----+-----+---------------+------------------+--------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|purchase|item_type|  avg(purchase)_user|  avg(purchase)_item|content_type|year|datetime_stop_norm|datetime_start|fairy_tale|animation|art_house|anime|biography|action_movie|western|video_game|games|mi

In [175]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName='areaUnderROC')

In [176]:
evaluator.evaluate(valid)

0.921105151541694

In [177]:
model.featureImportances

SparseVector(70, {0: 0.1013, 1: 0.2727, 3: 0.1825, 4: 0.1407, 6: 0.0737, 7: 0.0192, 8: 0.0001, 9: 0.0013, 10: 0.0, 11: 0.0013, 12: 0.001, 13: 0.0032, 14: 0.0095, 15: 0.0027, 16: 0.0006, 17: 0.0003, 18: 0.0042, 19: 0.007, 20: 0.008, 21: 0.0023, 22: 0.0073, 23: 0.0004, 24: 0.0084, 25: 0.0107, 26: 0.0069, 27: 0.0114, 28: 0.0096, 29: 0.0071, 30: 0.01, 32: 0.0028, 33: 0.0016, 34: 0.0079, 35: 0.0085, 36: 0.0033, 37: 0.009, 38: 0.001, 39: 0.0086, 40: 0.0014, 41: 0.0016, 42: 0.0066, 43: 0.003, 44: 0.0003, 45: 0.0006, 46: 0.0011, 47: 0.0011, 48: 0.0005, 49: 0.0008, 50: 0.0046, 51: 0.0, 53: 0.0002, 54: 0.0028, 55: 0.0005, 56: 0.0002, 57: 0.0014, 58: 0.0, 59: 0.0003, 60: 0.001, 61: 0.0023, 62: 0.002, 63: 0.0044, 64: 0.0043, 65: 0.0031, 66: 0.0012, 67: 0.0023, 69: 0.0062})

In [178]:
best_pred = udf(lambda s: s.values.item(1), FloatType())

In [179]:
out_vector = assembler.transform(data_test)
out_predict = model.transform(out_vector)

In [180]:
out_predict = out_predict.withColumn('purchase', best_pred(col('probability')))

In [181]:
out_predict.select('user_id', 'item_id', 'purchase', 'probability').sort('user_id', 'item_id').show(5, truncate=False)

+-------+-------+-----------+-----------------------------------------+
|user_id|item_id|purchase   |probability                              |
+-------+-------+-----------+-----------------------------------------+
|1654   |336    |0.011937511|[0.9880624890424398,0.011937510957560238]|
|1654   |678    |0.0119286  |[0.9880713997059982,0.011928600294001779]|
|1654   |691    |0.011951895|[0.9880481043477533,0.011951895652246747]|
|1654   |696    |0.012546711|[0.9874532891297846,0.012546710870215416]|
|1654   |763    |0.011951895|[0.9880481043477533,0.011951895652246747]|
+-------+-------+-----------+-----------------------------------------+
only showing top 5 rows



In [182]:
print('Count on predictions data: ', out_predict.count())
print('Count on check data: ', data_test.count())

Count on predictions data:  2156840
Count on check data:  2156840


In [183]:
out_predict.select('user_id', 'item_id', 'purchase').sort('user_id', 'item_id').toPandas().to_csv('lab03.csv', sep=',', index=True)

In [None]:
spark.stop()