## Лабораторная работа 3

### Задача

В вашем распоряжении имеется уже предобработанный и очищенный датасет с фактами покупок абонентами телепередач от компании E-Contenta. 

По доступным вам данным нужно предсказать вероятность покупки других передач этими, а возможно, и другими абонентами. При решении задачи запрещено использовать библиотеки pandas, sklearn (кроме sklearn.metrics), xgboost и другие. Исключение: если scikit-learn (или другая подобная библиотека) обёрнут в классы Transformer и Estimator, то его можно использовать.

In [1]:
import os
import sys
import json

In [2]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark installation.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

# PYSPARK_SUBMIT_ARGS is a single string, so it must be assigned exactly once:
# a second assignment replaces the first instead of appending to it.
# Previously the in-memory catalog setting was silently dropped by a later
# assignment that only carried the resource flags; both are combined here.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    '--conf spark.sql.catalogImplementation=in-memory '
    '--num-executors 3 --executor-memory 4g --driver-memory 3g '
    'pyspark-shell'
)

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages bundled with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run Spark's interactive shell bootstrap in this namespace; per the banner it
# prints, this makes a SparkSession available as `spark`.
# The context manager closes the script file once it has been read.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, FloatType
from pyspark.sql.functions import col,array_contains


### Данные

In [4]:
# List the lab's input files on HDFS (IPython shell escape).
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


###  laba03_train.csv 


* purchase - факты покупки 
* user_id - id пользователя
* item_id - id телепередачи


In [5]:
# Explicit schema for the training CSV: three nullable integer columns.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the training interactions; the first line of the file is a header.
df_user = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("/labs/slaba03/laba03_train.csv")
)


In [6]:
# Preview the first rows of the training set.
df_user.show(n=5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



###  laba03_test.csv 


* purchase - факты покупки (в тестовом файле этого столбца нет — его нужно предсказать)
* user_id - id пользователя
* item_id - id телепередачи

In [7]:
# Schema for the test CSV: only the (user, item) pair, both nullable integers.
# Note: this rebinds the `schema` name used for the training file.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

# Read the pairs to be scored; the first line of the file is a header.
df_user_test = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("/labs/slaba03/laba03_test.csv")
)


In [8]:
# Preview the first rows of the test set.
df_user_test.show(n=5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



###  laba03_items.csv


* item_id - Соответствует item_id в предыдущем файле.
* content_type - тип контента
* title - название передачи, текстовое поле.
* year - год выпуска передачи, число.
* genres - поле с жанрами передачи, разделёнными через запятую.


In [9]:
# Schema for the items catalogue. All columns are nullable; the datetime
# columns are kept as raw strings rather than parsed timestamps.
read_items_schema = (
    StructType()
    .add('item_id', IntegerType())
    .add('channel_id', IntegerType())
    .add('datetime_availability_start', StringType())
    .add('datetime_availability_stop', StringType())
    .add('datetime_show_start', StringType())
    .add('datetime_show_stop', StringType())
    .add('content_type', IntegerType())
    .add('title', StringType(), nullable=True)
    .add('year', FloatType(), nullable=True)
    .add('genres', StringType())
    .add('region_id', IntegerType())
)

# The items file is tab-separated (unlike the comma-separated train/test files)
# and has a header row.
df_items = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", "\t")
    .schema(read_items_schema)
    .load("/labs/slaba03/laba03_items.csv")
)

In [10]:
# Preview items without truncating long values (e.g. titles), in tabular layout.
df_items.show(n=5, truncate=False, vertical=False)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title                                                                                 |year  |genres |region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-------+---------+
|65667  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |на пробах только девушки (all girl auditions)                                         |2013.0|Эротика|null     |
|65669  |null      |1970-01-01T00:00:00Z

## laba03_views_programmes.csv

In [11]:
# Schema for the viewing log: who watched which item, the start/end values
# stored as integers, and the item type as a string.
read_users_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
    .add('ts_start', IntegerType())
    .add('ts_end', IntegerType())
    .add('item_type', StringType())
)

# Read the viewing log; the first line of the file is a header.
df_views_programmes = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(read_users_schema)
    .load("/labs/slaba03/laba03_views_programmes.csv")
)

In [12]:
# Preview the viewing log without truncating values, in tabular layout.
df_views_programmes.show(n=5, truncate=False, vertical=False)

+-------+-------+----------+----------+---------+
|user_id|item_id|ts_start  |ts_end    |item_type|
+-------+-------+----------+----------+---------+
|0      |7101053|1491409931|1491411600|live     |
|0      |7101054|1491412481|1491451571|live     |
|0      |7101054|1491411640|1491412481|live     |
|0      |6184414|1486191290|1486191640|live     |
|257    |4436877|1490628499|1490630256|live     |
+-------+-------+----------+----------+---------+
only showing top 5 rows



----------

In [20]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
# ROC AUC evaluator: compares the "prediction" score column against the
# "purchase" label column.
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)

In [23]:
# Fit ALS on the training data
als = ALS(maxIter=20, regParam=2.2, rank=6, coldStartStrategy="nan", \
          userCol='user_id', itemCol='item_id', ratingCol='purchase', \
          nonnegative=False, implicitPrefs=True, alpha=5.0, seed=87)
%time als_model = als.fit(df_user)

CPU times: user 10.3 ms, sys: 0 ns, total: 10.3 ms
Wall time: 18.3 s


In [24]:
predict_train = als_model.transform(df_user)
%time predict_train.show(5)

+-------+-------+--------+-------------+
|user_id|item_id|purchase|   prediction|
+-------+-------+--------+-------------+
| 754230|   8389|       0|  0.064516366|
| 780033|   8389|       0| 0.0013112826|
| 798454|   8389|       0|-2.7597987E-4|
| 825061|   8389|       0| -0.007014811|
| 833685|   8389|       0|   0.13137318|
+-------+-------+--------+-------------+
only showing top 5 rows

CPU times: user 0 ns, sys: 4.61 ms, total: 4.61 ms
Wall time: 31.7 s


In [25]:
# Check how many partitions back the scored training frame.
predict_train.rdd.getNumPartitions()

200

In [26]:
# Reduce the frame to 4 partitions and mark it for caching so the inspection
# and evaluation cells below can reuse it. Note: this rebinds `predict_train`.
predict_train = predict_train.coalesce(4).cache()

In [27]:
# Inspect column types of the scored frame (the prediction column's type
# matters for the evaluator used later).
predict_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [28]:
predict_train = predict_train.withColumn("prediction", predict_train.prediction.cast(DoubleType()))
%time predict_train.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- prediction: double (nullable = false)

CPU times: user 298 µs, sys: 71 µs, total: 369 µs
Wall time: 419 µs


In [29]:
# Summary statistics for every column of the scored training frame, including
# the distribution of prediction scores.
%time predict_train.summary().show()

+-------+-----------------+------------------+--------------------+--------------------+
|summary|          user_id|           item_id|            purchase|          prediction|
+-------+-----------------+------------------+--------------------+--------------------+
|  count|          5032624|           5032624|             5032624|             5032624|
|   mean|869680.9464782189| 66869.30485865823|0.002166662957534...|0.005121428529973152|
| stddev|60601.09821562932|35242.282055382544|0.046496977952915616|0.019777955581597964|
|    min|             1654|               326|                   0|-0.21212543547153473|
|    25%|           846231|             60351|                   0|-2.30404548346996...|
|    50%|           885247|             79853|                   0|                 0.0|
|    75%|           908726|             93602|                   0|0.003308809362351...|
|    max|           941450|            104165|                   1| 0.45947134494781494|
+-------+------------

In [30]:
# Compute ROC AUC on the training set.
# This scores the same rows the model was fitted on, so it measures fit to the
# training data rather than performance on held-out data.
%time rocauc_train = evaluator.evaluate(predict_train)
print(f'ROC AUC for train data: {rocauc_train}')

CPU times: user 1.38 ms, sys: 4.03 ms, total: 5.41 ms
Wall time: 12.8 s
ROC AUC for train data: 0.9685476387336736


In [32]:
# predict test data
predict_test = als_model.transform(df_user_test)
%time predict_test.show(5)

+-------+-------+-------------+
|user_id|item_id|   prediction|
+-------+-------+-------------+
| 822709|   8389|3.1188553E-19|
| 824008|   8389|-0.0017191223|
| 890476|   8389|          0.0|
| 899993|   8389|  8.513293E-4|
| 937345|   8389|  0.032060243|
+-------+-------+-------------+
only showing top 5 rows

CPU times: user 1.66 ms, sys: 395 µs, total: 2.06 ms
Wall time: 13.4 s


In [33]:
# Check how many partitions back the scored test frame.
predict_test.rdd.getNumPartitions()

200

In [34]:
# Reduce the frame to 4 partitions and mark it for caching so the summary and
# export cells below can reuse it. Note: this rebinds `predict_test`.
predict_test = predict_test.coalesce(4).cache()

In [35]:
# Summary statistics for the scored test frame; useful for comparing the
# prediction distribution with the one seen on the training set.
%time predict_test.summary().show()

+-------+-----------------+-----------------+--------------------+
|summary|          user_id|          item_id|          prediction|
+-------+-----------------+-----------------+--------------------+
|  count|          2156840|          2156840|             2156840|
|   mean|869652.3733920922|66896.00283609354|0.005018660489555...|
| stddev|60706.51616333836|35227.83130704636| 0.01914327488526151|
|    min|             1654|              326|         -0.20062923|
|    25%|           846231|            65667|       -2.3222984E-4|
|    50%|           885247|            79856|                 0.0|
|    75%|           908588|            93606|        0.0033056643|
|    max|           941450|           104165|          0.44166976|
+-------+-----------------+-----------------+--------------------+

CPU times: user 4.1 ms, sys: 0 ns, total: 4.1 ms
Wall time: 17.1 s


In [36]:
# Build the submission frame: expose the ALS score under the name "purchase"
# and sort rows by user, then item.
output = (
    predict_test
    .select('user_id', 'item_id', col('prediction').alias('purchase'))
    .orderBy('user_id', 'item_id')
)
# Preview the first submission rows.
output.show(n=5)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|         0.0|
|   1654|    678|         0.0|
|   1654|    691|         0.0|
|   1654|    696|1.7609971E-4|
|   1654|    763|0.0017800244|
+-------+-------+------------+
only showing top 5 rows



In [37]:
# Collect the submission to the driver and write it to a local CSV file.
# NOTE(review): to_csv() with default arguments also writes the pandas row index
# as an unnamed first column — confirm the checker expects that.
# NOTE(review): the task statement forbids pandas; confirm that using it only
# for the final export is acceptable.
output.toPandas().to_csv('lab03.csv')

In [38]:
# Shut down the Spark session once the results have been written.
spark.stop()