# Лаба 3. Рекомендательная система видео-контента

In [31]:
# Stop a Spark session left over from a previous run of this notebook.
# NOTE(review): `_spark` is not assigned anywhere in the visible notebook, so on
# a fresh kernel the bare call raised NameError and broke "Restart & Run All".
# Look the name up defensively and only stop the session when it exists.
_stale_session = globals().get("_spark")
if _stale_session is not None:
    _stale_session.stop()

In [2]:
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark distribution,
# and request three executors for the driver started from this kernel.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ["SPARK_HOME"]

# Make the PySpark sources and the bundled py4j importable; each entry is
# pushed to the front of sys.path, so the py4j archive ends up first.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Default configuration; the session is created (or reused) under the
# application name "lab03".
conf = SparkConf()
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("lab03")
    .getOrCreate()
)

In [3]:
# Print the application id of the running Spark application, then display the
# session object itself. Uses the public `sparkContext` accessor rather than
# the private `_sc` attribute.
print(spark.sparkContext.applicationId)
spark

application_1667306389915_2234


In [5]:
# Numeric / tabular helpers.
import numpy as np 
import pandas as pd
# Spark ML building blocks. Only RandomForestRegressor and VectorAssembler are
# referenced by the visible cells; the remaining names look like leftovers.
from pyspark.ml.regression import GBTRegressor, RandomForestRegressor
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# NOTE(review): wildcard import; no name from it is referenced in the visible
# cells. Consider importing only the types that are actually needed.
from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, lower
from urllib.parse import urlparse

from pyspark.sql.functions import lit


# Register a checkpoint directory (relative path) with the Spark context.
spark.sparkContext.setCheckpointDir('checkpoint/')

In [34]:
# Short alias for the session's SparkContext; the last cell uses it to shut
# the application down.
sc = spark.sparkContext

In [16]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
# List the lab's input files on HDFS with human-readable sizes.
! hdfs dfs -ls -h /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs     86.8 M 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs     28.6 M 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs     71.5 M 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs    830.9 M 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


## Читаем данные

In [39]:
# Peek at the raw items file (tab-separated, header row, no schema inference)
# and print the first 40 rows without truncating long values.
items_preview = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    sep="\t",
    header=True,
)
items_preview.show(40, vertical=False, truncate=False)

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-----------------------------------------------------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title                                                                                 |year  |genres                                               |region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------------------------------------------------------------------------+------+-----------------------------------------------------+---------+
|65667  |null      |1970-01-01T00:00:00Z       |2018-01-01T00:00:00Z      |null               |null              |1           |на пробах тольк

In [38]:
# Item catalogue: tab-separated file with a header row; column types are
# inferred and the resulting DataFrame is cached.
items = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    sep="\t",
    header=True,
    inferSchema="true",
).cache()
items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: double (nullable = true)
 |-- datetime_availability_start: timestamp (nullable = true)
 |-- datetime_availability_stop: timestamp (nullable = true)
 |-- datetime_show_start: timestamp (nullable = true)
 |-- datetime_show_stop: timestamp (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: double (nullable = true)



In [43]:
# Pairs (user_id, item_id) to score. Comma-separated with a header row; the
# `purchase` column is inferred as string here (see the schema printed below).
test = spark.read.csv(
    "/labs/slaba03/laba03_test.csv",
    sep=",",
    header=True,
    inferSchema="true",
).cache()
test.printSchema()
test.show(10)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)



In [45]:
# Labelled (user_id, item_id, purchase) rows. The raw read is cached; nulls in
# numeric columns are then replaced with -1 on top of the cached frame.
train_loaded = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    sep=",",
    header=True,
    inferSchema="true",
).cache()
train = train_loaded.na.fill(-1)
train.printSchema()
train.show(10)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
|   1654|  66187|       0|
|   1654|  84350|       0|
|   1654|  92854|       0|
|   1654|  72811|       0|
|   1654|  86876|       0|
+-------+-------+--------+
only showing top 10 rows



In [47]:
%%time
views = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", sep=",",inferSchema="true",header=True).cache()
views = views.na.fill(-1)
views.printSchema()
views.show(10)


root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- ts_start: integer (nullable = true)
 |-- ts_end: integer (nullable = true)
 |-- item_type: string (nullable = true)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
|   1654|7489015|1493434801|1493435401|     live|
|   1654|7489023|1493444101|1493445601|     live|
|   1654|6617053|1489186156|1489200834|     live|
|   1654|6438693|1487840070|1487840433|     live|
|   1654|6526859|1488705452|1488706154|     live|
+-------+-------+----------+----------+---------+
only showing top 10 rows



In [48]:
# Collapse the viewing log into one row per user:
#   sum(time_spend) - total watched time (ts_end - ts_start summed over views)
#   count           - number of view records
# A single aggregation replaces the former two groupBy passes joined back on
# user_id. Output columns and their names are unchanged, because the feature
# cells below rename "sum(time_spend)" and use "count" as-is.
# NOTE: `views` is rebound from the raw log to the per-user aggregate, so this
# cell can only be re-run after re-running the cell that loads the raw log.
views = (
    views
    .withColumn("time_spend", col("ts_end") - col("ts_start"))
    .groupBy("user_id")
    .agg(
        expr("sum(time_spend)").alias("sum(time_spend)"),
        expr("count(1)").alias("count"),
    )
)
views.show(10)

+-------+---------------+-----+
|user_id|sum(time_spend)|count|
+-------+---------------+-----+
| 561425|          37699|    5|
| 612390|           4406|    1|
| 612597|         687126|  132|
| 632436|        2383871|  464|
| 701909|         843853|  165|
| 730383|         489816|  282|
| 738276|        1581380|  202|
| 741712|         660541|   51|
| 745576|           9956|    9|
| 747718|         169489|    5|
+-------+---------------+-----+
only showing top 10 rows



## Предобработка данных

In [49]:
%%time
train_avg_purchase = train.groupBy('user_id').sum('purchase')
train_avg_item = train.groupBy('item_id').sum('purchase')

mod_train = train.join(train_avg_purchase, on="user_id", how="left")
mod_train = mod_train.withColumnRenamed("sum(purchase)", "sum_user")
mod_train = mod_train.join(train_avg_item, on="item_id", how="left")
mod_train = mod_train.withColumnRenamed("sum(purchase)", "sum_item")

mod_train = mod_train.join(views, on="user_id", how="left")
mod_train = mod_train.withColumnRenamed("sum(time_spend)", "time_spend")

mod_train = mod_train.na.fill(0.5)

In [50]:
%%time
# Materialise and inspect the first 30 rows of the training feature table.
mod_train.show(30)

+-------+-------+--------+--------+--------+----------+-----+
|user_id|item_id|purchase|sum_user|sum_item|time_spend|count|
+-------+-------+--------+--------+--------+----------+-----+
| 754230|  77748|       0|      72|       1|   2256455| 1164|
| 754230|  89249|       0|      72|       2|   2256455| 1164|
| 754230|  77917|       0|      72|       4|   2256455| 1164|
| 754230|   7773|       0|      72|       2|   2256455| 1164|
| 754230|  79875|       0|      72|       5|   2256455| 1164|
| 754230|  99703|       0|      72|       1|   2256455| 1164|
| 754230|  91958|       0|      72|       1|   2256455| 1164|
| 754230|  74423|       0|      72|       3|   2256455| 1164|
| 754230|  98670|       0|      72|       2|   2256455| 1164|
| 754230|  95773|       0|      72|       1|   2256455| 1164|
| 754230|  75005|       0|      72|       5|   2256455| 1164|
| 754230|  99698|       0|      72|       1|   2256455| 1164|
| 754230|  79413|       0|      72|      23|   2256455| 1164|
| 754230

In [51]:
%%time
mod_test = test.join(train_avg_purchase, on="user_id", how="left")
mod_test = mod_test.withColumnRenamed("sum(purchase)", "sum_user")

mod_test = mod_test.join(train_avg_item, on="item_id", how="left")
mod_test = mod_test.withColumnRenamed("sum(purchase)", "sum_item")

mod_test = mod_test.join(views, on="user_id", how="left")
mod_test = mod_test.withColumnRenamed("sum(time_spend)", "time_spend")

mod_test = mod_test.na.fill(0.5)

CPU times: user 3.68 ms, sys: 0 ns, total: 3.68 ms
Wall time: 57.9 ms


## Модель

In [52]:
%%time
ignore = ['purchase', 'item_id','user_id']
assembler = VectorAssembler(
    inputCols=[x for x in mod_train.columns if x not in ignore],
    outputCol='features')

train_ = (assembler.transform(mod_train).select("purchase", "features"))
test_ = (assembler.transform(mod_test).select('features','item_id','user_id'))

CPU times: user 10.5 ms, sys: 0 ns, total: 10.5 ms
Wall time: 104 ms


In [53]:
%%time
# Inspect the assembled training rows (label + feature vector).
train_.show(10)

+--------+--------------------+
|purchase|            features|
+--------+--------------------+
|       0|[72.0,8.0,2256455...|
|       1|[72.0,2.0,2256455...|
|       0|[72.0,1.0,2256455...|
|       0|[72.0,1.0,2256455...|
|       0|[72.0,1.0,2256455...|
|       0|[72.0,1.0,2256455...|
|       0|[72.0,2.0,2256455...|
|       0|[72.0,3.0,2256455...|
|       0|[72.0,1.0,2256455...|
|       0|[72.0,0.0,2256455...|
+--------+--------------------+
only showing top 10 rows

CPU times: user 6.9 ms, sys: 1.09 ms, total: 7.99 ms
Wall time: 29.1 s


In [54]:
%%time
# Inspect the assembled test rows (feature vector + ids).
test_.show(10)

+--------------------+-------+-------+
|            features|item_id|user_id|
+--------------------+-------+-------+
|[72.0,1.0,2256455...|  73041| 754230|
|[72.0,5.0,2256455...|  74440| 754230|
|[72.0,2.0,2256455...|  74452| 754230|
|[72.0,1.0,2256455...|  93131| 754230|
|[72.0,3.0,2256455...|  93633| 754230|
|[72.0,2.0,2256455...|  95151| 754230|
|[72.0,6.0,2256455...|  10788| 754230|
|[72.0,2.0,2256455...|  11520| 754230|
|[72.0,3.0,2256455...|  73185| 754230|
|[72.0,7.0,2256455...|  87168| 754230|
+--------------------+-------+-------+
only showing top 10 rows

CPU times: user 4.96 ms, sys: 3.16 ms, total: 8.12 ms
Wall time: 27.4 s


In [55]:
%%time
rf_regressor = RandomForestRegressor(labelCol='purchase', numTrees=50, maxDepth=5).fit(train_)

In [56]:
# Feature importances of the fitted forest, indexed in the order of the
# assembler's input columns.
rf_regressor.featureImportances

SparseVector(4, {0: 0.43, 1: 0.4833, 2: 0.0421, 3: 0.0446})

In [57]:
# Score the test pairs; adds a `prediction` column to the test frame.
rf_predictions=rf_regressor.transform(test_)


In [58]:
%%time
# Inspect the first scored rows.
rf_predictions.show(10)

+--------------------+-------+-------+--------------------+
|            features|item_id|user_id|          prediction|
+--------------------+-------+-------+--------------------+
|[72.0,3.0,2256455...|  93486| 754230|0.020759866897381577|
|[72.0,1.0,2256455...|  94819| 754230|0.009885425471353444|
|[72.0,6.0,2256455...|  10788| 754230| 0.03782379359046834|
|[72.0,2.0,2256455...|  11520| 754230|0.014000425464224833|
|[72.0,3.0,2256455...|  73185| 754230|0.020759866897381577|
|[72.0,7.0,2256455...|  87168| 754230| 0.05122601759598174|
|[72.0,1.0,2256455...|  93556| 754230|0.009885425471353444|
|[72.0,0.0,2256455...|  95526| 754230|0.008465410667532291|
|[72.0,3.0,2256455...|  67107| 754230|0.020759866897381577|
|[72.0,3.0,2256455...|  73311| 754230|0.020759866897381577|
+--------------------+-------+-------+--------------------+
only showing top 10 rows



In [59]:
# Expose the model score under the column name `purchase`, which the later
# cells select for the exported file.
rf_predictions = rf_predictions.withColumnRenamed("prediction", "purchase")

In [60]:
# Replace null numeric values (including the `purchase` score) with 0.5.
# NOTE(review): presumably a safety net; verify whether the regressor can
# actually produce null predictions here.
rf_predictions = rf_predictions.na.fill(0.5)

In [61]:
%%time
# Inspect the three columns that go into the submission file.
# NOTE(review): the saved output below lists 20 rows, so it was produced by an
# earlier version of this cell.
rf_predictions.select("user_id","item_id","purchase").show(10)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
| 754230|  11025|0.008465410667532291|
| 754230|  72912|0.008465410667532291|
| 754230|  86406|0.008465410667532291|
| 754230|  88999| 0.03782379359046834|
| 754230|  93487| 0.02222772517609979|
| 754230|  10257|0.020759866897381577|
| 754230|  10264|0.008465410667532291|
| 754230|  92855|0.009885425471353444|
| 754230|  94820|0.014000425464224833|
| 754230|  95751|0.009885425471353444|
| 754230|  98147|0.008465410667532291|
| 754230|  10128| 0.03782379359046834|
| 754230|  73421|0.008465410667532291|
| 754230|  74556|0.014000425464224833|
| 754230|  79417|0.009885425471353444|
| 754230|  92740|0.008465410667532291|
| 754230|  93020|0.008465410667532291|
| 754230|  94243|0.014000425464224833|
| 754230|  94910|0.014000425464224833|
| 754230|   4782|0.009885425471353444|
+-------+-------+--------------------+
only showing top 20 rows



In [62]:
# Order the scored pairs by user, then by item (both ascending) before export.
cols = ['user_id','item_id']
rf_predictions = rf_predictions.sort(*cols, ascending=True)

In [63]:
%%time
# Inspect the first rows of the sorted result.
rf_predictions.select("user_id","item_id","purchase").show(10)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654|    336|6.153590400224896E-4|
|   1654|    678|6.153590400224896E-4|
|   1654|    691|6.153590400224896E-4|
|   1654|    696|6.792772889960508E-4|
|   1654|    763|6.306601439152282E-4|
|   1654|    795|0.003518868826408...|
|   1654|    861|6.306601439152282E-4|
|   1654|   1137|8.863862399642905E-4|
|   1654|   1159|6.792772889960508E-4|
|   1654|   1428|6.306601439152282E-4|
+-------+-------+--------------------+
only showing top 10 rows

CPU times: user 3.07 ms, sys: 6.52 ms, total: 9.58 ms
Wall time: 34.8 s


In [64]:
%%time
rf_predictions.select("user_id","item_id","purchase").toPandas().to_csv("lab03.csv",header = True)

CPU times: user 31.3 s, sys: 806 ms, total: 32.1 s
Wall time: 54.7 s


In [33]:
# Shut down the SparkContext behind the session, ending the application.
sc.stop()