In [1]:
import os
import sys
# Tell PySpark how to launch: submit arguments (3 executors), the Python
# interpreter to use, and the location of the Spark distribution.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend Spark's bundled Python sources to sys.path: the py4j archive ends up
# first, immediately followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Configuration carrying only the application name (stored as spark.app.name).
conf = SparkConf().setAppName("natasha pritykovskaya ALS app")

# Obtain a session built from that configuration; getOrCreate() hands back an
# already-running session when one exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [2]:
# Show the active SparkSession as the cell output.
spark

![ALS](pics/mf.png)

## Наша цель представить матрицу user-item ratings как произведение двух матриц меньшего ранга
## $$R = U \times P^\top $$ 
## где
## $$U \in \mathbb{R}^{n \times k}, k \ll n$$
## и 
## $$P \in \mathbb{R}^{m \times k}, k \ll m$$

## Как найти решение? Оптимизировать следующий функционал:
## $$J = \|R - U \times P^\top\|_F^2 + \lambda(\|U\|_F^2 + \|P\|_F^2)$$

## Как это сделать?
![GD](pics/gradient_descent.jpeg)

## Возникают 2 проблемы:
+ кол-во оптимизируемых параметров велико: $n \times k + m \times k$
+ этот функционал non-convex (https://www.quora.com/Why-is-the-matrix-factorization-optimization-function-in-recommender-systems-not-convex)

## Что же делать? ALS (alternating least squares)
## обычный Least Squares
## $$J(\beta) = \|y - X\beta\|_2$$
## $$\beta = (X^\top X)^{-1}X^\top y$$

## ALS это 2-х шаговый итеративный процесс
## $$ \forall u_i : J(u_i) = \|R_i - u_i \times P^\top\|_2^2 + \lambda\|u_i\|_2^2$$
## $$ \forall p_j : J(p_j) = \|R_j - U \times p^{\top}_{j}\|_2^2 + \lambda\|p_j\|_2^2$$
## Решение следующее
## $$u_i = (P^\top \times P + \lambda I)^{-1} \times P^\top \times R_i$$
## $$p_j = (U^\top \times U + \lambda I)^{-1} \times U^\top \times R_j$$

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

In [4]:
# Explicit schema for the ratings files: three integer columns followed by a
# long timestamp, in file column order.
schema = StructType(
    [StructField(column, IntegerType()) for column in ("user", "item", "rating")]
    + [StructField("timestamp", LongType())]
)

In [5]:
# Training split: read the tab-separated ratings with the explicit schema, drop
# the timestamp column, spread the rows over 20 partitions and cache them,
# since the frame is reused by several fits.
dataset = (
    spark.read
    .csv("/lectures/lecture02/data/ml-100k/ua.base", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(20)
    .cache()
)

In [6]:
# Peek at the first few training rows to confirm the columns look as expected.
dataset.show(5)

+----+----+------+
|user|item|rating|
+----+----+------+
| 297|   1|     3|
| 119| 924|     4|
| 373| 230|     4|
| 900| 508|     3|
|  13| 828|     1|
+----+----+------+
only showing top 5 rows



In [7]:
# Sanity check: the partition count should match the repartition(20) applied at load time.
dataset.rdd.getNumPartitions()

20

In [8]:
from pyspark.ml.recommendation import ALS

In [9]:
# Alternating-least-squares estimator. The user/item/rating column names are
# not passed, so the estimator's defaults are used.
als = ALS(
    rank=10,     # number of latent factors
    maxIter=5,   # iterations of the alternating procedure
    seed=5757,   # fixed seed so repeated runs are comparable
)

In [10]:
# Factorise the training ratings with the settings configured on `als`.
model = als.fit(dataset)

In [11]:
# Rank of the fitted factorisation; expected to equal the rank given to ALS.
model.rank

10

In [12]:
# Held-out split: loaded the same way as the training data, but spread over
# only 4 partitions.
test = (
    spark.read
    .csv("/lectures/lecture02/data/ml-100k/ua.test", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(4)
    .cache()
)

In [13]:
# Score the held-out ratings; transform() appends a `prediction` column.
predictions = model.transform(test)

In [14]:
# Compare a few true ratings with the predicted ones.
predictions.show(5)

+----+----+------+----------+
|user|item|rating|prediction|
+----+----+------+----------+
| 251| 148|     2|  3.038124|
| 580| 148|     4| 3.0317912|
| 602| 148|     4| 3.5022647|
| 372| 148|     5| 3.8346112|
| 274| 148|     2| 3.3601744|
+----+----+------+----------+
only showing top 5 rows



In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

In [16]:
# RMSE between the true `rating` column and the evaluator's default prediction
# column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    metricName="rmse",
)

In [17]:
# RMSE over the test predictions. NOTE: at this point the result is NaN, because
# a few rows carry a NaN prediction (investigated in the following cells).
evaluator.evaluate(predictions)

nan

![wtf](pics/Jackie-Chan-WTF.jpg)

In [18]:
# Distribution of the true ratings in the scored test set; rules out bad labels
# as the source of the NaN metric.
predictions.groupBy("rating").count().show()

+------+-----+
|rating|count|
+------+-----+
|     1|  542|
|     3| 2424|
|     5| 2153|
|     4| 3316|
|     2|  995|
+------+-----+



In [19]:
import pyspark.sql.functions as f

In [20]:
# How many test rows received NaN instead of a numeric prediction?
predictions.filter(f.isnan("prediction")).count()

2

In [21]:
# Bring the offending rows to the driver to see which user/item pairs they are.
predictions.filter(f.isnan("prediction")).collect()

[Row(user=675, item=1653, rating=5, prediction=nan),
 Row(user=405, item=1582, rating=1, prediction=nan)]

In [22]:
# Is user 675 (from the first NaN row) present in the training data?
dataset.filter(dataset.user == 675).show()

+----+----+------+
|user|item|rating|
+----+----+------+
| 675| 235|     1|
| 675| 311|     3|
| 675| 937|     1|
| 675| 258|     3|
| 675| 900|     4|
| 675| 242|     4|
| 675| 750|     4|
| 675|  86|     4|
| 675| 272|     3|
| 675| 244|     3|
| 675| 318|     5|
| 675| 891|     2|
| 675| 344|     4|
| 675|1628|     5|
| 675|1007|     4|
| 675| 303|     5|
| 675| 223|     1|
| 675| 427|     5|
| 675| 286|     4|
| 675| 896|     5|
+----+----+------+
only showing top 20 rows



In [23]:
# Is item 1653 present in the training data? An empty result means the model
# never saw this item, so it has nothing to predict from (cold start).
dataset.filter(dataset.item == 1653).show()

+----+----+------+
|user|item|rating|
+----+----+------+
+----+----+------+



In [24]:
# Same check for item 1582, the item of the second NaN row.
dataset.filter(dataset.item == 1582).show()

+----+----+------+
|user|item|rating|
+----+----+------+
+----+----+------+



In [25]:
# Discard only the rows whose prediction is null/NaN (the cold-start items found
# above). Restricting dropna() to this column keeps rows that merely have a
# null in some other column.
predictions = predictions.dropna(subset=["prediction"])

In [26]:
# Re-evaluate now that the NaN rows are gone; the RMSE should be a finite number.
evaluator.evaluate(predictions)

0.9590533627741923

## Что делать с cold start в Spark?!

In [27]:
# Refit with coldStartStrategy="drop" so that transform() itself omits rows it
# cannot score, instead of emitting NaN predictions that must be removed by hand.
model = als.fit(dataset, params={als.coldStartStrategy: "drop"})

In [28]:
# Confirm that the override reached the fitted model.
model.getOrDefault("coldStartStrategy")

'drop'

In [29]:
# Score the test set again, this time with the cold-start-aware model.
predictions = model.transform(test)

In [30]:
# No manual dropna() this time; the RMSE should equal the value obtained after
# dropping the NaN rows by hand.
evaluator.evaluate(predictions)

0.9590533627741923

## Можем ли мы лучше?

In [31]:
# Same estimator, overriding two parameters for this fit only: keep the
# cold-start handling and raise the iteration count from 5 to 20.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
    },
)

In [32]:
# Score the test set with the model trained for 20 iterations.
predictions = model.transform(test)

In [33]:
# RMSE of the 20-iteration model, to compare against the 5-iteration baseline.
evaluator.evaluate(predictions)

0.9558872242636991

## А еще лучше?

In [34]:
# Same estimator again, now also raising the number of latent factors from
# 10 to 100 on top of the previous overrides.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
        als.rank: 100,
    },
)

In [35]:
# Score the test set with the rank-100 model.
predictions = model.transform(test)

In [36]:
# RMSE of the rank-100 model, to compare against the rank-10 runs.
evaluator.evaluate(predictions)

0.9475039684939921

## Заглянем внутрь?

In [37]:
# For each item, the 5 users with the highest predicted rating (first 5 items shown).
model.recommendForAllItems(5).take(5)

[Row(item=1580, recommendations=[Row(user=405, rating=0.9766721129417419), Row(user=507, rating=0.8202009201049805), Row(user=38, rating=0.8191449046134949), Row(user=175, rating=0.8150184154510498), Row(user=388, rating=0.7907520532608032)]),
 Row(item=471, recommendations=[Row(user=939, rating=4.705004692077637), Row(user=907, rating=4.694290637969971), Row(user=357, rating=4.665385723114014), Row(user=849, rating=4.568941116333008), Row(user=532, rating=4.490878582000732)]),
 Row(item=1591, recommendations=[Row(user=519, rating=4.819264888763428), Row(user=440, rating=4.734762668609619), Row(user=688, rating=4.221287727355957), Row(user=427, rating=3.889539957046509), Row(user=260, rating=3.769451379776001)]),
 Row(item=1342, recommendations=[Row(user=662, rating=3.8292653560638428), Row(user=849, rating=3.3050215244293213), Row(user=212, rating=3.2683603763580322), Row(user=369, rating=3.1982271671295166), Row(user=157, rating=3.150738000869751)]),
 Row(item=463, recommendations=[R

In [38]:
# For each user, the 5 items with the highest predicted rating (first 5 users shown).
model.recommendForAllUsers(5).take(5)

[Row(user=471, recommendations=[Row(item=932, rating=4.59334659576416), Row(item=8, rating=4.515803337097168), Row(item=422, rating=4.430621147155762), Row(item=102, rating=4.418323040008545), Row(item=465, rating=4.4073286056518555)]),
 Row(user=463, recommendations=[Row(item=19, rating=4.3579511642456055), Row(item=887, rating=4.278357982635498), Row(item=1449, rating=4.222947597503662), Row(item=221, rating=4.208576679229736), Row(item=253, rating=4.199032783508301)]),
 Row(user=833, recommendations=[Row(item=1597, rating=4.580961227416992), Row(item=1019, rating=4.381778240203857), Row(item=1187, rating=4.379821300506592), Row(item=589, rating=4.34868049621582), Row(item=488, rating=4.308185577392578)]),
 Row(user=496, recommendations=[Row(item=56, rating=4.302997589111328), Row(item=320, rating=4.296531677246094), Row(item=42, rating=4.181881427764893), Row(item=921, rating=4.170877456665039), Row(item=1240, rating=4.105301856994629)]),
 Row(user=148, recommendations=[Row(item=169

In [39]:
# Inspect the learned item factors: one `features` vector per item id.
model.itemFactors.take(5)

[Row(id=10, features=[-0.17320072650909424, 0.13949139416217804, 0.08175767958164215, -0.02062435820698738, 0.12047522515058517, 0.08650190383195877, -0.05590004846453667, 0.6714649796485901, -0.004701962228864431, -0.2840914726257324, -0.2685687839984894, -0.33872973918914795, -0.17306829988956451, 0.6884441375732422, -0.01208356861025095, 0.5139780640602112, -0.16328637301921844, -0.10118523985147476, -0.02909262850880623, -0.1980876475572586, -0.21409818530082703, 0.10442737489938736, 0.05759561434388161, 0.3170529305934906, -0.10258181393146515, -0.2047143429517746, 0.06586514413356781, 0.24610474705696106, 0.2574028968811035, -0.11565375328063965, 0.2906339168548584, 0.09016047418117523, -0.011517101898789406, -0.37064695358276367, -0.08438008278608322, -0.3516891896724701, -0.11993979662656784, 0.5081120729446411, -0.3401847779750824, 0.35814446210861206, -0.055438581854104996, 0.013906936161220074, -0.062037624418735504, 0.02051541954278946, 0.08401115983724594, -0.1073991358280

In [41]:
# Shut the session down once the analysis is finished.
spark.stop()