In [None]:
import glob
import os
import sys

# Launch options for the PySpark driver/executors, the Python interpreter used
# by the workers, and the Spark installation this notebook runs against.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    # Only reachable if the assignment above is edited to an empty value.
    raise ValueError('SPARK_HOME environment variable is not set')

spark_python = os.path.join(spark_home, 'python')

# Discover the py4j archive bundled with this Spark build instead of pinning
# one version; fall back to the previously hard-coded name when nothing is
# found (e.g. the installation is not mounted on this machine).
py4j_archives = sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip')))
if py4j_archives:
    py4j_path = py4j_archives[-1]
else:
    py4j_path = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Make `pyspark` and `py4j` importable; py4j ends up first on sys.path.
sys.path.insert(0, spark_python)
sys.path.insert(0, py4j_path)

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration in one expression; only the application name is set.
conf = SparkConf().set("spark.app.name", "Sergey Grishaev ALS app")

# Obtain the session for this configuration (an existing one is reused).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output to confirm it was created.
spark

## Наша цель — представить матрицу user-item ratings как произведение двух матриц меньшего ранга
## $$R = U \times P^\top $$ 
## где
## $$U \in \mathbb{R}^{n \times k}, k \ll n$$
## и 
## $$P \in \mathbb{R}^{m \times k}, k \ll m$$

## Как найти решение? Оптимизировать следующий функционал:
## $$J = \|R - U \times P^\top\|_F^2 + \lambda(\|U\|_F^2 + \|P\|_F^2)$$

## Как это сделать?
![GD](pics/gradient_descent.png)

## Возникают 2 проблемы:
+ кол-во оптимизируемых параметров $n \times k + m \times k$
+ этот функционал non-convex (https://www.quora.com/Why-is-the-matrix-factorization-optimization-function-in-recommender-systems-not-convex)

## Что же делать? ALS (alternating least squares)
## обычный Least Squares
## $$J(\beta) = \|y - X\beta\|_2$$
## $$\beta = (X^\top X)^{-1}X^\top y$$

## ALS это 2-х шаговый итеративный процесс
## $$ \forall u_i : J(u_i) = \|R_i - u_i \times P^\top\|_2^2 + \lambda\|u_i\|_2^2$$
## $$ \forall p_j : J(p_j) = \|R_j - U \times p^{\top}_{j}\|_2^2 + \lambda\|p_j\|_2^2$$
## Решение следующее
## $$u_i = (P^\top \times P + \lambda I)^{-1} \times P^\top \times R_i$$
## $$p_j = (U^\top \times U + \lambda I)^{-1} \times U^\top \times R_j$$

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

In [None]:
# Column layout of the ml-100k rating files: three integer columns followed by
# a long timestamp. Every field is nullable (the default).
schema = (
    StructType()
    .add("user", IntegerType())
    .add("item", IntegerType())
    .add("rating", IntegerType())
    .add("timestamp", LongType())
)

In [None]:
# Training ratings: read the tab-separated file with the explicit schema, drop
# the unused timestamp, spread the rows over 20 partitions and cache them.
dataset = (
    spark.read
    .csv("/lectures/lecture02/data/ml-100k/ua.base", schema=schema, sep="\t")
    .drop("timestamp")
    .repartition(20)
    .cache()
)

In [None]:
# Preview the first 5 rows of the training DataFrame.
dataset.show(5)

In [None]:
# Check how many partitions the training DataFrame has after repartition(20).
dataset.rdd.getNumPartitions()

In [None]:
from pyspark.ml.recommendation import ALS

In [None]:
# ALS estimator: 10 latent factors, 5 alternating iterations, fixed seed so the
# factor initialisation is repeatable. Column names are left at their defaults.
als = ALS(
    seed=5757,
    rank=10,
    maxIter=5,
)

In [None]:
# Fit the factorisation model on the training ratings.
model = als.fit(dataset)

In [None]:
# Show the rank (number of latent factors) of the fitted model.
model.rank

In [None]:
# Held-out ratings: same format and schema as the training file, without the
# timestamp, in 4 partitions, cached for repeated scoring.
test = (
    spark.read
    .csv("/lectures/lecture02/data/ml-100k/ua.test", schema=schema, sep="\t")
    .drop("timestamp")
    .repartition(4)
    .cache()
)

In [None]:
# Score the held-out ratings with the fitted model.
predictions = model.transform(test)

In [None]:
# Preview the first 5 scored rows.
predictions.show(5)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# RMSE between the true "rating" column and the evaluator's default
# prediction column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
)

In [None]:
# RMSE on the raw test predictions, before any NaN handling.
evaluator.evaluate(predictions)

In [None]:
# Count of scored test rows per true rating value.
predictions.groupBy("rating").count().show()

In [None]:
import pyspark.sql.functions as f

In [None]:
# How many test rows received a NaN prediction?
predictions.where(f.isnan("prediction")).count()

In [None]:
# Bring the rows with NaN predictions to the driver for inspection.
predictions.where(f.isnan("prediction")).collect()

In [None]:
# Training ratings that belong to user 675.
dataset.where(dataset["user"] == 675).show()

In [None]:
# Training ratings for item 1653.
dataset.where(dataset["item"] == 1653).show()

In [None]:
# Training ratings for item 1582.
dataset.where(dataset["item"] == 1582).show()

In [None]:
# Discard scored rows that contain a null or NaN value in any column.
predictions = predictions.na.drop()

In [None]:
# RMSE again, now that rows with missing/NaN values were dropped.
evaluator.evaluate(predictions)

## Что делать с cold start в Spark?!

In [None]:
# Refit with the cold-start strategy overridden to "drop" for this fit only;
# the `als` estimator itself keeps its own settings.
model = als.fit(dataset, {als.coldStartStrategy: "drop"})

In [None]:
# Check which cold-start strategy the fitted model reports.
model.getOrDefault("coldStartStrategy")

In [None]:
# Score the held-out ratings with the model fitted using coldStartStrategy="drop".
predictions = model.transform(test)

In [None]:
# RMSE of the current predictions on the held-out ratings.
evaluator.evaluate(predictions)

## Можем ли мы лучше?

In [None]:
# Same estimator, overriding the iteration count to 20 and keeping the
# "drop" cold-start strategy.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
    },
)

In [None]:
# Score the held-out ratings with the most recently fitted model.
predictions = model.transform(test)

In [None]:
# RMSE of the current predictions on the held-out ratings.
evaluator.evaluate(predictions)

## А еще лучше?

In [None]:
# Same estimator, now with 20 iterations and rank raised to 100, still
# dropping cold-start rows.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
        als.rank: 100,
    },
)

In [None]:
# Score the held-out ratings with the most recently fitted model.
predictions = model.transform(test)

In [None]:
# RMSE of the current predictions on the held-out ratings.
evaluator.evaluate(predictions)

## Заглянем внутрь?

In [None]:
# Top-5 recommended users per item; look at the first 5 items returned.
model.recommendForAllItems(5).take(5)

In [None]:
# Top-5 recommended items per user; look at the first 5 users returned.
model.recommendForAllUsers(5).take(5)

In [None]:
# Inspect the first 5 rows of the learned item-factor table.
model.itemFactors.take(5)

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()