In [2]:
import joblib
import numpy as np
from dask.distributed import Client
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Group 7 - Recommendation System')\
.config('spark.sql.execution.arrow.pyspark.enabled', True)\
.config('spark.driver.memory','8G')\
.config('spark.ui.showConsoleProgress', True)\
.config('spark.sql.repl.eagerEval.enabled', True)\
.getOrCreate()

client = Client()

In [3]:
# Data is downloaded from https://www.kaggle.com/bandikarthik/movie-recommendation-system
movies = spark.read.csv('../MovieLens/movie.csv', header=True, inferSchema=True)
ratings = spark.read.csv('../MovieLens/rating.csv',  header=True, inferSchema=True)

In [8]:
!pip install git+https://github.com/gbolmier/funk-svd

Collecting git+https://github.com/gbolmier/funk-svd
  Cloning https://github.com/gbolmier/funk-svd to c:\users\zas\appdata\local\temp\pip-req-build-1hn_yk1s
Collecting numba>=0.38.0
  Using cached numba-0.53.1-cp39-cp39-win_amd64.whl (2.3 MB)


  Running command git clone -q https://github.com/gbolmier/funk-svd 'C:\Users\zas\AppData\Local\Temp\pip-req-build-1hn_yk1s'
ERROR: Package 'funk-svd' requires a different Python: 3.9.5 not in '<=3.9.1,>=3.6.5'


In [9]:
import pandas as pd
from funk_svd import SVD

with joblib.parallel_backend('dask'):
    movies_df = movies.toPandas()
    rating_df = ratings.toPandas()

ModuleNotFoundError: No module named 'funk_svd'

In [4]:
rating_df.columns = ['u_id', 'i_id', 'rating', 'timestamps']
movies_df.columns = ['i_id', 'title', 'genres']
rating_df

Unnamed: 0,u_id,i_id,rating,timestamps
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# movielens18.drop(columns = 'timestamp', inplace = True)

with joblib.parallel_backend('dask'):
  train = rating_df.sample(frac=0.8)
  val = rating_df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
  test = rating_df.drop(train.index.tolist()).drop(val.index.tolist())

In [6]:
lr, reg, factors = (0.01, 0.03, 90)

with joblib.parallel_backend('dask'):
  svd = SVD(lr=lr, reg=reg, n_epochs=20, n_factors=factors,
            min_rating=0.5, max_rating=5)
  svd.fit(X=train, X_val=val)

pred = svd.predict(test)
mae = mean_absolute_error(test["rating"], pred)
rmse = np.sqrt(mean_squared_error(test["rating"], pred))
print("Test MAE:  {:.2f}".format(mae))
print("Test RMSE: {:.2f}".format(rmse))
print('{} factors, {} lr, {} reg'.format(factors, lr, reg))

Preprocessing data...

Preprocessing data...

Epoch 1/20  | val_loss: 0.76 - val_rmse: 0.87 - val_mae: 0.67 - took 7.1 sec
Epoch 2/20  | val_loss: 0.74 - val_rmse: 0.86 - val_mae: 0.66 - took 6.4 sec
Epoch 3/20  | val_loss: 0.71 - val_rmse: 0.84 - val_mae: 0.65 - took 6.3 sec
Epoch 4/20  | val_loss: 0.69 - val_rmse: 0.83 - val_mae: 0.64 - took 6.1 sec
Epoch 5/20  | val_loss: 0.67 - val_rmse: 0.82 - val_mae: 0.63 - took 6.3 sec
Epoch 6/20  | val_loss: 0.66 - val_rmse: 0.81 - val_mae: 0.62 - took 6.4 sec
Epoch 7/20  | val_loss: 0.65 - val_rmse: 0.80 - val_mae: 0.62 - took 6.2 sec
Epoch 8/20  | 

IOStream.flush timed out


val_loss: 0.64 - val_rmse: 0.80 - val_mae: 0.61 - took 6.5 sec
Epoch 9/20  | val_loss: 0.63 - val_rmse: 0.79 - val_mae: 0.61 - took 6.3 sec
Epoch 10/20 | val_loss: 0.63 - val_rmse: 0.79 - val_mae: 0.60 - took 6.2 sec
Epoch 11/20 | val_loss: 0.62 - val_rmse: 0.79 - val_mae: 0.60 - took 6.2 sec
Epoch 12/20 | val_loss: 0.62 - val_rmse: 0.79 - val_mae: 0.60 - took 6.0 sec
Epoch 13/20 | val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.60 - took 6.0 sec
Epoch 14/20 | 

IOStream.flush timed out


val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.60 - took 6.0 sec
Epoch 15/20 | val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.60 - took 6.0 sec
Epoch 16/20 | val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.59 - took 5.9 sec
Epoch 17/20 | val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.59 - took 6.0 sec
Epoch 18/20 | val_loss: 0.61 - val_rmse: 0.78 - val_mae: 0.59 - took 6.5 sec
Epoch 19/20 | val_loss: 0.60 - val_rmse: 0.78 - val_mae: 0.59 - took 6.5 sec
Epoch 20/20 | 

IOStream.flush timed out


val_loss: 0.60 - val_rmse: 0.78 - val_mae: 0.59 - took 6.5 sec

Training took 2 min and 15 sec
Test MAE:  0.59
Test RMSE: 0.78
90 factors, 0.01 lr, 0.03 reg


In [7]:
n_m = len(rating_df.i_id.unique())

#  Initialize my ratings
my_ratings = np.zeros(n_m)


my_ratings[4993] = 5
my_ratings[1080] = 5
my_ratings[260] = 5
my_ratings[4896] = 5
my_ratings[1196] = 5
my_ratings[1210] = 5
my_ratings[2628] = 5
my_ratings[5378] = 5

print('User ratings:')
print('-----------------')

for i, val in enumerate(my_ratings):
    if val > 0:
        print('Rated %d stars: %s' % (val, movies_df.loc[movies_df.i_id==i].title.values))

User ratings:
-----------------
Rated 5 stars: ['Star Wars: Episode IV - A New Hope (1977)']
Rated 5 stars: ["Monty Python's Life of Brian (1979)"]
Rated 5 stars: ['Star Wars: Episode V - The Empire Strikes Back (1980)']
Rated 5 stars: ['Star Wars: Episode VI - Return of the Jedi (1983)']
Rated 5 stars: ['Star Wars: Episode I - The Phantom Menace (1999)']
Rated 5 stars: ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)"]
Rated 5 stars: ['Lord of the Rings: The Fellowship of the Ring, The (2001)']
Rated 5 stars: ['Star Wars: Episode II - Attack of the Clones (2002)']


In [8]:

print("Adding your recommendations!")
items_id = [item[0] for item in np.argwhere(my_ratings>0)]
ratings_list = my_ratings[np.where(my_ratings>0)]
user_id = np.asarray([0] * len(ratings_list))

user_ratings = pd.DataFrame(list(zip(user_id, items_id, ratings_list)), columns=['u_id', 'i_id', 'rating'])



Adding your recommendations!


In [9]:
try:
    rating_df = rating_df.drop(columns=['timestamps'])
except:
    pass
data_with_user = rating_df.append(user_ratings, ignore_index=True)



with joblib.parallel_backend('dask'):
  train_user = data_with_user.sample(frac=0.8)
  val_user = data_with_user.drop(train_user.index.tolist()).sample(frac=0.5, random_state=8)
  test_user = data_with_user.drop(train_user.index.tolist()).drop(val_user.index.tolist())



In [10]:
from itertools import product


def funk_svd_predict(userID, data_with_user, movies_df):
    userID = [userID]

    # all_users = data_with_user.u_id.unique()
    all_movies = data_with_user.i_id.unique()
    recommendations = pd.DataFrame(list(product(userID, all_movies)), columns=['u_id', 'i_id'])

    #Getting predictions for the selected userID
    pred_train = svd.predict(recommendations)
    recommendations['prediction'] = pred_train
    recommendations.head(10)

    sorted_user_predictions = recommendations.sort_values(by='prediction', ascending=False)

    user_ratings = data_with_user[data_with_user.u_id == userID[0]]
    user_ratings.columns = ['u_id',	'i_id', 'rating']
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = movies_df[~movies_df['i_id'].isin(user_ratings['i_id'])].\
        merge(pd.DataFrame(sorted_user_predictions).reset_index(drop=True), how = 'inner', left_on = 'i_id', right_on = 'i_id').\
        sort_values(by='prediction', ascending = False)#.drop(['i_id'],axis=1)

    rated_df = movies_df[movies_df['i_id'].isin(user_ratings['i_id'])].\
        merge(pd.DataFrame(data_with_user).reset_index(drop=True), how = 'inner', left_on = 'i_id', right_on = 'i_id')
    rated_df = rated_df.loc[rated_df.u_id==userID[0]].sort_values(by='rating', ascending = False)
    
    return recommendations, rated_df


In [23]:
ratings.join(movies, on='movieId').filter('userId = 1') \
.sort('rating', ascending=False).limit(10)

                                                                                

movieId,userId,rating,timestamp,title,genres
4993,1,5.0,2005-04-02 23:31:22,Lord of the Rings...,Adventure|Fantasy
5952,1,5.0,2005-04-02 23:30:19,Lord of the Rings...,Adventure|Fantasy
7153,1,5.0,2005-04-02 23:30:33,Lord of the Rings...,Action|Adventure|...
8507,1,5.0,2004-09-10 03:13:47,Freaks (1932),Crime|Drama|Horror
1198,1,4.5,2005-04-02 23:30:24,Raiders of the Lo...,Action|Adventure
1196,1,4.5,2005-04-02 23:32:22,Star Wars: Episod...,Action|Adventure|...
8636,1,4.5,2005-04-02 23:44:53,Spider-Man 2 (2004),Action|Adventure|...
296,1,4.0,2005-04-02 23:32:47,Pulp Fiction (1994),Comedy|Crime|Dram...
318,1,4.0,2005-04-02 23:33:18,Shawshank Redempt...,Crime|Drama
541,1,4.0,2005-04-02 23:30:03,Blade Runner (1982),Action|Sci-Fi|Thr...


In [19]:
## Recommend for user 1
recommendations, rated_df = funk_svd_predict(0, rating_df, movies_df)
recommendations.head(10)

Unnamed: 0,i_id,title,genres,u_id,prediction
20471,100553,Frozen Planet (2011),Documentary,0,4.490969
315,318,"Shawshank Redemption, The (1994)",Crime|Drama,0,4.404353
23694,113315,Zero Motivation (Efes beyahasei enosh) (2014),Comedy|Drama,0,4.363161
17571,88570,Welfare (1975),Documentary,0,4.325765
8525,25975,"Life of Oharu, The (Saikaku ichidai onna) (1952)",Drama,0,4.306362
20852,102217,Bill Hicks: Revelations (1993),Comedy,0,4.301948
20747,101850,Death on the Staircase (Soupçons) (2004),Crime|Documentary,0,4.278176
19180,95604,The War (2007),Documentary|War,0,4.269997
16793,85012,"Given Word, The (O Pagador de Promessas) (1962)",Drama,0,4.261922
49,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0,4.259476
