<a href="https://colab.research.google.com/github/dinhhungGM/RecommendationSystemUsingBigData/blob/main/Recommendation_System_BIGDATA_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [5]:
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

In [6]:
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

In [7]:
!pip install -q findspark py4j


In [8]:
!pip install pandas --upgrade

Requirement already up-to-date: pandas in /usr/local/lib/python3.7/dist-packages (1.3.0)


# Approach with Spark

In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from pyspark.sql.functions import col, explode
from pyspark import SparkContext

from pyspark.sql import SparkSession
sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Group 7 - Recommendation System').config('spark.sql.execution.arrow.pyspark.enabled', True) \
.config('spark.driver.memory','8G') \
.config('spark.ui.showConsoleProgress', True) \
.config('spark.sql.repl.eagerEval.enabled', True) \
.config('spark.sql.pivotMaxValues', 100000000)\
.getOrCreate()

In [3]:
# Data is downloaded from https://www.kaggle.com/bandikarthik/movie-recommendation-system
movies = spark.read.csv('drive/MyDrive/BigDataProject/movies.csv', header=True, inferSchema=True)
ratings = spark.read.csv('drive/MyDrive/BigDataProject/ratings.csv',  header=True, inferSchema=True)

In [32]:
movies.limit(5).show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+



# Calculating sparsity of data

In [33]:
numerator = ratings.select("Rating").count()

# Count the number of distinct userIds and distinct movieIds
unique_users = ratings.select("UserID").distinct().count()
unique_movies = ratings.select("MovieID").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = unique_users * unique_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings data is ", "%.2f" % sparsity + "% empty.")

The ratings data is  99.73% empty.


# Implementing ALS(Alternating Least Square) algorithm in Spark

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [5]:
(trainData, testData) = ratings.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False
          , coldStartStrategy="drop")

In [6]:
%%time
param_grid = ParamGridBuilder() \
.addGrid(als.rank, [14]) \
.addGrid(als.maxIter, [5]) \
.addGrid(als.regParam, [.01]) \
.build()


# rank is the number of latent factors in the model (defaults to 10).
# maxIter is the maximum number of iterations to run (defaults to 10).
# regParam specifies the regularization parameter in ALS (defaults to 1.0).


evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
cv = CrossValidator(estimator=als,
                            estimatorParamMaps=param_grid,
                            evaluator=evaluator,
                            numFolds=3) 

model = cv.fit(trainData)


CPU times: user 4.6 s, sys: 584 ms, total: 5.19 s
Wall time: 13min 16s


In [7]:
best_model = model.bestModel
predictions = best_model.transform(testData)
rmse = evaluator.evaluate(predictions)
print(f"Root mean square error: {rmse}")
print("====BEST MODEL ====")
print(f"BEST RANK: {best_model.rank}")
print(f"maxIter: {best_model._java_obj.parent().getMaxIter()}")
print(f"regParam: {best_model._java_obj.parent().getRegParam()}")

Root mean square error: 0.8435874393514843
====BEST MODEL ====
BEST RANK: 14
maxIter: 5
regParam: 0.01


In [8]:
predictions

userId,movieId,rating,timestamp,prediction
137389,148,3.0,830778220,1.9708189
90446,148,2.0,941903738,2.901287
77231,148,2.0,1030400425,1.8130058
224425,148,3.0,837811440,2.939558
236731,148,3.0,843889974,3.0693476
246361,148,3.0,834673760,2.570979
136989,148,1.0,833673768,3.275115
135040,148,5.0,958498293,4.768713
187508,148,2.0,874577512,3.3299313
204347,148,3.0,834040799,4.3313537


# Movie Recommendation

In [9]:
# Generate top 10 movie recommendations for each user
# Recommend Film based on Users
# Output will be movieId and rating
recommendations = best_model.recommendForAllUsers(10)
recommendations.limit(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[{144708, 23.4303...|
|   463|[{144708, 22.5578...|
|   471|[{107516, 24.2774...|
|   496|[{107516, 23.2242...|
|   833|[{144708, 24.6131...|
|  1088|[{107516, 24.0861...|
|  1238|[{107516, 17.8833...|
|  1342|[{130576, 15.7069...|
|  1580|[{144708, 21.0237...|
|  1591|[{144708, 17.6400...|
+------+--------------------+



### 7th User’s Actual Preference:

In [19]:
ratings.join(movies, on='movieId').filter('userId = 7') \
.sort('rating', ascending=False).limit(10).show()

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
| 134853|     7|   5.0|1451817861|   Inside Out (2015)|Animation|Childre...|
|   4226|     7|   5.0|1451817880|      Memento (2000)|    Mystery|Thriller|
|   5618|     7|   5.0|1451817882|Spirited Away (Se...|Adventure|Animati...|
|  58559|     7|   5.0|1451817836|Dark Knight, The ...|Action|Crime|Dram...|
|  79132|     7|   5.0|1451817881|    Inception (2010)|Action|Crime|Dram...|
| 109487|     7|   5.0|1451817912| Interstellar (2014)|         Sci-Fi|IMAX|
| 122886|     7|   5.0|1451817862|Star Wars: Episod...|Action|Adventure|...|
| 134130|     7|   5.0|1451817860| Martian, The (2015)|Action|Adventure|...|
|   3147|     7|   4.5|1451817855|Green Mile, The (...|         Crime|Drama|
|   1196|     7|   4.0|1451817837|Star Wars: Episod...|Action|Adventure|...|

### 7th User’s ALS Recommentions

In [28]:
recommendations = recommendations.withColumn("rec_exp", explode("recommendations")).select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))
recommendations.join(movies, on='movieId').filter('userId = 7').show()

AnalysisException: ignored

# Approach with Dask

# KNN

In [21]:
!pip install "dask[complete]"

Collecting fsspec>=0.6.0; extra == "complete"
[?25l  Downloading https://files.pythonhosted.org/packages/40/e1/7111d8afc76ee3171f4f99592cd29bac9d233ae1aa34623011506f955434/fsspec-2021.7.0-py3-none-any.whl (118kB)
[K     |████████████████████████████████| 122kB 26.0MB/s 
Collecting distributed>=2.0; extra == "complete"
[?25l  Downloading https://files.pythonhosted.org/packages/31/8b/0d704fdaa170a05797057c4676ceb9f53e139111b9b37f53e90a62c4c770/distributed-2021.7.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 25.1MB/s 
Collecting partd>=0.3.10; extra == "complete"
  Downloading https://files.pythonhosted.org/packages/41/94/360258a68b55f47859d72b2d0b2b3cfe0ca4fbbcb81b78812bd00ae86b7c/partd-1.2.0-py3-none-any.whl
Collecting locket
  Downloading https://files.pythonhosted.org/packages/50/b8/e789e45b9b9c2db75e9d9e6ceb022c8d1d7e49b2c085ce8c05600f90a96b/locket-0.2.1-py2.py3-none-any.whl
[31mERROR: distributed 2021.7.0 has requirement cloudpickle>=1.5.0, but you'

In [22]:
import joblib
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [23]:
!python -m pip install dask distributed --upgrade

Collecting dask
[?25l  Downloading https://files.pythonhosted.org/packages/a9/22/c99e0377c0b8d4679ae93d7d495349333c8b1455938c528eb9f66b850b04/dask-2021.7.0-py3-none-any.whl (977kB)
[K     |▍                               | 10kB 13.8MB/s eta 0:00:01[K     |▊                               | 20kB 18.6MB/s eta 0:00:01[K     |█                               | 30kB 18.1MB/s eta 0:00:01[K     |█▍                              | 40kB 19.2MB/s eta 0:00:01[K     |█▊                              | 51kB 17.9MB/s eta 0:00:01[K     |██                              | 61kB 17.3MB/s eta 0:00:01[K     |██▍                             | 71kB 17.7MB/s eta 0:00:01[K     |██▊                             | 81kB 17.8MB/s eta 0:00:01[K     |███                             | 92kB 18.0MB/s eta 0:00:01[K     |███▍                            | 102kB 16.0MB/s eta 0:00:01[K     |███▊                            | 112kB 16.0MB/s eta 0:00:01[K     |████                            | 122kB 16.0MB/

In [24]:
from dask.distributed import Client
client = Client(n_workers=4, threads_per_worker=4, processes=False, memory_limit='8GB')

# If we doesn't convert userId to category then will met errors

model_knn= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
movies_users= ratings.limit(1000000).toPandas().pivot(index='movieId', columns='userId',values='rating').fillna(0)
    

In [25]:
import dask
import dask.dataframe as dd
ratings_dask_df = dd.read_csv('./drive/MyDrive/BigDataProject/ratings.csv')

In [26]:
ratings_dask_df.head()

Function:  safe_head
args:      ((<Serialize: subgraph_callable-17b9ef91-0ec5-45c3-a134-5d1202ec06f6>, <Serialize: [(<function read_block_from_file at 0x7f374eb3b5f0>, <OpenFile '/content/./drive/MyDrive/BigDataProject/ratings.csv'>, 0, 64000000, b'\n'), None, True]>), 5)
kwargs:    {}
Exception: AttributeError("'tuple' object has no attribute 'head'")



AttributeError: ignored

In [None]:
with joblib.parallel_backend('dask'):
    mat_movies_users=csr_matrix(movies_users.values)
    model_knn.fit(mat_movies_users)

In [None]:
!pip install fuzzywuzzy
from fuzzywuzzy import process
def recommender(movie_name, data, model, n_recommendations ):
    df_movies = movies.toPandas()
    model.fit(data)
    idx=process.extractOne(movie_name, df_movies['title'])[2]
    print('Movie Selected: ', df_movies['title'][idx], 'Index: ',idx)
    print('Searching for recommendations.....')
    distances, indices=model.kneighbors(data[idx], n_neighbors=n_recommendations)
    for i in indices:
        print(df_movies['title'][i].where(i!=idx))
    
recommender('Toy Story (1995)', mat_movies_users, model_knn,20)

# SVD

## Basend on funk-svd is a Python 3 library implementing a fast version of the famous SVD algorithm popularized by Simon Funk during the Neflix Prize contest.

In [None]:
!pip install git+https://github.com/gbolmier/funk-svd

In [None]:
import pandas as pd
from funk_svd import SVD

In [None]:
%%time
with joblib.parallel_backend('dask'):
  movies_df = movies.toPandas()
  rating_df = ratings.toPandas()

In [None]:
movies_df.head(5)

In [None]:
rating_df.columns = ['u_id', 'i_id', 'rating', 'timestamps']
movies_df.columns = ['i_id', 'title', 'genres']
rating_df

In [None]:
%%time
from sklearn.metrics import mean_squared_error, mean_absolute_error
# movielens18.drop(columns = 'timestamp', inplace = True)

with joblib.parallel_backend('dask'):
  train = rating_df.sample(frac=0.8)
  val = rating_df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
  test = rating_df.drop(train.index.tolist()).drop(val.index.tolist())

In [None]:
train

In [None]:
lr, reg, factors = (0.01, 0.03, 90)

with joblib.parallel_backend('dask'):
  svd = SVD(lr=lr, reg=reg, n_epochs=20, n_factors=factors,
            min_rating=0.5, max_rating=5)
  svd.fit(X=train, X_val=val)

pred = svd.predict(test)
mae = mean_absolute_error(test["rating"], pred)
rmse = np.sqrt(mean_squared_error(test["rating"], pred))
print("Test MAE:  {:.2f}".format(mae))
print("Test RMSE: {:.2f}".format(rmse))
print('{} factors, {} lr, {} reg'.format(factors, lr, reg))

#User Recommendations

In [None]:
n_m = len(rating_df.i_id.unique())

#  Initialize my ratings
my_ratings = np.zeros(n_m)


my_ratings[4993] = 5
my_ratings[1080] = 5
my_ratings[260] = 5
my_ratings[4896] = 5
my_ratings[1196] = 5
my_ratings[1210] = 5
my_ratings[2628] = 5
my_ratings[5378] = 5

print('User ratings:')
print('-----------------')

for i, val in enumerate(my_ratings):
    if val > 0:
        print('Rated %d stars: %s' % (val, movies_df.loc[movies_df.i_id==i].title.values))

In [None]:

print("Adding your recommendations!")
items_id = [item[0] for item in np.argwhere(my_ratings>0)]
ratings_list = my_ratings[np.where(my_ratings>0)]
user_id = np.asarray([0] * len(ratings_list))

user_ratings = pd.DataFrame(list(zip(user_id, items_id, ratings_list)), columns=['u_id', 'i_id', 'rating'])



In [None]:
try:
    rating_df = rating_df.drop(columns=['timestamps'])
except:
    pass
data_with_user = rating_df.append(user_ratings, ignore_index=True)



with joblib.parallel_backend('dask'):
  train_user = data_with_user.sample(frac=0.8)
  val_user = data_with_user.drop(train_user.index.tolist()).sample(frac=0.5, random_state=8)
  test_user = data_with_user.drop(train_user.index.tolist()).drop(val_user.index.tolist())



In [None]:
from itertools import product


def funk_svd_predict(userID, data_with_user, movies_df):
    userID = [userID]

    # all_users = data_with_user.u_id.unique()
    all_movies = data_with_user.i_id.unique()
    recommendations = pd.DataFrame(list(product(userID, all_movies)), columns=['u_id', 'i_id'])

    #Getting predictions for the selected userID
    pred_train = svd.predict(recommendations)
    recommendations['prediction'] = pred_train
    recommendations.head(10)

    sorted_user_predictions = recommendations.sort_values(by='prediction', ascending=False)

    user_ratings = data_with_user[data_with_user.u_id == userID[0]]
    user_ratings.columns = ['u_id',	'i_id', 'rating']
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = movies_df[~movies_df['i_id'].isin(user_ratings['i_id'])].\
        merge(pd.DataFrame(sorted_user_predictions).reset_index(drop=True), how = 'inner', left_on = 'i_id', right_on = 'i_id').\
        sort_values(by='prediction', ascending = False)#.drop(['i_id'],axis=1)

    rated_df = movies_df[movies_df['i_id'].isin(user_ratings['i_id'])].\
        merge(pd.DataFrame(data_with_user).reset_index(drop=True), how = 'inner', left_on = 'i_id', right_on = 'i_id')
    rated_df = rated_df.loc[rated_df.u_id==userID[0]].sort_values(by='rating', ascending = False)
    
    return recommendations, rated_df
recommendations, rated_df = funk_svd_predict(0, data_with_user, movies_df)

In [None]:
rated_df

In [None]:
recommendations.head(10)