<a href="https://colab.research.google.com/github/dinhhungGM/RecommendationSystemUsingBigData/blob/main/Recommendation_System_BIGDATA_COS_JAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Content-based Filtering

In [None]:
# Mount Google Drive inside the Colab VM so files stored in Drive are readable from the notebook.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark py4j
!pip install pandas --upgrade

[K     |████████████████████████████████| 204kB 12.0MB/s 
[?25hCollecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/99/f7/01cea7f6c963100f045876eb4aa1817069c5c9eca73d2dbfb5d31ff9a39f/pandas-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (10.8MB)
[K     |████████████████████████████████| 10.8MB 11.2MB/s 
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 1.3.0 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
Successfully installed pandas-1.3.0


In [None]:
# Point the process at the JDK and at the unpacked Spark distribution, then
# initialise findspark.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
})

import findspark
findspark.init()

In [None]:
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
# Get distance functions from Sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from pprint import pprint
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.linalg import Vectors
import numpy as np
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder.appName('Group 7 - Recommendation System')
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    .config('spark.driver.memory', '8G')
    .config('spark.ui.showConsoleProgress', True)
    # Eager evaluation makes a bare DataFrame expression render as a table in the notebook.
    .config('spark.sql.repl.eagerEval.enabled', True)
    .config('spark.sql.pivotMaxValues', 100000000)
    .getOrCreate()
)

# Bind `sc` to the session's live SparkContext. The previous `sc = SparkContext`
# bound the class itself, so instance calls such as the one below could never work.
sc = spark.sparkContext
# sc.setCheckpointDir('checkpoint')

In [None]:
# Data source: https://www.kaggle.com/bandikarthik/movie-recommendation-system
# Both files are read with their first line as the header and with column types inferred.
csv_options = dict(header=True, inferSchema=True)
movies = spark.read.csv('drive/MyDrive/BigDataProject/movies.csv', **csv_options)
ratings = spark.read.csv('drive/MyDrive/BigDataProject/ratings.csv', **csv_options)

In [None]:
# Print the first five rows of the movies table as a quick sanity check of the load.
movies.limit(5).show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+



In [None]:
# Bare expression: rendered as a table because the session sets
# `spark.sql.repl.eagerEval.enabled` to True.
movies

movieId,title,genres
1,Toy Story (1995),Adventure|Animati...
2,Jumanji (1995),Adventure|Childre...
3,Grumpier Old Men ...,Comedy|Romance
4,Waiting to Exhale...,Comedy|Drama|Romance
5,Father of the Bri...,Comedy
6,Heat (1995),Action|Crime|Thri...
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|...


In [None]:
def _to_genre_row(row):
    """Turn one `movies` row into (movieId, list of normalised genre tokens).

    The genres string (third column) is lower-cased, stripped of double quotes
    and spaces, and split on '|'.
    """
    normalised = row[2].lower().replace('"', "").replace(' ', "")
    return (row[0], normalised.split('|'))


movies_df = spark.createDataFrame(movies.rdd.map(_to_genre_row), ['movieId', 'genre'])
movies_df.show(5)

+-------+--------------------+
|movieId|               genre|
+-------+--------------------+
|      1|[adventure, anima...|
|      2|[adventure, child...|
|      3|   [comedy, romance]|
|      4|[comedy, drama, r...|
|      5|            [comedy]|
+-------+--------------------+
only showing top 5 rows



In [None]:
# Count genre tokens: `count` holds every token across all movies (with repeats),
# `count_genre` is the number of distinct genres and later sizes the vocabulary.
count = [token for row in movies_df.collect() for token in row[1]]
distinct_genres = set(count)
print(len(count), len(distinct_genres))
count_genre = len(distinct_genres)

66668 20


In [None]:
# Vectorise the genre lists.

# Fit a CountVectorizer on the `genre` column. The vocabulary is capped at the number
# of distinct genres, and minDF=2.0 keeps only genres found in at least two movies.
vectorizer_params = {
    "inputCol": "genre",
    "outputCol": "features",
    "vocabSize": count_genre,
    "minDF": 2.0,
}
cv = CountVectorizer(**vectorizer_params)
cvmodel = cv.fit(movies_df)


In [None]:

# Apply the fitted CountVectorizer: each movie gains a `features` column holding
# its genre counts as a vector (the preview below shows them in sparse form).
movies_transformed = cvmodel.transform(movies_df)
movies_transformed.show(5)


+-------+--------------------+--------------------+
|movieId|               genre|            features|
+-------+--------------------+--------------------+
|      1|[adventure, anima...|(20,[1,8,11,12,13...|
|      2|[adventure, child...|(20,[8,11,12],[1....|
|      3|   [comedy, romance]|(20,[1,3],[1.0,1.0])|
|      4|[comedy, drama, r...|(20,[0,1,3],[1.0,...|
|      5|            [comedy]|      (20,[1],[1.0])|
+-------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Convert each sparse genre vector to a dense one, keyed by movie id.
# `toArray()` expands the sparse vector explicitly instead of relying on
# Vectors.dense() coercing a SparseVector element by element.
dense_rows = movies_transformed.select('movieId', 'features') \
    .rdd.map(lambda row: (row[0], Vectors.dense(row[1].toArray())))

# Mark the frame for caching *before* running any action on it. Previously cache()
# was called after take(2), so that action could not populate the cache.
fnldata = spark.createDataFrame(dense_rows, ['id', 'DenseVector']).cache()

# A full count touches every partition, so the later similarity passes can reuse the cache.
fnldata.count()

# Last expression: display the cached frame.
fnldata

id,DenseVector
1,"[0.0,1.0,0.0,0.0,..."
2,"[0.0,0.0,0.0,0.0,..."
3,"[0.0,1.0,0.0,1.0,..."
4,"[1.0,1.0,0.0,1.0,..."
5,"[0.0,1.0,0.0,0.0,..."
6,"[0.0,0.0,1.0,0.0,..."
7,"[0.0,1.0,0.0,1.0,..."
8,"[0.0,0.0,0.0,0.0,..."
9,"[0.0,0.0,0.0,0.0,..."
10,"[0.0,0.0,1.0,0.0,..."


In [None]:
# Pick the movie whose genre vector is the query for the similarity searches below.
test_id = 45
# lookup() returns a list with the DenseVector of every row whose id equals test_id.
test_vector = fnldata.rdd.lookup(test_id)
if not test_vector:
    # Fail here with a clear message rather than later inside a Spark task, where an
    # empty list would surface as an opaque reshape/indexing error.
    raise ValueError(f"movieId {test_id} not found in fnldata")

In [None]:
def _cosine_to_test(row):
    """Map an (id, DenseVector) row to (id, cosine similarity with the test movie)."""
    candidate = np.array(row[1]).reshape(1, -1)
    query = np.array(test_vector).reshape(1, -1)
    similarity = cosine_similarity(candidate, query)
    return (row[0], float(similarity[0][0]))


cosine_dist = spark.createDataFrame(
    fnldata.rdd.map(_cosine_to_test),
    ['movieId', 'cosine_sim'],
)

In [None]:
# Rank all movies by cosine similarity to the test movie and keep the ten best.
# Joining on the column *name* yields a single `movieId` column. The earlier
# expression join kept both copies, so the collected Rows carried two fields with
# the same name into createDataFrame().
# NOTE(review): the test movie itself is not filtered out of the candidates.
cosine_recomm = cosine_dist.join(movies_df, on='movieId') \
    .sort('cosine_sim', ascending=False).take(10)
cosine_recomm_df = spark.createDataFrame(cosine_recomm)
# Attach title and raw genres, then re-sort: a join does not preserve row order.
cosine_recomm_df.join(movies, on="movieId").orderBy('cosine_sim', ascending=False)

movieId,cosine_sim,movieId.1,genre,title,genres
105835,1.0000000000000002,105835,"[comedy, drama, t...","Double, The (2013)",Comedy|Drama|Thri...
147845,1.0000000000000002,147845,"[comedy, drama, t...",Manson Family Vac...,Comedy|Drama|Thri...
64327,1.0000000000000002,64327,"[comedy, drama, t...",Fools' Parade (1971),Comedy|Drama|Thri...
6193,1.0000000000000002,6193,"[comedy, drama, t...",Poolhall Junkies ...,Comedy|Drama|Thri...
5416,1.0000000000000002,5416,"[comedy, drama, t...",Cherish (2002),Comedy|Drama|Thri...
2438,1.0000000000000002,2438,"[comedy, drama, t...",Outside Ozona (1998),Comedy|Drama|Thri...
92906,1.0000000000000002,92906,"[comedy, drama, t...",Girls on the Road...,Comedy|Drama|Thri...
82097,1.0000000000000002,82097,"[comedy, drama, t...",Karthik Calling K...,Comedy|Drama|Thri...
8330,1.0000000000000002,8330,"[comedy, drama, t...",Our Man in Havana...,Comedy|Drama|Thri...
30767,1.0000000000000002,30767,"[comedy, drama, t...",Sitcom (1998),Comedy|Drama|Thri...


In [None]:
def _euclidean_to_test(row):
    """Map an (id, DenseVector) row to (id, Euclidean distance to the test movie)."""
    candidate = np.array(row[1]).reshape(1, -1)
    query = np.array(test_vector).reshape(1, -1)
    distance = euclidean_distances(candidate, query)
    return (row[0], float(distance[0][0]))


euclidean_dist = spark.createDataFrame(
    fnldata.rdd.map(_euclidean_to_test),
    ['movieId', 'euclidean_distances'],
)

In [None]:
# Keep the ten movies closest to the test movie by Euclidean distance (smallest first).
# Joining on the column name keeps one `movieId` field per Row; the earlier
# expression join produced two fields with the same name.
euclidean_recomm = euclidean_dist.join(movies_df, on='movieId') \
    .sort('euclidean_distances', ascending=True).take(10)
pprint(euclidean_recomm)

[Row(movieId=105835, euclidean_distances=0.0, movieId=105835, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=5416, euclidean_distances=0.0, movieId=5416, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=64327, euclidean_distances=0.0, movieId=64327, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=92906, euclidean_distances=0.0, movieId=92906, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=6193, euclidean_distances=0.0, movieId=6193, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=147845, euclidean_distances=0.0, movieId=147845, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=2438, euclidean_distances=0.0, movieId=2438, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=82097, euclidean_distances=0.0, movieId=82097, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=8330, euclidean_distances=0.0, movieId=8330, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=319, euclidean_distances=0.0, movieId=319, genre=['comedy', 'drama', 'thriller'])]


In [None]:
def _manhattan_to_test(row):
    """Map an (id, DenseVector) row to (id, Manhattan distance to the test movie)."""
    candidate = np.array(row[1]).reshape(1, -1)
    query = np.array(test_vector).reshape(1, -1)
    distance = manhattan_distances(candidate, query)
    return (row[0], float(distance[0][0]))


manhattan_dist = spark.createDataFrame(
    fnldata.rdd.map(_manhattan_to_test),
    ['movieId', 'manhattan_distances'],
)

In [None]:
# Keep the ten movies closest to the test movie by Manhattan distance (smallest first).
# Joining on the column name keeps one `movieId` field per Row; the earlier
# expression join produced two fields with the same name.
manhattan_recomm = manhattan_dist.join(movies_df, on='movieId') \
    .sort('manhattan_distances', ascending=True).take(10)
pprint(manhattan_recomm)



[Row(movieId=105835, manhattan_distances=0.0, movieId=105835, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=5416, manhattan_distances=0.0, movieId=5416, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=64327, manhattan_distances=0.0, movieId=64327, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=92906, manhattan_distances=0.0, movieId=92906, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=6193, manhattan_distances=0.0, movieId=6193, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=147845, manhattan_distances=0.0, movieId=147845, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=2438, manhattan_distances=0.0, movieId=2438, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=82097, manhattan_distances=0.0, movieId=82097, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=8330, manhattan_distances=0.0, movieId=8330, genre=['comedy', 'drama', 'thriller']),
 Row(movieId=319, manhattan_distances=0.0, movieId=319, genre=['comedy', 'drama', 'thriller'])]


In [None]:
def _jaccard_to_test(row):
    """Map an (id, DenseVector) row to (id, Jaccard similarity with the test movie).

    The similarity is |A ∩ B| / |A ∪ B| over the sets of genres present (non-zero
    entries) in the two vectors. It is computed with NumPy because
    `sklearn.metrics.jaccard_similarity_score` was deprecated in scikit-learn 0.21
    and removed in 0.23.
    """
    query_on = test_vector[0].toArray() != 0
    candidate_on = row[1].toArray() != 0
    union = np.count_nonzero(query_on | candidate_on)
    if union == 0:
        # Both genre sets are empty; score 1.0, as the legacy sklearn function did
        # for multilabel input.
        return (row[0], 1.0)
    return (row[0], float(np.count_nonzero(query_on & candidate_on) / union))


# NOTE(review): the import cell still imports `jaccard_similarity_score`; that import
# must be removed for the notebook to load on scikit-learn >= 0.23.
jaccard_sim = spark.createDataFrame(
    fnldata.rdd.map(_jaccard_to_test),
    ['movieId', 'jaccard_similarity'],
)

In [None]:
# Rank all movies by Jaccard similarity to the test movie and keep the ten best.
# Joining on the column *name* yields a single `movieId` column. The earlier
# expression join kept both copies, so the collected Rows carried two fields with
# the same name into createDataFrame().
# NOTE(review): the test movie itself is not filtered out of the candidates.
jaccard_recomm = jaccard_sim.join(movies_df, on='movieId') \
    .sort('jaccard_similarity', ascending=False).take(10)
jaccard_recomm_df = spark.createDataFrame(jaccard_recomm)
# Attach title and raw genres, then re-sort: a join does not preserve row order.
jaccard_recomm_df.join(movies, on="movieId").orderBy('jaccard_similarity', ascending=False)

movieId,jaccard_similarity,movieId.1,genre,title,genres
105835,1.0,105835,"[comedy, drama, t...","Double, The (2013)",Comedy|Drama|Thri...
147845,1.0,147845,"[comedy, drama, t...",Manson Family Vac...,Comedy|Drama|Thri...
64327,1.0,64327,"[comedy, drama, t...",Fools' Parade (1971),Comedy|Drama|Thri...
6193,1.0,6193,"[comedy, drama, t...",Poolhall Junkies ...,Comedy|Drama|Thri...
5416,1.0,5416,"[comedy, drama, t...",Cherish (2002),Comedy|Drama|Thri...
2438,1.0,2438,"[comedy, drama, t...",Outside Ozona (1998),Comedy|Drama|Thri...
92906,1.0,92906,"[comedy, drama, t...",Girls on the Road...,Comedy|Drama|Thri...
82097,1.0,82097,"[comedy, drama, t...",Karthik Calling K...,Comedy|Drama|Thri...
8330,1.0,8330,"[comedy, drama, t...",Our Man in Havana...,Comedy|Drama|Thri...
30767,1.0,30767,"[comedy, drama, t...",Sitcom (1998),Comedy|Drama|Thri...


In [None]:
# Number of Row objects held in `jaccard_recomm`, which is filled by a take(10) call.
# NOTE(review): the recorded output (26) does not match take(10), which suggests the
# cells were executed out of order; re-run the notebook top to bottom to confirm.
len(jaccard_recomm)

26

In [None]:
jaccard_sim.join(movies_df, movies_df.movieId==jaccard_sim.movieId).sort('jaccard_similarity',ascending=True).