In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import col, explode, udf, lit
from pyspark.sql.types import IntegerType, FloatType, ArrayType, StringType

# Build (or reuse) the Spark session. The MongoDB connector package is pulled in
# via spark.jars.packages, and its read and write URIs are set to the same value.
spark = (
    SparkSession.builder
    .appName("Recommendation")
    .master("local")
    .config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/imdb.movies")
    .config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/imdb.movies")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector:10.0.5")
    .getOrCreate()
)

In [None]:
# Inspect the configuration of the running session. Constructing a new SparkConf()
# yields an object that is not bound to the active context, so ask the context itself.
spark.sparkContext.getConf().getAll()

In [3]:
# Load the three CSV tables (the header row supplies the column names) and the
# multi-line JSON file.
movies = spark.read.csv("data/movies.csv", header=True)
ratings = spark.read.csv("data/ratings.csv", header=True)
links = spark.read.csv("data/links.csv", header=True)
imdb = spark.read.option("multiline", "true").json("data.json")

# Attach the link ids to every movie. The CSV title/genres are renamed, presumably
# so they do not clash with the title/genres columns selected after the next join.
movies_links = (
    movies.join(links, on=['movieId'], how='left')
    .withColumnRenamed('title', 'old_title')
    .withColumnRenamed('genres', 'old_genres')
    .drop('tmdbId')
)

# Left-join on imdbId, discard every row that has a null in any column, then keep
# only the columns that are exported later.
movies_imdb = (
    movies_links.join(imdb, on=['imdbId'], how='left')
    .dropna(how="any")
    .select("imdbId", "movieId", "title", "year", "poster", "rating", "summary", "time", "genres")
)

In [4]:
# insert movies_imdb to collection 
import json

def lower_case(x):
    """Return a new list holding the lower-cased form of every string in ``x``.

    The function is wrapped as a Spark UDF, so it can be handed a null array or
    an array containing null entries; both are passed through instead of raising.

    Args:
        x: iterable of strings (or ``None``).

    Returns:
        A list of lower-cased strings with ``None`` entries preserved, or
        ``None`` when ``x`` itself is ``None``.
    """
    if x is None:
        return None
    return [item.lower() if item is not None else None for item in x]

# Expose the helper as a Spark UDF that returns array<string>
to_lower_case = udf(lower_case, returnType=ArrayType(StringType()))

# Cast the ids to integers and the rating to float, lower-case the genre labels,
# and pin the column order of the exported frame.
movies_imdb_convert = (
    movies_imdb
    .withColumn("imdbId", col("imdbId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
    .withColumn("rating", col("rating").cast(FloatType()))
    .withColumn("genres", to_lower_case(col("genres")))
    .select("imdbId", "movieId", "title", "year", "poster", "rating", "summary", "time", "genres")
)


# convert_to_lower = udf(lower_case, ArrayType(StringType()))

# movies_imdb_convert = movies_imdb

# Write the converted frame through the "mongodb" data source in overwrite mode.
# No URI is passed here; presumably the connector falls back to the session's
# spark.mongodb.write.connection.uri setting — verify.
movies_imdb_convert.write.format("mongodb").mode("overwrite").save()
# genres = movies_imdb.select("genres")
# convert = genres.withColumn("new_genres", retrieve_array(col("genres")))

# movies_imdb.show()

                                                                                

In [91]:
# reading csvs
linksdf = pd.read_csv('data/links.csv', index_col='movieId',
                        dtype={'imdbId': str, 'tmdbId': str})
moviesdf = pd.read_csv('data/movies.csv', index_col='movieId')
# Put movies and links side by side on the shared movieId index, then reverse the
# row order so the per-genre lists below are filled from the last row upwards.
df = pd.concat([moviesdf, linksdf], axis=1)
df = df.iloc[::-1]

# Collect imdb ids under the first genre listed for each movie. The two columns
# are walked in lockstep instead of materialising a row with df.iloc[i] on every
# iteration.
movieGenres = df['genres'].tolist()
movieIds = {}
for genres, imdb_id in zip(movieGenres, df['imdbId']):
    movieIds.setdefault(genres.split('|')[0], []).append(imdb_id)

# Drop the placeholder genre; pop() with a default does not fail when it is absent
movieIds.pop('(no genres listed)', None)
len(movieIds['Comedy'])

2779

In [92]:
# # Join both the data frames to add movie data into ratings
# movie_ratings = ratings.join(movies, ['movieId'], 'left')
# movie_ratings.printSchema()

# Give the id columns integer type and the rating float type, and drop the
# timestamp column.
ratings = (
    ratings
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
    .withColumn("rating", col("rating").cast(FloatType()))
    .drop("timestamp")
)
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [93]:
def get_sparsity(ratings):
    """Print how sparse the user x movie rating grid is, as a percentage.

    Sparsity is ``(1 - ratings / (distinct users * distinct movies)) * 100``.

    Args:
        ratings: frame exposing ``select(column)`` whose result supports
            ``count()`` and ``distinct().count()``; must have the columns
            ``rating``, ``userId`` and ``movieId``.

    Returns:
        None. The result is printed. For an empty frame a notice is printed
        instead of dividing by zero.
    """
    # Number of ratings actually present
    count_nonzero = ratings.select("rating").count()

    # Size of the full grid: distinct users times distinct movies
    n_users = ratings.select("userId").distinct().count()
    n_movies = ratings.select("movieId").distinct().count()
    denominator = n_users * n_movies

    # An empty frame has no grid at all, so the ratio is undefined
    if denominator == 0:
        print("The ratings dataframe is empty; sparsity is undefined.")
        return

    sparsity = (1.0 - count_nonzero / denominator) * 100
    print("The ratings dataframe is ", "%.2f" % sparsity + "% sparse.")
    
# Print the sparsity percentage for the ratings frame
get_sparsity(ratings)

The ratings dataframe is  98.30% sparse.


In [94]:
# Number of ratings per user, most active users first
userId_ratings = (
    ratings
    .groupBy("userId")
    .count()
    .orderBy(col("count").desc())
)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [95]:
# Number of ratings per movie, most rated movies first
movieId_ratings = (
    ratings
    .groupBy("movieId")
    .count()
    .orderBy(col("count").desc())
)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|   2858|  204|
|     50|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



In [96]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [97]:
# Create test and train set; the fixed seed keeps the 80/20 split identical
# across runs so the reported metrics can be reproduced
(train, test) = ratings.randomSplit([.8, .2], seed=42)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN

# Create ALS model; the explicit seed makes factor initialisation repeatable
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

# Define evaluator: RMSE between the 'rating' label and the 'prediction' column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

In [98]:
# Hyperparameter grid: every listed rank is combined with every listed
# regularisation strength
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10, 20, 30])
    .addGrid(als.regParam, [0.001, .01, .05, .1])
    .build()
)

print("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [99]:
# Build cross validation using CrossValidator; the seed fixes how rows are
# assigned to the 5 folds so repeated runs score the same splits
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)


In [100]:
# Fit the cross validator on the 'train' dataset. Every parameter map in the grid
# is fitted and scored once per fold, so this is the expensive step.
model = cv.fit(train)
# Keep the model that scored best under the evaluator
best_model = model.bestModel

23/01/01 22:30:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/01/01 22:30:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/01/01 22:30:27 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

KeyboardInterrupt: 

In [None]:
print("**Best Model**")
# The hyperparameters are read from the estimator behind the best model, reached
# through the wrapped Java object; look it up once and reuse it.
best_estimator = best_model._java_obj.parent()
print("  Rank:", best_estimator.getRank())
print("  MaxIter:", best_estimator.getMaxIter())
print("  RegParam:", best_estimator.getRegParam())

**Best Model**
  Rank: 100
  MaxIter: 10
  RegParam: 0.1


In [None]:
# Score the held-out test split with the best model and print the evaluator's
# metric for those predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)
# test_predictions.show()

0.8786363445184214


                                                                                

In [None]:
# Hand-picked hyperparameters for the final model
rank=10
maxIter=10
regParam=0.15

# Create ALS model again; the explicit seed makes the fitted factors (and the
# metric printed below) repeatable across kernel restarts
final_als = ALS(
    rank=rank,
    maxIter=maxIter,
    regParam=regParam,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

final_model = final_als.fit(train)

# Evaluate on the held-out split and show a sample of the predictions
test_predictions = final_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)
test_predictions.show()

0.8732556287231745
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|   5618|   3.0| 3.7377372|
|   148|  54001|   4.0| 3.7215872|
|   148|  81834|   4.0| 3.7021964|
|   148|  81847|   4.5|  3.556174|
|   148|  89745|   4.0| 3.4501503|
|   148|  98491|   5.0| 3.6914907|
|   148|  99149|   3.0| 3.3535335|
|   148| 122886|   3.5| 3.0934284|
|   463|    110|   4.5|  4.021294|
|   463|    520|   4.0| 3.2583194|
|   463|   1320|   4.0| 3.2176085|
|   463|   2019|   4.0| 3.9286242|
|   463|   2167|   3.0| 3.6318412|
|   463|   3448|   3.0| 3.6441896|
|   463|   3753|   4.0| 3.5524113|
|   471|   2571|   3.5|  3.708087|
|   471|   4886|   4.0| 3.4968083|
|   471|   7147|   4.0| 3.4445422|
|   471|   8874|   3.5| 3.7928448|
|   471|  68157|   4.0| 3.8135772|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
# Rebind final_model to the model stored under the path "final_model".
# NOTE(review): this cell sits before the cell that saves the model, so on a fresh
# top-to-bottom run it relies on a "final_model" directory left by an earlier
# session — confirm this is intended.
final_model = ALSModel.load("final_model")

In [None]:
# Top-2 recommendations for a single user (id 148)
# users = ratings.select(als.getUserCol()).distinct().limit(3)
users = spark.createDataFrame([148], IntegerType()).toDF('userId')
userSubsetRecs = final_model.recommendForUserSubset(users, 2)

# Flatten the recommendations array into one row per (user, movie, predicted rating).
# This is computed once; the original cell repeated the identical statement twice.
nrecommendations = (
    userSubsetRecs
    .withColumn("rec_exp", explode("recommendations"))
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))
)

# Bring the small result to the driver and list the recommended movie ids
df = nrecommendations.toPandas()

df["movieId"].tolist()


[93008, 25906]

In [None]:
# Top-10 recommendations for every user, flattened to one row per
# (user, movie, predicted rating)
nrecommendations = (
    final_model.recommendForAllUsers(10)
    .withColumn("rec_exp", explode("recommendations"))
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))
)

nrecommendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|     1|   5490|5.7319317|
|     1|   5915|5.7319317|
|     1| 171495| 5.560452|
|     1|   6818|5.5255785|
|     1|  33649| 5.514558|
|     1|  27523| 5.458926|
|     1|   4429|5.4469056|
|     1| 102217| 5.435365|
|     1|  33779| 5.435365|
|     1|  69524| 5.430213|
+------+-------+---------+



                                                                                

In [None]:
# Recommendations for user 100, joined to the movie table for title and genres
(
    nrecommendations
    .join(movies, on='movieId')
    .filter(col('userId') == 100)
    .show()
)

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|  42730|   100|5.1454678|   Glory Road (2006)|               Drama|
|  67618|   100|5.1370006|Strictly Sexual (...|Comedy|Drama|Romance|
| 112804|   100| 5.040798|    I Origins (2014)|        Drama|Sci-Fi|
|   3086|   100|5.0038996|Babes in Toyland ...|Children|Comedy|F...|
|  93008|   100| 4.972296|Very Potter Seque...|      Comedy|Musical|
|  77846|   100| 4.972296| 12 Angry Men (1997)|         Crime|Drama|
|  25906|   100| 4.972296|Mr. Skeffington (...|       Drama|Romance|
| 183897|   100| 4.956506| Isle of Dogs (2018)|    Animation|Comedy|
|   4495|   100|4.9558105|Crossing Delancey...|      Comedy|Romance|
|   6201|   100|4.9558105|    Lady Jane (1986)|       Drama|Romance|
+-------+------+---------+--------------------+--------------------+



In [None]:
# The ten movies user 100 rated highest, for comparison with the recommendations
(
    ratings
    .join(movies, on='movieId')
    .filter(col('userId') == 100)
    .orderBy(col('rating').desc())
    .limit(10)
    .show()
)

+-------+------+------+--------------------+--------------------+
|movieId|userId|rating|               title|              genres|
+-------+------+------+--------------------+--------------------+
|   1101|   100|   5.0|      Top Gun (1986)|      Action|Romance|
|   1958|   100|   5.0|Terms of Endearme...|        Comedy|Drama|
|   2423|   100|   5.0|Christmas Vacatio...|              Comedy|
|   4041|   100|   5.0|Officer and a Gen...|       Drama|Romance|
|   5620|   100|   5.0|Sweet Home Alabam...|      Comedy|Romance|
|    368|   100|   4.5|     Maverick (1994)|Adventure|Comedy|...|
|    934|   100|   4.5|Father of the Bri...|              Comedy|
|    539|   100|   4.5|Sleepless in Seat...|Comedy|Drama|Romance|
|     16|   100|   4.5|       Casino (1995)|         Crime|Drama|
|    553|   100|   4.5|    Tombstone (1993)|Action|Drama|Western|
+-------+------+------+--------------------+--------------------+



In [None]:
# Top-10 users for every movie, flattened to one row per (movie, user, predicted rating).
# NOTE(review): this is built from best_model, whereas the per-user recommendations
# come from final_model — confirm the difference is intended.
irecommendations = (
    best_model.recommendForAllItems(10)
    .withColumn("rec_exp", explode("recommendations"))
    .select(col('movieId'), col("rec_exp.userId"), col("rec_exp.rating"))
)

irecommendations.limit(10).show()



+-------+------+---------+
|movieId|userId|   rating|
+-------+------+---------+
|     26|    53| 4.585873|
|     26|    43|4.4174333|
|     26|   171|4.3215876|
|     26|   389| 4.232469|
|     26|   452| 4.170912|
|     26|   122| 4.124021|
|     26|   250| 4.111661|
|     26|   269|4.0804043|
|     26|    40| 4.076036|
|     26|    93|4.0574923|
+-------+------+---------+



                                                                                

In [None]:
# Persist the model. Plain save() raises when the target path already exists, which
# breaks every re-run of the notebook; overwrite() replaces the previous copy.
final_model.write().overwrite().save("final_model")

In [None]:
# Load the model back from the "final_model" path
load_model = ALSModel.load("final_model")

# Score the test split with the reloaded model and print the evaluator's metric
test_predictions = load_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

test_predictions.show()

0.8732556287231764
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|      6|   4.0| 4.5949717|
|     1|     47|   5.0| 4.5823603|
|     1|    163|   5.0|  4.010098|
|     1|    216|   5.0|  4.128422|
|     1|    316|   3.0|  3.763847|
|     1|    367|   4.0| 3.8843725|
|     1|    552|   4.0| 3.7210417|
|     1|    553|   5.0| 4.6246066|
|     1|    590|   4.0| 4.3124084|
|     1|    593|   4.0| 4.7583666|
|     1|    648|   3.0|  4.013749|
|     1|   1009|   3.0| 3.2530835|
|     1|   1023|   5.0|  4.094334|
|     1|   1090|   4.0|  4.602919|
|     1|   1092|   5.0| 3.6413941|
|     1|   1136|   5.0| 4.8654294|
|     1|   1196|   5.0| 4.9686337|
|     1|   1206|   5.0|  4.465945|
|     1|   1256|   5.0|  4.551629|
|     1|   1265|   4.0| 4.7060924|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
def _prompt_for_rating(title):
    """Show ``title`` and return the entered score as a float, or None for 'n'.

    Keeps asking until the answer is 'n' or a number between 1 and 5, so a typo
    does not abort the whole session with a ValueError.
    """
    print(title)
    while True:
        answer = input('rate this movie 1-5, press n if you have not seen:\n').strip()
        if answer.lower() == 'n':
            return None
        try:
            score = float(answer)
        except ValueError:
            print('Please enter a number between 1 and 5, or n.')
            continue
        if 1 <= score <= 5:
            return score
        print('Please enter a number between 1 and 5, or n.')


def new_user_recomendatios(user_id, ratings, movies, num_ratings, num_recs):
    """Interactively rate a few movies as ``user_id``, retrain ALS, print recommendations.

    A fixed-seed sample of ``ratings`` supplies the candidate movies. Each title is
    shown and a score is requested; prompting stops once ``num_ratings`` scores have
    been entered or the sample is exhausted. The new rows are unioned with
    ``ratings``, an ALS model is fitted on the result, and the top ``num_recs``
    titles for ``user_id`` are printed.

    Args:
        user_id: id under which the entered ratings are stored.
        ratings: Spark DataFrame with columns userId, movieId, rating (in that order,
            since the new rows are built positionally against ``ratings.columns``).
        movies: Spark DataFrame with at least movieId and title.
        num_ratings: number of scores to collect before stopping.
        num_recs: number of recommendations to print.

    Returns:
        None. Output is printed. Uses the notebook-level ``spark`` session.
    """
    samples = ratings.sample(False, .001, seed=100).collect()
    # Read the movie id by column name so the lookup does not depend on column position
    sample_list = [row['movieId'] for row in samples]

    new_ratings = []
    for movie_id in sample_list:
        matches = movies.where(movies.movieId == movie_id).take(1)
        if not matches:
            # A rated movie that is missing from the movie table cannot be shown
            continue

        score = _prompt_for_rating(matches[0]['title'])
        if score is None:
            continue

        new_ratings.append((user_id, movie_id, score))
        if len(new_ratings) == num_ratings:
            break

    # Without any entered rating there is nothing to learn from, and Spark cannot
    # infer a schema from an empty list
    if not new_ratings:
        print('No ratings were entered, so no recommendations can be made.')
        return

    # new_ratings into dataframe based on the ratings columns
    new_user_ratings = spark.createDataFrame(new_ratings, ratings.columns)

    combined_movie_ratings = ratings.union(new_user_ratings)

    # Create ALS model again, now including the new user's rows
    als = ALS(
        rank=10,
        maxIter=50,
        regParam=0.15,
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        nonnegative=True,
        implicitPrefs=False,
        coldStartStrategy="drop",
    )

    model = als.fit(combined_movie_ratings)

    # Only one user is of interest, so score just that user instead of every user
    target_user = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    recomendation_for_user = model.recommendForUserSubset(target_user, num_recs).take(1)
    if not recomendation_for_user:
        print(f'No recommendations available for user {user_id}.')
        return

    # enumerate for ranking
    for ranking, (movieId, rating) in enumerate(recomendation_for_user[0]['recommendations']):
        found = movies.where(movies.movieId == movieId).take(1)
        title = found[0]['title'] if found else f'<unknown movie {movieId}>'
        # Plain f-string: a trailing .format() would choke on titles containing braces
        print(f'Recommendation {ranking+1}: {title} | predicted score: {rating}')

# Interactive run: collect up to 5 ratings as user id 2138, then print 10 recommendations
new_user_recomendatios(2138, ratings=ratings, movies=movies, num_ratings=5, num_recs=10)

In [None]:
# test_ratings = ratings
# new_ratings = []
# new_ratings.append((8382,101,5.0))
# # new ratings into dataframe base on ratings column
# new_user_ratings = spark.createDataFrame(new_ratings, test_ratings.columns)

# combined_movie_ratings = test_ratings.union(new_user_ratings)


# # Create ALS model again
# als = ALS(
#         rank=10,
#         maxIter=50,
#         regParam=0.15,
#         userCol="userId", 
#         itemCol="movieId",
#         ratingCol="rating", 
#         nonnegative = True, 
#         implicitPrefs = False,
#         coldStartStrategy="drop"
# )

# model = als.fit(combined_movie_ratings)

# # recomendations = model.recommendForAllUsers(10)
# nrecomendations = best_model.recommendForAllUsers(10)

# recs_for_user = nrecomendations.where(nrecomendations.userId == 100).take(1)
# recs_for_user[0]['recommendations']

# for ranking, (movieId, rating) in enumerate(recs_for_user[0]['recommendations']):
#     movie_string = movies.where(movies.movieId == movieId).take(1)[0]['title']
#     print('Recommendation {}: {} | predicted score: {}'.format(ranking+1, movie_string, rating))