In [1]:
# Import packages
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.spatial import distance
import surprise
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from pickle import dump
from pickle import load

In [28]:
# Read the ratings file and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV - confirm against the export step)
data = pd.read_csv('Data/Movie_Ratings.csv').drop(columns = ['Unnamed: 0'])

# Blank spacer, then the column/dtype summary, then a preview of the first rows
print('\n')
data.info()
data.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607734 entries, 0 to 607733
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   movieId       607734 non-null  int64  
 1   movie         607734 non-null  object 
 2   userId        607734 non-null  int64  
 3   rating        607734 non-null  float64
 4   timestamp     607734 non-null  int64  
 5   movieIDCount  607734 non-null  int64  
 6   userIDCount   607734 non-null  int64  
 7   youtubeId     607734 non-null  object 
dtypes: float64(1), int64(5), object(2)
memory usage: 37.1+ MB


Unnamed: 0,movieId,movie,userId,rating,timestamp,movieIDCount,userIDCount,youtubeId
0,1090,Platoon (1986),111982,4.0,1390131211,15808,1225,pPi8EQzJ2Bg
1,2294,Antz (1998),111982,1.5,1215994731,10163,1225,6kqGO1c70ak
2,296,Pulp Fiction (1994),111982,5.0,1202678932,67310,1225,s7EdQ4FqbhY
3,1394,Raising Arizona (1987),111982,3.0,1338657149,15483,1225,2AIfVoGUs6c
4,58559,"Dark Knight, The (2008)",111982,4.5,1217474947,20438,1225,GVx5K8WfFJY


In [30]:
# Size of the data set: distinct movie titles, distinct users, and total rating rows
print(f"\nnumber of movies: {len(data['movie'].unique())}")
print(f"\nnumber of users: {len(data['userId'].unique())}")
print(f"\nnumber of reviews: {len(data['userId'])}\n")

# Keep only the three columns used for modelling
data = data.loc[:, ['userId', 'movie', 'rating']]


number of movies: 462

number of users: 1894

number of reviews: 607734



In [31]:
# Convert the ratings frame into a Surprise Dataset
# The Reader is told that ratings lie on a 0.5 - 5 scale
reader = surprise.Reader(rating_scale = (0.5, 5))
# Columns are passed in the order user, item, rating
data = Dataset.load_from_df(df = data[['userId', 'movie', 'rating']], reader = reader)

In [10]:
# 10-fold cross-validation of an SVD model, scored with RMSE and MAE
algo = SVD(n_factors = 200, n_epochs = 200, random_state = 100)
cross_val = cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 10, verbose = False)

# Read the per-fold scores by key instead of by position in the result dict,
# so the right array is picked even if the list of measures (or the
# return_train_measures option) changes later

# RMSE
rmse = cross_val['test_rmse']
rmse_avg = round(rmse.mean(), 2)
rmse_std = round(rmse.std(), 2)

# MAE
mae = cross_val['test_mae']
mae_avg = round(mae.mean(), 2)
mae_std = round(mae.std(), 2)

# Print results
# NOTE: the fold-to-fold spread is small enough here that it prints as 0.0 at two decimals
print('\n' + 'rmse average: ' + str(rmse_avg) + '\n' + 'rmse standard deviation: ' + str(rmse_std) + '\n')
print('\n' + 'mae average: ' + str(mae_avg) + '\n' + 'mae standard deviation: ' + str(mae_std) + '\n')


rmse average: 0.74
rmse standard deviation: 0.0


mae average: 0.56
mae standard deviation: 0.0



In [11]:
# Hold out 20% of the ratings for testing and train on the remaining 80%
trainset, testset = train_test_split(data, test_size = 0.20, random_state = 100)

# Build the SVD model and fit it on the training split
algo = SVD(n_factors = 200, n_epochs = 200, random_state = 100)
algo.fit(trainset)

# Predict a rating for every entry of the held-out split
predictions_test = algo.test(testset)

# Report the test error: accuracy.rmse / accuracy.mae print the full-precision
# value themselves, and the rounded value is printed on the line below it
print('\n')
print(round(accuracy.rmse(predictions_test), 2))
print(round(accuracy.mae(predictions_test), 2))
print('\n')
# Shape of the learned factor matrix qi (one row per movie, one column per factor)
print(f'{algo.qi.shape}\n')



RMSE: 0.7496
0.75
MAE:  0.5702
0.57


(462, 200)



In [12]:
# Save the trained model as a pickle file
# The with-block closes the file as soon as the dump finishes, so the data is
# flushed to disk and no file handle is left open in the kernel
with open('SVD_Model.pkl', 'wb') as model_file:
    dump(algo, model_file)

In [13]:
# Load the trained model back from the pickle file
# NOTE: unpickling can run arbitrary code, so only load pickle files you created yourself
# The with-block closes the file handle once the model has been read
with open('SVD_Model.pkl', 'rb') as model_file:
    algo = load(model_file)

In [147]:
# Create recommendation function
def _similarity_frame(model, movie_name, score):
    """Score every movie in the model against one chosen movie.

    Parameters
    ----------
    model : object exposing ``qi`` (factor matrix indexed by inner item id) and
        ``trainset._raw2inner_id_items`` (mapping of movie title -> inner item id).
    movie_name : str
        Title of the chosen movie.
    score : number
        Rating given to the chosen movie; used as a weight.

    Returns
    -------
    pandas.DataFrame
        Columns ``similarity`` (cosine similarity to ``movie_name`` multiplied by
        ``score``) and ``movie name``, one row per movie known to the model.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a key of the title -> inner id mapping.
    """
    # NOTE(review): `_raw2inner_id_items` is a private attribute of the trainset;
    # it is kept so that an unknown title still raises KeyError as before
    raw_to_inner = model.trainset._raw2inner_id_items
    chosen_vector = model.qi[raw_to_inner[movie_name]]

    # scipy's `cosine` returns a distance, so 1 - distance is the similarity
    rows = [
        ((1 - cosine(chosen_vector, model.qi[inner_id])) * score, title)
        for title, inner_id in raw_to_inner.items()
    ]
    return pd.DataFrame(rows, columns = ['similarity', 'movie name'])


def recommendation(mov1, rating1, mov2, rating2, mov3, rating3,
                   top_n = 10, model_path = 'SVD_Model.pkl', verbose = True):
    """Recommend movies similar to three rated movies.

    For each chosen movie, the cosine similarity between its factor vector and
    every other movie's factor vector is multiplied by the rating given to the
    chosen movie. The three weighted scores are summed per movie, the chosen
    movies themselves are removed, and the highest-scoring movies are returned.

    Parameters
    ----------
    mov1, mov2, mov3 : str
        Titles of the chosen movies, spelled exactly as in the trained model.
    rating1, rating2, rating3 : number
        Ratings given to the corresponding movies.
    top_n : int, default 10
        Number of recommendations to return.
    model_path : str, default 'SVD_Model.pkl'
        Path of the pickled trained model.
    verbose : bool, default True
        When True, print a blank spacer and ``DataFrame.info()`` of the full
        ranking before returning.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie name`` and ``similarity``, sorted by ``similarity`` in
        descending order, at most ``top_n`` rows.

    Raises
    ------
    KeyError
        If any of the chosen titles is unknown to the model.
    """
    # NOTE: unpickling can run arbitrary code - only load model files you trust.
    # The with-block makes sure the file handle is closed after loading
    with open(model_path, 'rb') as model_file:
        algo = load(model_file)

    chosen = [(mov1, rating1), (mov2, rating2), (mov3, rating3)]

    # One weighted similarity table per chosen movie, stacked on top of each other
    stacked = pd.concat(
        [_similarity_frame(algo, title, score) for title, score in chosen],
        axis = 0,
    )

    # Remove all three chosen movies so they are never recommended back
    stacked = stacked.loc[~stacked['movie name'].isin([mov1, mov2, mov3])]

    # Each remaining movie appears once per chosen movie; add its scores together
    ranked = stacked.groupby(by = 'movie name').sum().reset_index()
    # Sort values by combined score in descending order
    ranked = ranked.sort_values('similarity', ascending = False).reset_index(drop = True)

    if verbose:
        print('\n')
        ranked.info()
    return ranked.head(top_n)

In [150]:
# Try the recommender with three movies that were each rated 5
recommendation(
    mov1 = "Jurassic Park (1993)", rating1 = 5,
    mov2 = "Godfather, The (1972)", rating2 = 5,
    mov3 = "Kill Bill: Vol. 2 (2004)", rating3 = 5,
)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movie name  459 non-null    object 
 1   similarity  459 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.3+ KB


Unnamed: 0,movie name,similarity
0,Kill Bill: Vol. 1 (2003),0.835388
1,"Godfather: Part II, The (1974)",0.783946
2,Pulp Fiction (1994),0.414238
3,Jaws (1975),0.395897
4,Goodfellas (1990),0.365485
5,"Godfather: Part III, The (1990)",0.344804
6,No Country for Old Men (2007),0.317878
7,Reservoir Dogs (1992),0.301384
8,"Silence of the Lambs, The (1991)",0.287296
9,"Lost World: Jurassic Park, The (1997)",0.273813
