Movie Recommender System

In [1]:
# Loading the libraries
import pandas as pd
import numpy as np
import warnings

In [2]:
# Silence every warning category for the rest of the session so that
# cell outputs stay readable
warnings.simplefilter('ignore')

In [3]:
# Raw CSV locations of the two datasets on GitHub
ratings_link = 'https://raw.githubusercontent.com/kueyram/DSC630/main/Data/ratings.csv'
movies_link = 'https://raw.githubusercontent.com/kueyram/DSC630/main/Data/movies.csv'

# Read each CSV into its own DataFrame (comma is read_csv's default delimiter)
ratings_df = pd.read_csv(ratings_link)
movies_df = pd.read_csv(movies_link)

In [4]:
# Preview five randomly chosen rows of the ratings data
ratings_df.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
45790,304,529,5.0,891173884
11862,73,34162,4.5,1464197168
85931,558,367,3.0,1035415962
7635,51,3107,5.0,1230928844
28162,195,1732,4.0,974705986


In [5]:
# Preview five randomly chosen rows of the movies data
movies_df.sample(5)

Unnamed: 0,movieId,title,genres
4978,7620,Monster in a Box (1992),Comedy|Drama
3135,4219,Girls Just Want to Have Fun (1985),Comedy
4384,6428,Two Mules for Sister Sara (1970),Comedy|War|Western
8381,109633,"Garden of Words, The (Koto no ha no niwa) (2013)",Animation|Romance
2019,2691,"Legend of 1900, The (a.k.a. The Legend of the ...",Drama


In [6]:
# Report (rows, columns) for each of the two source datasets
for message, frame in (
    ('The shape of the ratings dataset is ', ratings_df),
    ('The shape of the movies dataset is ', movies_df),
):
    print(message, frame.shape)

The shape of the ratings dataset is  (100836, 4)
The shape of the movies dataset is  (9742, 3)


In [7]:
# Both datasets share the `movieId` column, so join on it to attach each
# movie's title and genres to every rating row
movie_ratings = ratings_df.merge(movies_df, on='movieId')

# Preview five random rows of the combined dataset
movie_ratings.sample(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
41165,308,6377,4.0,1421375046,Finding Nemo (2003),Adventure|Animation|Children|Comedy
73098,489,40583,3.0,1333232237,Syriana (2005),Drama|Thriller
806,368,70,3.0,971273705,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
19820,606,509,4.0,1171733384,"Piano, The (1993)",Drama|Romance
23716,474,3365,3.5,1127688921,"Searchers, The (1956)",Drama|Western


In [8]:
# Report (rows, columns) of the merged dataset
merged_shape = movie_ratings.shape
print('The shape of the final dataset is ', merged_shape)

The shape of the final dataset is  (100836, 6)


In [9]:
# Average rating per title, highest averages first (top 5)
mean_rating_by_title = movie_ratings.groupby('title')['rating'].mean()
mean_rating_by_title.sort_values(ascending=False).head(5)

title
Gena the Crocodile (1969)              5.0
True Stories (1986)                    5.0
Cosmic Scrat-tastrophe (2015)          5.0
Love and Pigeons (1985)                5.0
Red Sorghum (Hong gao liang) (1987)    5.0
Name: rating, dtype: float64

In [10]:
# Number of ratings each title received, most-rated first (top 5)
rating_count_by_title = movie_ratings.groupby('title')['rating'].count()
rating_count_by_title.sort_values(ascending=False).head(5)

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [11]:
# Per-title summary: the mean rating and the number of ratings behind that mean.
# One groupby/agg pass computes both columns, instead of grouping twice and
# assigning a one-column DataFrame into a column.
ratings = (
    movie_ratings.groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'num of ratings'})
)

# Printing random rows from the ratings dataframe
ratings.sample(n=5)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Quantum of Solace (2008),3.457143,35
"10th Kingdom, The (2000)",2.75,2
Bad News Bears (2005),3.0,3
Rat Race (2001),3.055556,18
Flesh & Blood (1985),3.0,2


In [12]:
# User-item matrix: one row per userId, one column per movie title, cells hold
# the rating (NaN where the user did not rate the movie)
user_movie_matrix = movie_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

# Show the ten titles with the most ratings
ratings.sort_values(by='num of ratings', ascending=False).head(10)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.164134,329
"Shawshank Redemption, The (1994)",4.429022,317
Pulp Fiction (1994),4.197068,307
"Silence of the Lambs, The (1991)",4.16129,279
"Matrix, The (1999)",4.192446,278
Star Wars: Episode IV - A New Hope (1977),4.231076,251
Jurassic Park (1993),3.75,238
Braveheart (1995),4.031646,237
Terminator 2: Judgment Day (1991),3.970982,224
Schindler's List (1993),4.225,220


In [13]:
# Function to recommend movies based on the user inputs
# The user will enter the name of the movie and the number of recommendations
def get_similar_movies(input_movie, top_n, min_common_ratings=2):
    """Recommend the movies whose ratings correlate best with ``input_movie``.

    Similarity is the correlation returned by ``DataFrame.corrwith`` between the
    ``input_movie`` column of the module-level ``user_movie_matrix`` and every
    other column, computed over the users who rated both movies.

    Parameters
    ----------
    input_movie : str
        Movie title; must match a column of ``user_movie_matrix`` exactly.
    top_n : int
        Number of movies to recommend. Must be a positive integer (Python or
        numpy integer; booleans are rejected).
    min_common_ratings : int, optional
        Minimum number of users who must have rated both ``input_movie`` and a
        candidate for the candidate to be kept. The default of 2 adds no
        filtering beyond dropping undefined correlations, so existing calls
        return the same movies as before. A correlation over only two shared
        raters is always exactly +1.0 or -1.0 when defined, so pass a larger
        value to suppress such spurious perfect scores.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows indexed by movie title with a single
        ``'Similarity'`` column, sorted in descending order and excluding
        ``input_movie`` itself. An empty DataFrame is returned (after printing
        a message) when the movie is unknown or ``top_n`` is invalid.
    """
    # Check if the input movie exists in the user_movie_matrix
    if input_movie not in user_movie_matrix.columns:
        print("Movie was not found in the database. Please try again.")
        return pd.DataFrame()

    # bool is a subclass of int, so exclude it explicitly; numpy integers
    # (e.g. values taken from a DataFrame) are accepted alongside plain ints
    is_integer = isinstance(top_n, (int, np.integer)) and not isinstance(top_n, bool)
    if not is_integer or top_n <= 0:
        print("Please enter a positive integer for the number of correlated movies.")
        return pd.DataFrame()

    # Ratings given to the selected movie (NaN for users who did not rate it)
    selected_movie_ratings = user_movie_matrix[input_movie]

    # For every movie, count the users who rated both it and the selected movie
    common_counts = user_movie_matrix.loc[selected_movie_ratings.notna()].count()

    # Similarity scores between the selected movie and all movies
    similarity = user_movie_matrix.corrwith(selected_movie_ratings)

    similarity_df = (
        similarity[common_counts >= min_common_ratings]
        .dropna()                                   # undefined correlations
        .drop(labels=input_movie, errors='ignore')  # never recommend the input itself
        .sort_values(ascending=False)
        .to_frame(name='Similarity')
    )

    # Return the top_n recommendations
    return similarity_df.head(int(top_n))

In [14]:
# Try the recommender on the most-rated title, asking for five suggestions
get_similar_movies(input_movie='Forrest Gump (1994)', top_n=5)

Unnamed: 0_level_0,Similarity
title,Unnamed: 1_level_1
Lost & Found (1999),1.0
"Century of the Self, The (2002)",1.0
The 5th Wave (2016),1.0
Play Time (a.k.a. Playtime) (1967),1.0
Memories (Memorîzu) (1995),1.0


In [15]:
# Try the recommender on another title, asking for ten suggestions
get_similar_movies(input_movie='Mission: Impossible - Fallout (2018)', top_n=10)

Unnamed: 0_level_0,Similarity
title,Unnamed: 1_level_1
Planet Earth II (2016),1.0
Arrival (2016),1.0
Avengers: Infinity War - Part I (2018),1.0
Batman Begins (2005),1.0
Blade Runner 2049 (2017),1.0
"Dark Knight, The (2008)",1.0
Doctor Strange (2016),1.0
Guardians of the Galaxy 2 (2017),1.0
Thor: Ragnarok (2017),1.0


In [16]:
# Prompt user for movie input and number of recommendations.
# Surrounding whitespace is stripped so an accidental trailing space does not
# make an otherwise valid title fail the exact-match lookup.
user_input_movie = input("Enter the name of the movie you like: ").strip()

try:
    num_recommendations = int(input("Enter the number of recommendations you want: "))
except ValueError:
    print("Please enter a valid integer for the number of recommendations.")
    num_recommendations = 0
    # Skip the lookup: calling the recommender with 0 would only print a
    # second error message for the same mistake
    recommendations = pd.DataFrame()
else:
    # getting the recommendations
    recommendations = get_similar_movies(user_input_movie, num_recommendations)

# Print the results; show the header only when there is something to list
if recommendations.empty:
    print("No recommendations available.")
else:
    print("Recommended Movies:")
    print(recommendations)

Enter the name of the movie you like: Money Train (1995)
Enter the number of recommendations you want: 3
Recommended Movies:
                    Similarity
title                         
Bulletproof (1996)         1.0
Porky's (1982)             1.0
Insomnia (2002)            1.0
