<a href="https://colab.research.google.com/github/james-finn-travers/Movie-Recommendation-System/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System
James Travers

## Libraries Importing

---



In [1]:
# Make sure to install the sklearn library before running this code
# You can install it by running: pip install scikit-learn
#%pip install scikit-learn
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re

## Load Datasets

In [2]:
import pandas as pd
#https://files.grouplens.org/datasets/movielens/ml-25m/


!wget https://files.grouplens.org/datasets/movielens/ml-latest.zip
!unzip ml-latest.zip

# Updated file paths to load from the ml-latest directory:
genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
genome_tags = pd.read_csv('ml-latest/genome-tags.csv')
movies = pd.read_csv('ml-latest/movies.csv')
links = pd.read_csv('ml-latest/links.csv')
tags = pd.read_csv('ml-latest/tags.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')


movies_average_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
movies_average_ratings.rename(columns={'rating': 'average_rating'}, inplace=True)



movies = pd.merge(movies, movies_average_ratings, on='movieId', how='left')

#imdb.com/title/tt0114885/



--2025-01-16 19:21:09--  https://files.grouplens.org/datasets/movielens/ml-latest.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 350896731 (335M) [application/zip]
Saving to: ‘ml-latest.zip’


2025-01-16 19:21:31 (16.3 MB/s) - ‘ml-latest.zip’ saved [350896731/350896731]

Archive:  ml-latest.zip
   creating: ml-latest/
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/links.csv     
  inflating: ml-latest/README.txt    
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


## Filter Movie Recommendation List
### Filter out Obscure Movies (<500 ratings)

In [3]:
# Minimum number of ratings a movie needs before it may be recommended.
MIN_RATINGS = 500

# Number of ratings received by every movie that appears in `ratings`
# (after reset_index the 'rating' column holds that count).
rating_counts = ratings.groupby('movieId')['rating'].count().reset_index()

# Rated movies below the threshold, most-rated first.
movies_obscure = rating_counts[rating_counts['rating'] < MIN_RATINGS].sort_values(by='rating', ascending=False)
obscure_ids = set(movies_obscure['movieId'].values)

# Keep the movies that reach the threshold. Selecting the popular ids, rather
# than dropping the obscure ones, also removes movies that never appear in
# `ratings`: they have fewer than MIN_RATINGS ratings but are absent from
# movies_obscure, so an exclusion filter would let them through.
popular_ids = rating_counts.loc[rating_counts['rating'] >= MIN_RATINGS, 'movieId']
movies_recommending = movies[movies['movieId'].isin(popular_ids)]

### Remove Bad Movies for Small Sizes

In [4]:
# Movies whose mean rating is below 3, best of them first. The threshold is
# compared against whatever scale ratings['rating'] holds when this cell runs.
movies_bad = (
    ratings.groupby('movieId')['rating']
    .mean()
    .reset_index()
    .query('rating < 3')
    .sort_values(by='rating', ascending=False)
)
bad_ids = set(movies_bad['movieId'].to_numpy())

# Ids of the movies that passed the popularity filter and are not "bad".
good_movies = list(
    movies_recommending.loc[
        ~movies_recommending['movieId'].isin(movies_bad['movieId']),
        'movieId',
    ]
)

### Clean movie titles to improve ease of search

In [5]:
def clean_title(title):
    """Return `title` with everything except ASCII letters, digits and spaces removed."""
    return re.sub("[^a-zA-Z0-9 ]", "", title)
# Store a search-friendly copy of every title next to the original.
movies["clean_title"] = movies["title"].map(clean_title)

### Vectorize clean titles for searching

In [6]:
# TF-IDF over single words and word pairs of the cleaned titles; `tfidf` holds
# one row per row of `movies`, in the same order.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(raw_documents=movies["clean_title"])

### Verify Proper Loading

In [7]:
# Preview the first rows of the tables used downstream.
# The Genome Scores, Genome Tags and Tags previews are switched off.
for heading, table in (
    ('Movies:', movies),
    ('Ratings:', ratings),
    ('Movies Obscure:', movies_obscure),
):
    print(heading)
    print(table.head())

Movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  average_rating  \
0  Adventure|Animation|Children|Comedy|Fantasy        3.893508   
1                   Adventure|Children|Fantasy        3.278179   
2                               Comedy|Romance        3.171271   
3                         Comedy|Drama|Romance        2.868395   
4                                       Comedy        3.076957   

                        clean_title  
0                    Toy Story 1995  
1                      Jumanji 1995  
2             Grumpier Old Men 1995  
3            Waiting to Exhale 1995  
4  Father of the Bride Part II 1995  
Ratings:
   userId  movieId  rating   timestamp
0       1        1

### Check for Empty/Duplicate Data

In [8]:
# For each table: missing values per column, then the number of fully duplicated rows.
# The Genome Tags check is switched off.
for heading, table in (
    ('Genome Scores:', genome_scores),
    ('Movies:', movies),
):
    print(heading)
    print(table.isna().sum())
    print(table.duplicated().sum())

Genome Scores:
movieId      0
tagId        0
relevance    0
dtype: int64
0
Movies:
movieId              0
title                0
genres               0
average_rating    3298
clean_title          0
dtype: int64
0


## Normalize Ratings to 0-1 Linear Scale

In [9]:
# Upper end of the star scale the ratings are divided by.
MAX_RATING = 5

# Rescale in place to 0-1. The guard makes the cell safe to re-run: once the
# column has been rescaled its maximum is at most 1, so a second run is a no-op
# instead of dividing by MAX_RATING again.
# NOTE(review): assumes the raw column contains at least one rating above 1.
if ratings['rating'].max() > 1:
    ratings['rating'] /= MAX_RATING
print(ratings.head())

   userId  movieId  rating   timestamp
0       1        1     0.8  1225734739
1       1      110     0.8  1225865086
2       1      158     0.8  1225733503
3       1      260     0.9  1225735204
4       1      356     1.0  1225735119


## Create Relevant Numpy Arrays

In [10]:

# Largest id in each table. The arrays below get one extra row/column so that
# an id can be used directly as an index (index 0 is never filled by an id).
num_movies = movies['movieId'].max()
num_tags = genome_scores['tagId'].max()
num_users = ratings['userId'].max()

print('{}  movies, {}  tags,  {} users'.format(num_movies, num_tags, num_users))

# user_tags[user, tag, 0] and user_tags[user, tag, 1] are two per-user, per-tag
# accumulators; movie_tags[movie, tag] is filled from genome_scores later.
user_tags = np.zeros(shape=(num_users + 1, num_tags + 1, 2))
movie_tags = np.zeros(shape=(num_movies + 1, num_tags + 1))

print(user_tags.shape)
print(movie_tags.shape)

288983  movies, 1128  tags,  330975 users
(330976, 1129, 2)
(288984, 1129)


#### Fill Movie-Tags Relevance Matrix

In [11]:
# Scatter every (movieId, tagId, relevance) row of genome_scores into the matrix.
movie_tags[
    genome_scores['movieId'].to_numpy(),
    genome_scores['tagId'].to_numpy(),
] = genome_scores['relevance'].to_numpy()

Verify proper relevance tags matrix filling

In [12]:
# Top-left 5x5 corner of the movie/tag relevance matrix.
print(movie_tags[0:5, 0:5])

[[0.      0.      0.      0.      0.     ]
 [0.      0.032   0.02225 0.07    0.059  ]
 [0.      0.0325  0.032   0.0405  0.051  ]
 [0.      0.0415  0.05525 0.02125 0.07225]
 [0.      0.0315  0.034   0.028   0.02725]]


#### Create User Preferences Matrix
Note: This cell uses over 13 GB of RAM. Make sure your runtime has enough memory before running it.

In [13]:
# Extract relevant columns
user_ids = ratings['userId'].values.astype(int)  # User IDs from ratings
movie_ids = ratings['movieId'].values.astype(int)  # Movie IDs from ratings
ratings_values = ratings['rating'].values  # Ratings from ratings
# Get dimensions of user_tags
num_users, num_tags, _ = user_tags.shape

# Update `user_tags[:,:,0]`: Weighted ratings for tags
for u_id, m_id, r in zip(user_ids, movie_ids, ratings_values):
    user_tags[u_id, :, 0] += r * movie_tags[m_id]
    user_tags[u_id, :, 1] += movie_tags[m_id]

# Compute user preferences
users_preferences = user_tags[:, :, 0] / np.where(user_tags[:, :, 1] != 0, user_tags[:, :, 1], 1)


Verify user-preference matrix is filled

In [14]:
# Top-left 5x5 corner of the user/tag preference matrix.
print(users_preferences[0:5, 0:5])

[[0.         0.         0.         0.         0.        ]
 [0.         0.83153694 0.79887343 0.7894713  0.78466524]
 [0.         0.6961728  0.66832028 0.76642915 0.75254237]
 [0.         0.98610396 0.98635511 0.98379843 0.98237923]
 [0.         0.87904588 0.86582531 0.91831785 0.86316876]]


## Find Most Similar Movies Dashboard

### Recommended Movies Dataframe Class

## User Preferences

In [15]:
class User:
    """An interactive user who rates movies and asks for recommendations.

    The class reads these notebook-level objects: `movies`, `movie_tags`,
    `vectorizer`, `tfidf`, `clean_title`, `ratings`, `users_preferences`,
    `good_movies` and `obscure_ids`.
    """

    # Upper end of the star scale accepted by add_rating / adjust_weightings.
    MAX_RATING = 5

    def __init__(self):
        # One row per rated movie, newest first.
        self.ratings = pd.DataFrame(columns=['movieId', 'movie', 'rating', 'backendRating'])
        # Per tag: column 0 = sum of rating * relevance, column 1 = sum of relevance.
        self.tags_ratings = np.zeros((movie_tags.shape[1], 2))
        # Initialized as zeros so a tag with no relevance yields 0 instead of a division by zero.
        self.preferences = np.zeros(movie_tags.shape[1])
        self.movies_seen = []

    def get_num_movies(self):
        """Return how many ratings this user has entered."""
        return len(self.ratings)

    def add_rating(self, entry_title, rating):
        """Rate the movie whose title best matches `entry_title`.

        Args:
            entry_title: Free-text title typed by the user.
            rating: Rating on the 0..MAX_RATING star scale.

        Raises:
            ValueError: If no title shares a single term with `entry_title`.
                Without this check argmax over an all-zero similarity vector
                would silently select the first row of `movies`.
        """
        # Find the most similar movie title.
        query_vec = vectorizer.transform([clean_title(entry_title)])
        similarity = cosine_similarity(query_vec, tfidf).flatten()
        index = int(np.argmax(similarity))
        if similarity[index] <= 0:
            raise ValueError(f"No movie title matches '{entry_title}'.")
        movie_id = int(movies['movieId'].iloc[index])
        self.adjust_weightings(movie_id, rating)
        self.movies_seen.append(movie_id)

    def adjust_weightings(self, movie_id, rating):
        """Fold one rating into the tag totals, the ratings table and the preferences.

        Args:
            movie_id: movieId of the rated movie; used as a row index into `movie_tags`.
            rating: Rating on the 0..MAX_RATING star scale.
        """
        normalized = rating / self.MAX_RATING
        # Look the title up first so a failed lookup leaves the totals untouched.
        title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]

        movie_tag_row = movie_tags[movie_id, :]
        self.tags_ratings[:, 0] += normalized * movie_tag_row
        self.tags_ratings[:, 1] += movie_tag_row

        new_row = pd.DataFrame([{'movieId': movie_id, 'movie': title,
                                 'rating': rating, 'backendRating': normalized}])
        # Concatenating with an empty frame is deprecated in recent pandas,
        # so the first rating simply becomes the table.
        if self.ratings.empty:
            self.ratings = new_row
        else:
            self.ratings = pd.concat([new_row, self.ratings], ignore_index=True)

        # Relevance-weighted mean rating per tag; a zero divisor is replaced by 1.
        total_relevance = self.tags_ratings[:, 1]
        self.preferences = self.tags_ratings[:, 0] / np.where(total_relevance != 0, total_relevance, 1)

    def get_ratings(self):
        """Return the ratings DataFrame (newest rating first)."""
        return self.ratings

    def get_similar_users(self, num_users):
        """Return the row indices in `users_preferences` of the `num_users` closest users.

        This user is not a row of `users_preferences`, so the nearest neighbour
        is a genuinely different user and is kept rather than dropped.
        """
        indices, _distances = self.get_nearest_neighbours(users_preferences, k=num_users, metric='cosine')
        return indices

    def get_movie_reccomendations(self, num_neighbours=25, num_recommendations=5):
        """Print and return the titles recommended from the most similar users' ratings.

        Args:
            num_neighbours: How many similar users to take ratings from.
            num_recommendations: Maximum number of titles to recommend.

        Returns:
            List of recommended titles, best first; empty when nothing has been
            rated yet or no candidate passes the filters.
        """
        if self.get_num_movies() == 0:
            print("No movies have been rated yet.")
            return []
        print('Note: to increase personalization of recommendations, it is encouraged to input as many movies as possible.')
        print("Finding movies...")
        closest_users = self.get_similar_users(num_neighbours)

        # Average the similar users' ratings per movie. The number of ratings
        # breaks ties so a score backed by more neighbours ranks first.
        neighbour_ratings = ratings[ratings['userId'].isin(closest_users)]
        scores = (
            neighbour_ratings.groupby('movieId')['rating']
            .agg(['mean', 'count'])
            .sort_values(by=['mean', 'count'], ascending=False)
        )

        # Keep good, non-obscure movies the user has not rated yet.
        candidate_ids = scores.index
        keep = (
            candidate_ids.isin(good_movies)
            & ~candidate_ids.isin(obscure_ids)
            & ~candidate_ids.isin(self.movies_seen)
        )
        recommended_titles = [
            movies.loc[movies['movieId'] == movie_id, 'title'].iloc[0]
            for movie_id in candidate_ids[keep][:num_recommendations]
        ]
        for title in recommended_titles:
            print(title)
        return recommended_titles

    def print_movie_ratings(self):
        """Print the title and star rating of every rated movie."""
        print(self.ratings[['movie', 'rating']])

    def get_nearest_neighbours(self, data, k, metric):
        """Fit a k-nearest-neighbours model on `data` and query it with this user's preferences.

        Returns:
            Tuple `(indices, distances)` for the `k` nearest rows of `data`,
            in the order reported by the model.
        """
        model = NearestNeighbors(n_neighbors=k, metric=metric)
        model.fit(data)
        distances, indices = model.kneighbors(self.preferences.reshape(1, -1))
        # kneighbors returns one row per query; there is exactly one query here.
        return indices[0], distances[0]

## Create Interactive Widgets

In [16]:
# Create widgets

# Free-text movie title
movie_input = widgets.Text(
    placeholder="Enter movie name",
    description="Movie:"
)

# Optional release year. IntText has no placeholder trait (the "Enter movie
# name" text was copied from the title box), and passing value=None simply
# leaves the default, so the default of 0 is spelled out instead.
# The submit handler treats 0 as "no year given".
year_input = widgets.IntText(
    description="Year:",
    value=0
)

# Slider for the star rating
rating_slider = widgets.FloatSlider(
    value=0,
    min=0,
    max=5,
    step=0.1,
    description="Rating:",
    orientation='horizontal'
)

# Submit the current title / year / rating
submit_button = widgets.Button(
    description="Submit Rating"
)

# Show the ratings entered so far
view_ratings_button = widgets.Button(
    description="View Ratings"
)

# Ask for recommendations
recommendations_button = widgets.Button(
    description="What Should I Watch"
)


# Inputs stacked vertically, with the three buttons side by side underneath.
layout = widgets.VBox([
    movie_input,
    year_input,
    rating_slider,
    widgets.HBox([submit_button, view_ratings_button, recommendations_button])
])

# The layout is not displayed here; the "Run System" cell displays it together
# with the output area.


## Run System

In [17]:
# The single user whose ratings the buttons below read and modify.
user_instance = User()

# Area where the button handlers print their results.
output = widgets.Output()

# Event handler for submitting a rating
def on_submit_rating(change):
    """Handle a click on "Submit Rating".

    Reads the title, optional year and rating from the widgets and records the
    rating on `user_instance`. The result is printed into `output`.

    Args:
        change: Argument supplied by the button's click callback; unused.
    """
    with output:
        output.clear_output()
        # Strip first so a title made only of spaces is rejected too.
        movie_title = movie_input.value.strip()
        if not movie_title:
            print("Please enter a movie title.")
            return
        # A year of 0 means the year box was left untouched.
        if year_input.value:
            movie_title += ' ' + str(year_input.value)
        rating = rating_slider.value
        try:
            user_instance.add_rating(movie_title, rating)
        except ValueError as error:
            # Show a rejected entry as a short message instead of a traceback.
            print(f"Could not add rating: {error}")
            return
        print(f"Added rating for movie '{movie_title}' with {rating} stars.")

# Event handler for viewing ratings
def on_view_ratings(change):
    """Handle a click on "View Ratings": show each rated movie with its rating.

    Args:
        change: Argument supplied by the button's click callback; unused.
    """
    with output:
        output.clear_output()
        ratings_df = user_instance.get_ratings()
        if ratings_df.empty:
            print("No ratings have been added yet.")
            return
        print("Your Ratings:")
        # Show the rating next to the title, matching User.print_movie_ratings.
        display(ratings_df[['movie', 'rating']])

# Event handler for viewing recommendations
def on_view_recommendations(change):
    """Handle a click on "What Should I Watch".

    Clears `output` and lets `user_instance` print its recommendations into it.
    The value returned by the user object is not used.

    Args:
        change: Argument supplied by the button's click callback; unused.
    """
    with output:
        output.clear_output()
        user_instance.get_movie_reccomendations()


# Attach event handlers to buttons
submit_button.on_click(on_submit_rating)
view_ratings_button.on_click(on_view_ratings)
recommendations_button.on_click(on_view_recommendations)

# Display the output widget below the layout

layout_with_output = widgets.VBox([layout, output])
display(layout_with_output)

VBox(children=(VBox(children=(Text(value='', description='Movie:', placeholder='Enter movie name'), IntText(va…