### Recommender Systems – Exercise

##### Import the libraries necessary for this project.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

##### Load the ratings, movies and plots datasets. Display the first 5 rows of the dataframes to get an overview of the data.

In [2]:
ratings_df = pd.read_csv('ml-10M100K/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [3]:
movies_df = pd.read_csv('ml-10M100K/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python')
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
plots_df = pd.read_csv("datasets/wiki_movie_plots_deduped.csv", encoding="utf-8")
plots_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


##### Merge the two datasets on movieId and calculate the average rating for each genre.

In [5]:
# Merge the ratings and movies dataframes on the MovieID column to combine dataframe with movie details alongside ratings information.
df = pd.merge(ratings_df, movies_df, on='MovieID')

# Split the Genres column, which contains multiple genres separated by '|', into separate rows, so that each genre is represented individually for each movie.
# 'expand=True' ensures that each genre is split into a new column.
# 'stack()' converts columns into rows.
# 'reset_index(level=1, drop=True)' drops the old index column to maintain a clean format.
df = df.join(df['Genres'].str.split('|', expand=True).stack().reset_index(level=1, drop=True).rename('Genre'))

# Group by Genre and calculate the average rating for each genre for each genre group and sorts the genres by their average ratings in descending order.
genre_ratings = df.groupby('Genre')['Rating'].mean().sort_values(ascending=False)

# Print the top 5 genres based on the average rating.
print("The Top 5 Genres:\n", genre_ratings.head(5))

The Top 5 Genres:
 Genre
Film-Noir      4.012151
Documentary    3.783459
War            3.780173
IMAX           3.764537
Mystery        3.677631
Name: Rating, dtype: float64


##### Suggest five movies to a new user that are most popular and top rated.

In [6]:
def movies_suggestions(ratings_df, min_ratings=1000, top_n=5):
    # Group by MovieID and Title and calculate the average rating and the count of ratings for each movie.
    top_rated_movies = ratings_df.groupby(['MovieID','Title']).agg({'Rating': ['mean', 'count']})
    # Rename the columns for easy access.
    top_rated_movies.columns = ['average_rating', 'num_ratings']
    # Filter movies that have received at least min_ratings number of ratings to ensure that only movies with sufficient user feedback are considered.
    filtered_movies = top_rated_movies[top_rated_movies['num_ratings'] >= min_ratings]
    # Return the top n movies based on their average rating after filtering for popular movies.
    return filtered_movies.head(top_n)

# Call the function and print the top 5 movie suggestions based on the criteria.
print("Five Movies Suggestions:\n", movies_suggestions(df))

Five Movies Suggestions:
                                             average_rating  num_ratings
MovieID Title                                                          
1       Toy Story (1995)                          3.928769       132245
2       Jumanji (1995)                            3.208070        36096
3       Grumpier Old Men (1995)                   3.150385        15580
4       Waiting to Exhale (1995)                  2.860544         5292
5       Father of the Bride Part II (1995)        3.077435         7135


##### Add the new user's ratings to the ratings table.

In [7]:
# Find the maximum UserID from the existing ratings dataframe and increase it by 1 to create a new UserID.
new_user_id = ratings_df['UserID'].max() + 1

# List of new ratings provided by the new user.
new_ratings_list = [
    {'Title': 'Judge Dredd', 'Rating': 5},
    {'Title': 'Waterworld', 'Rating': 5},
    {'Title': 'Screamers', 'Rating': 4},
    {'Title': 'Jumanji', 'Rating': 3},
]

# Initialize empty lists to store the new ratings and any new movie entries.
new_ratings = []
new_movies = []

# Iterate over the list of new ratings.
for counter, rating in enumerate(new_ratings_list):
    # Check if the movie exists in the movies dataframe.
    if movies_df['Title'].str.contains(rating['Title']).any():
        # If the movie exists, get the MovieID and add the new rating for the new user.
        movie_row = movies_df[movies_df['Title'].str.contains(rating['Title'])]
        new_ratings.append({'UserID': new_user_id, 'MovieID': movie_row['MovieID'].values[0], 'Rating': rating['Rating']})
    else:
        # If the movie does not exist in the dataframe, assign a new MovieID and create a new movie entry.
        counter += 1 # Increment the counter for new MovieID.
        new_movie_id = movies_df['MovieID'].max() + counter # Generate a new unique MovieID.
        new_movies.append({'MovieID': new_movie_id, 'Title': rating['Title']}) # Add new movie to the new movies list.
        # Add the new rating for the new movie to the ratings list.
        new_ratings.append({'UserID': new_user_id, 'MovieID': new_movie_id, 'Rating': rating['Rating']})

# Add new movies to movies table and new user ratings to ratings table.
movies_df = pd.concat([movies_df, pd.DataFrame(new_movies)], ignore_index=True)
ratings_df = pd.concat([ratings_df, pd.DataFrame(new_ratings)], ignore_index=True)

# Print the ratings provided by the new user.
print(ratings_df[ratings_df['UserID'] == new_user_id])

          UserID  MovieID  Rating  Timestamp
10000054   71568      173     5.0        NaN
10000055   71568      208     5.0        NaN
10000056   71568       76     4.0        NaN
10000057   71568        2     3.0        NaN


##### The following code snippet provided in the course materials (matrix_factorization).

In [11]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def matrix_factorization_with_regularization(ratings_df, n_factors=10, learning_rate=0.01, regularization=0.1, n_iterations=100):
    """Performs matrix factorization with L2 regularization using gradient descent."""
    
    # Create mappings for UserIDs and MovieIDs to sequential indices.
    # This will help us map the user and movie IDs to matrix indices.
    user_map = {user_id: index for index, user_id in enumerate(ratings_df['UserID'].unique())}
    movie_map = {movie_id: index for index, movie_id in enumerate(ratings_df['MovieID'].unique())}

    n_users = len(user_map)  # Number of unique users.
    n_movies = len(movie_map)  # Number of unique movies.

    # Create the ratings matrix, initially filled with zeros.
    ratings_matrix = np.zeros((n_users, n_movies))
    
    # Populate the ratings matrix with actual ratings from the dataframe.
    for index, row in ratings_df.iterrows():
        user_idx = user_map[row['UserID']]  # Get the index of the user in the matrix.
        movie_idx = movie_map[row['MovieID']]  # Get the index of the movie in the matrix.
        ratings_matrix[user_idx, movie_idx] = row['Rating']  # Fill the rating for this user-movie pair.

    # Initialize user and movie latent factors randomly.
    # These represent the "hidden" features that matrix factorization will uncover.
    user_factors = np.random.rand(n_users, n_factors)
    movie_factors = np.random.rand(n_movies, n_factors)

    # Gradient descent loop.
    for _ in tqdm(range(n_iterations)):
        for u in range(n_users):  # Loop over all users.
            for i in range(n_movies):  # Loop over all movies.
                if ratings_matrix[u, i] > 0:  # Only update if there's a rating (non-zero entry).
                    # Calculate the error (actual rating - predicted rating).
                    error = ratings_matrix[u, i] - np.dot(user_factors[u], movie_factors[i])

                    # Update the user and movie factors with gradient descent and regularization.
                    user_factors[u] += learning_rate * (error * movie_factors[i] - regularization * user_factors[u])
                    movie_factors[i] += learning_rate * (error * user_factors[u] - regularization * movie_factors[i])

    return user_factors, movie_factors, user_map, movie_map

def predict_rating(user_id, movie_id, user_factors, movie_factors, user_map, movie_map):
    """Predicts the rating for a given user-movie pair."""
    try:
        # Get the index of the user and movie in the matrix.
        user_idx = user_map[user_id]
        movie_idx = movie_map[movie_id]
        # Predict the rating by taking the dot product of the corresponding user and movie factors.
        return np.dot(user_factors[user_idx], movie_factors[movie_idx])
    except KeyError:
        # If the user or movie is not found in the map, return None (i.e., unable to predict).
        return None

# Use a small sample of the ratings dataframe for quick testing.
ratings_df_small = ratings_df.sample(frac=0.001, random_state=42)
# Perform matrix factorization on the sample data.
user_factors, movie_factors, user_map, movie_map = matrix_factorization_with_regularization(ratings_df_small, regularization=0.01, n_iterations=10)
# Predict ratings for all user-movie pairs in the sample.
ratings_df_small['predicted'] = ratings_df_small.apply(
    lambda x: predict_rating(int(x['UserID']), int(x['MovieID']), user_factors, movie_factors, user_map, movie_map),
    axis=1
)

# Calculate the squared error between the actual rating and predicted rating.
ratings_df_small['squared_error'] = (ratings_df_small['Rating'] - ratings_df_small['predicted'])**2
# Display the first few rows of the dataframe to check the results.
ratings_df_small.head()
# Predict ratings for a new user.
# Merge the predicted ratings with movie titles for easy viewing.
top_movies = pd.merge(ratings_df_small, movies_df, on='MovieID')
# Sort the movies by predicted rating in descending order to recommend top-rated ones.
top_movies = top_movies.sort_values(by=['predicted'], ascending=False)

# Display the top 5 recommended movies for the new user.
print("Recommended movies for the new user: \n", top_movies['Title'][:5])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:47<00:00,  4.77s/it]


Recommended movies for the new user: 
 1992                             Wizard of Oz, The (1939)
5009                           Usual Suspects, The (1995)
1158    Lord of the Rings: The Fellowship of the Ring,...
7953                                 Trainspotting (1996)
8333                                   Stand by Me (1986)
Name: Title, dtype: object


##### Content-based Filtering

In [9]:
# Combine the movie title with the release year for a more descriptive title.
plots_df['Title'] = plots_df['Title'] + ' (' + plots_df['Release Year'].astype(str) + ')'
# Initialize the TF-IDF Vectorizer, excluding common stop words.
tfidf = TfidfVectorizer(stop_words="english")
# Fill any missing plot descriptions with an empty string.
plots_df['Plot'] = plots_df['Plot'].fillna("")
# Transform the Plot column into a matrix of TF-IDF features.
tfidf_matrix = tfidf.fit_transform(plots_df['Plot'])
# Calculate the cosine similarity between the TF-IDF vectors.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Create an index for fast lookup of movie titles.
indices = pd.Series(plots_df.index, index=plots_df['Title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    """
    Given a movie title, return the top 2 most similar movies based on cosine similarity.
    """
    # Get the index of the movie from the title.
    idx = indices[title]
    # Get the similarity scores for all movies with the given movie.
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on similarity scores in descending order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the top 2 most similar movies.
    sim_scores = sim_scores[1:3]
    # Extract the movie indices from the similarity scores.
    movies_indices = [ind[0] for ind in sim_scores]
    # Return the titles of the most similar movies.
    movies = plots_df['Title'].iloc[movies_indices]
    return movies

# Get recommendations for the movie "Judge Dredd (1995)".
print("Recommendations for Judge Dredd:\n")
print(get_recommendations("Judge Dredd (1995)"))

Recommendations for Judge Dredd:

21246            Dredd (2012)
1154     Little Caesar (1931)
Name: Title, dtype: object
