### Recommender Systems – Exercise

##### Import the libraries necessary for this project.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

##### Load the ratings, movies and plots data. 

In [2]:
ratings_df = pd.read_csv('ml-10M100K/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [3]:
movies_df = pd.read_csv('ml-10M100K/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python')
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
plots_df = pd.read_csv("datasets/wiki_movie_plots_deduped.csv", encoding="utf-8")
plots_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


##### Merge the two datasets on movieId. Calculate the average rating for each genre.

In [5]:
# Merge the datasets
df = pd.merge(ratings_df, movies_df, on='MovieID')

# Split the genres into separate rows
df = df.join(df['Genres'].str.split('|', expand=True).stack().reset_index(level=1, drop=True).rename('Genre'))

# Calculate the average rating for each genre
genre_ratings = df.groupby('Genre')['Rating'].mean().sort_values(ascending=False)

# Get the top 5 genres based on average rating
print("The Top 5 Genres:\n", genre_ratings.head(5))

The Top 5 Genres:
 Genre
Film-Noir      4.012151
Documentary    3.783459
War            3.780173
IMAX           3.764537
Mystery        3.677631
Name: Rating, dtype: float64


##### Suggest five movies to a new user that are most popular and top rated.

In [6]:
def movies_suggestions(ratings_df, min_ratings=1000, top_n=5):
    top_rated_movies = ratings_df.groupby(['MovieID','Title']).agg({'Rating': ['mean', 'count']})
    top_rated_movies.columns = ['average_rating', 'num_ratings']
    filtered_movies = top_rated_movies[top_rated_movies['num_ratings'] >= min_ratings]
    return filtered_movies.head(top_n)

print("Five Movies Suggestions:\n", movies_suggestions(df))
# Based on the analysis, we will suggest five movies that are popular and top rated.
# These suggestions are based on their popularity and ratings, ensuring a well-rounded selection for a new user.

Five Movies Suggestions:
                                             average_rating  num_ratings
MovieID Title                                                          
1       Toy Story (1995)                          3.928769       132245
2       Jumanji (1995)                            3.208070        36096
3       Grumpier Old Men (1995)                   3.150385        15580
4       Waiting to Exhale (1995)                  2.860544         5292
5       Father of the Bride Part II (1995)        3.077435         7135


##### Add the new user's ratings to the ratings table.

In [7]:
# Find the maximum UserID
new_user_id = ratings_df['UserID'].max() + 1

# List of new ratings
new_ratings_list = [
    {'Title': 'Judge Dredd', 'Rating': 5},
    {'Title': 'Waterworld', 'Rating': 5},
    {'Title': 'Screamers', 'Rating': 4},
    {'Title': 'Jumanji', 'Rating': 3},
]

# Initialize an empty list to store new ratings and new movie entries
new_ratings = []
new_movies = []

# Check if each new movie title exists in the movies DataFrame
for counter, rating in enumerate(new_ratings_list):
    if movies_df['Title'].str.contains(rating['Title']).any():
        movie_row = movies_df[movies_df['Title'].str.contains(rating['Title'])]
        new_ratings.append({'UserID': new_user_id, 'MovieID': movie_row['MovieID'].values[0], 'Rating': rating['Rating']})
    else:
        counter += 1
        new_movie_id = movies_df['MovieID'].max() + counter
        new_movies.append({'MovieID': new_movie_id, 'Title': rating['Title']})
        new_ratings.append({'UserID': new_user_id, 'MovieID': new_movie_id, 'Rating': rating['Rating']})

# Add new movies to movies table and new user ratings to ratings table
movies_df = pd.concat([movies_df, pd.DataFrame(new_movies)], ignore_index=True)
ratings_df = pd.concat([ratings_df, pd.DataFrame(new_ratings)], ignore_index=True)
print(ratings_df[ratings_df['UserID'] == new_user_id])

#Find Maximum UserID: Identify the maximum UserID and increase it by one to get the new UserID.
#New User's Ratings: Define the new user's ratings for the specified movies.
#Add to Ratings Table: Convert the new ratings to a DataFrame and concatenate it with the existing ratings table.

          UserID  MovieID  Rating  Timestamp
10000054   71568      173     5.0        NaN
10000055   71568      208     5.0        NaN
10000056   71568       76     4.0        NaN
10000057   71568        2     3.0        NaN


##### The following code snippet provided in the course materials (matrix_factorization).

In [25]:
def matrix_factorization_with_regularization(ratings_df, n_factors=10, learning_rate=0.01, regularization=0.1, n_iterations=100):
    """Performs matrix factorization with L2 regularization using gradient descent."""

    # Create mappings for UserIDs and MovieIDs to sequential indices
    user_map = {user_id: index for index, user_id in enumerate(ratings_df['UserID'].unique())}
    movie_map = {movie_id: index for index, movie_id in enumerate(ratings_df['MovieID'].unique())}

    n_users = len(user_map)
    n_movies = len(movie_map)

    # Create the ratings matrix using pivot_table and the maps
    ratings_matrix = np.zeros((n_users, n_movies))
    for index, row in ratings_df.iterrows():
        user_idx = user_map[row['UserID']]
        movie_idx = movie_map[row['MovieID']]
        ratings_matrix[user_idx, movie_idx] = row['Rating']

    # Initialize user and movie latent factors randomly
    user_factors = np.random.rand(n_users, n_factors)
    movie_factors = np.random.rand(n_movies, n_factors)

    # Gradient descent
    for _ in tqdm(range(n_iterations)):
        for u in range(n_users):
            for i in range(n_movies):
                if ratings_matrix[u, i] > 0:
                    error = ratings_matrix[u, i] - np.dot(user_factors[u], movie_factors[i])

                    # Update factors with regularization
                    user_factors[u] += learning_rate * (error * movie_factors[i] - regularization * user_factors[u])
                    movie_factors[i] += learning_rate * (error * user_factors[u] - regularization * movie_factors[i])

    return user_factors, movie_factors, user_map, movie_map

def predict_rating(user_id, movie_id, user_factors, movie_factors, user_map, movie_map):
    """Predicts the rating for a given user-movie pair."""
    try:
        user_idx = user_map[user_id]
        movie_idx = movie_map[movie_id]
        return np.dot(user_factors[user_idx], movie_factors[movie_idx])
    except KeyError:
        return None

ratings_df_small = ratings_df.sample(frac=0.001, random_state=42)
user_factors, movie_factors, user_map, movie_map = matrix_factorization_with_regularization(ratings_df_small, regularization=0.01,n_iterations=10)
ratings_df_small['predicted'] = ratings_df_small.apply(lambda x: predict_rating(int(x['UserID']), int(x['MovieID']), user_factors, movie_factors, user_map, movie_map), axis=1)
ratings_df_small['squared_error'] = (ratings_df_small['Rating'] - ratings_df_small['predicted'])**2
ratings_df_small.head()

# Predict ratings for a new user.
top_movies = pd.merge(ratings_df_small, movies_df, on='MovieID')
top_movies = top_movies.sort_values(by=['predicted'], ascending=False)
print("Recommended movies for the new user: \n",top_movies['Title'][:5])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:45<00:00,  4.52s/it]

Recommended movies for the new user: 
 953                      Godfather, The (1972)
4352    Monty Python and the Holy Grail (1975)
5393                         Annie Hall (1977)
6919          Shawshank Redemption, The (1994)
130                          Casablanca (1942)
Name: Title, dtype: object





##### Content-based Filtering

In [24]:
plots_df['Title'] = plots_df['Title'] + ' (' + plots_df['Release Year'].astype(str) + ')'
tfidf = TfidfVectorizer(stop_words="english")
plots_df['Plot'] = plots_df['Plot'].fillna("")
tfidf_matrix = tfidf.fit_transform(plots_df['Plot'])
cosine_sim =  cosine_similarity(tfidf_matrix, tfidf_matrix)
indices = pd.Series(plots_df.index, index=plots_df['Title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:3]
    movies_indices = [ind[0] for ind in sim_scores]
    movies = plots_df['Title'].iloc[movies_indices]
    return movies

print("Recommendations for Judge Dredd:\n")
print(get_recommendations("Judge Dredd (1995)"))

Recommendations for Judge Dredd:

21246            Dredd (2012)
1154     Little Caesar (1931)
Name: Title, dtype: object
