### Recommender Systems – Exercise

##### Import the libraries necessary for this project.

In [1]:
import pandas as pd

##### Load the ratings and movies data. Merge the two datasets on movieId.

In [2]:
ratings_df = pd.read_csv('ml-10M100K/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [3]:
movies_df = pd.read_csv('ml-10M100K/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python')
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


##### Calculate the average rating for each genre.

In [4]:
# Merge the datasets
df = pd.merge(ratings_df, movies_df, on='MovieID')

# Split the genres into separate rows
df = df.join(df['Genres'].str.split('|', expand=True).stack().reset_index(level=1, drop=True).rename('Genre'))

# Calculate the average rating for each genre
genre_ratings = df.groupby('Genre')['Rating'].mean().sort_values(ascending=False).head(5)

# Get the top 5 genres based on average rating
print("The Top 5 Genres:\n", genre_ratings)

The Top 5 Genres:
 Genre
Film-Noir      4.012151
Documentary    3.783459
War            3.780173
IMAX           3.764537
Mystery        3.677631
Name: Rating, dtype: float64


##### Suggest five movies to a new user that are most popular and highly rated from different genres.

In [5]:
# Determine the most popular movies based on the number of ratings
popular_movies = df.groupby('Title').size().sort_values(ascending=False)

# Identify the top-rated movies based on average rating
top_rated_movies = df.groupby('Title')['Rating'].mean().sort_values(ascending=False)

# Filter the data to only include movies with genres in top_5_genres
top_5_genres = genre_ratings.index
genre_movies = df[df['Genre'].isin(top_5_genres)].groupby('Title').size().sort_values(ascending=False)

# Filter the top-rated movies to only include titles that exist in the popular_movies
top_rated_in_popular = top_rated_movies[top_rated_movies.index.isin(popular_movies.index)]

# Filter the top-rated movies (that are also popular) to only include titles that exist in genre_movies
movies_suggestions = top_rated_in_popular[top_rated_in_popular.index.isin(genre_movies.index)]

# Get the five movies that are popular, highly rated, and in preferred of genres.
print("Five Movies Suggestions:\n", movies_suggestions.head(5))

# Based on the analysis, we will suggest five movies that are popular, highly rated, and in preferred of genres.
# These suggestions are based on their popularity, ratings, and genre, ensuring a well-rounded selection for a new user.

Five Movies Suggestions:
 Title
Blue Light, The (Das Blaue Licht) (1932)                 5.00
Constantine's Sword (2007)                               4.75
More (1998)                                              4.75
Human Condition II, The (Ningen no joken II) (1959)      4.75
Human Condition III, The (Ningen no joken III) (1961)    4.75
Name: Rating, dtype: float64


In [14]:
# Find the maximum UserID and increase it by one
new_user_id = ratings_df['UserID'].max() + 1

# List of new ratings
new_ratings_list = [
    {'Title': 'Judge Dredd', 'Rating': 5},
    {'Title': 'Waterworld', 'Rating': 5},
    {'Title': 'Screamers', 'Rating': 4},
    {'Title': 'Jumanji', 'Rating': 3},
]

# Initialize an empty list to store new ratings and new movie entries
new_ratings = []
new_movies = []

# Check if each movie title exists in the movies DataFrame
for rating in new_ratings_list:
    movie_title = rating['Title']
    movie_row = movies_df[movies_df['Title'] == movie_title]
    if movie_row.empty:
        new_movie_id = movies_df['MovieID'].max() + 1
        new_movies.append({'MovieID': new_movie_id, 'Title': movie_title, 'Genres': ''})
        new_ratings.append({'UserID': new_user_id, 'MovieID': new_movie_id, 'Rating': rating['Rating'], 'Timestamp': pd.Timestamp.now().timestamp()})
    else:
        new_ratings.append({'UserID': new_user_id, 'MovieID': movie_row['MovieID'].values[0], 'Rating': rating['Rating'], 'Timestamp': pd.Timestamp.now().timestamp()})

movies_df = pd.concat([movies_df, pd.DataFrame(new_movies)], ignore_index=True)

# Convert new ratings to DataFrame and add new ratings to the ratings table
ratings_df = pd.concat([ratings_df, pd.DataFrame(new_ratings)], ignore_index=True)
print(ratings_df[ratings_df['UserID'] == new_user_id])

#Find Maximum UserID: Identify the maximum UserID and increase it by one to get the new UserID.
#New User's Ratings: Define the new user's ratings for the specified movies.
#Add to Ratings Table: Convert the new ratings to a DataFrame and concatenate it with the existing ratings table.

          UserID  MovieID  Rating     Timestamp
10000058   71569    65134     5.0  1.743005e+09
10000059   71569    65134     5.0  1.743005e+09
10000060   71569    65134     4.0  1.743005e+09
10000061   71569    65134     3.0  1.743005e+09
