In [2]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

credits_df = pd.read_csv('../dataset/credits.csv', low_memory=False)
keywords_df = pd.read_csv('../dataset/keywords.csv', low_memory=False)
links_df = pd.read_csv('../dataset/links.csv', low_memory=False)
links_small_df = pd.read_csv('../dataset/links_small.csv', low_memory=False)
movies_metadata_df = pd.read_csv('../dataset/movies_metadata.csv', low_memory=False)
ratings_df = pd.read_csv('../dataset/ratings.csv', low_memory=False)
ratings_small_df = pd.read_csv('../dataset/ratings_small.csv', low_memory=False)

In [25]:
print(credits_df.columns)
print(movies_metadata_df[movies_metadata_df['id'] == '862']['title'])
print(credits_df.head()['id'])
print(credits_df[credits_df['id'] == 862])

Index(['cast', 'crew', 'id'], dtype='object')
0    Toy Story
Name: title, dtype: object
0      862
1     8844
2    15602
3    31357
4    11862
Name: id, dtype: int64
                                                cast  \
0  [{'cast_id': 14, 'character': 'Woody (voice)',...   

                                                crew   id  
0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...  862  


# Ratings Based Recommender

In [4]:
# Ratings-Based Recommendation
# Load and prepare data
reader = Reader(rating_scale=(ratings_small_df['rating'].min(), ratings_small_df['rating'].max()))
data = Dataset.load_from_df(ratings_small_df[['userId', 'movieId', 'rating']], reader)

# Split data and train SVD model
trainset, testset = train_test_split(data, test_size=0.2)
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x140884d10>

In [27]:
import ast
import ast
user_id = 3

# Get all movie IDs in the dataset
all_movie_ids = ratings_small_df['movieId'].unique()

# Filter out movies the user has already rated
rated_movies = ratings_small_df[ratings_small_df['userId'] == user_id]['movieId']
unrated_movies = [movie for movie in all_movie_ids if movie not in rated_movies.values]

# Predict ratings for each unrated movie
predictions = [(movie_id, svd.predict(user_id, movie_id).est) for movie_id in unrated_movies]
sorted_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

predicted_ratings_df = pd.DataFrame(predictions, columns=['movie_id', 'rating']).set_index('movie_id')

# Step 5: Get Top 5 Recommendations with Valid Titles
top_5_recommendations = []
for movie_id, predicted_rating in sorted_predictions:
    # Retrieve title
    title = movies_metadata_df[movies_metadata_df['id'] == str(movie_id)]['title']
    genre = movies_metadata_df[movies_metadata_df['id'] == str(movie_id)]['genres']
    cast = credits_df[credits_df['id'] == movie_id]['cast']

    names = []
    for g in genre:
        g = ast.literal_eval(g)
        for n in g:
            names.append(n['name'])
    
    # Check if the title exists and is not empty
    if not title.empty:
        top_5_recommendations.append((movie_id, title.values[0], predicted_rating, names, cast))
    
    # Stop once we have 5 valid recommendations
    if len(top_5_recommendations) == 5:
        break

# Display recommended movie IDs, titles, and predicted ratings
print("Top 5 Recommended Movies (movie_id, title, predicted_rating):")
for movie_id, title, predicted_rating, genres, cast in top_5_recommendations:
    print(f"Movie ID: {movie_id}, Movie Name: {title}, Predicted Rating: {round(predicted_rating, 2)}, Genres: {genres}")
    print(f"Cast: {cast}")

Top 5 Recommended Movies (movie_id, title, predicted_rating):
Movie ID: 858, Movie Name: Sleepless in Seattle, Predicted Rating: 4.3, Genres: ['Comedy', 'Drama', 'Romance']
Cast: 534    [{'cast_id': 13, 'character': 'Sam Baldwin', '...
Name: cast, dtype: object
Movie ID: 745, Movie Name: The Sixth Sense, Predicted Rating: 4.16, Genres: ['Mystery', 'Thriller', 'Drama']
Cast: 2647    [{'cast_id': 9, 'character': 'Malcolm Crowe', ...
Name: cast, dtype: object
Movie ID: 2064, Movie Name: While You Were Sleeping, Predicted Rating: 4.1, Genres: ['Comedy', 'Drama', 'Romance']
Cast: 334    [{'cast_id': 9, 'character': 'Lucy Eleanor Mod...
Name: cast, dtype: object
Movie ID: 922, Movie Name: Dead Man, Predicted Rating: 4.07, Genres: ['Drama', 'Fantasy', 'Western']
Cast: 700    [{'cast_id': 22, 'character': 'William Blake',...
Name: cast, dtype: object
Movie ID: 111, Movie Name: Scarface, Predicted Rating: 3.99, Genres: ['Action', 'Crime', 'Drama', 'Thriller']
Cast: 4135    [{'cast_id': 9, 'char

# Combining

In [6]:
from sklearn.preprocessing import StandardScaler
# Standardize the content-based similarity scores for easier combination
scaler = StandardScaler()
genre_similarity_scaled = scaler.fit_transform(genre_similarity)
keyword_similarity_scaled = scaler.fit_transform(keyword_similarity)
actor_similarity_scaled = scaler.fit_transform(actor_similarity)

# Define weights for each component
weight_collab = 0.5
weight_genre = 0.2
weight_keyword = 0.2
weight_actor = 0.1

# Create a final recommendation score for each movie by combining weighted scores
def hybrid_recommend(user_id, top_n=5):
    # Retrieve collaborative filtering predictions for this user
    user_predictions = predicted_ratings_df['rating']
    
    # Compute weighted hybrid scores for each movie
    hybrid_scores = (weight_collab * user_predictions +
                     weight_genre * genre_similarity_scaled[user_id] +
                     weight_keyword * keyword_similarity_scaled[user_id] +
                     weight_actor * actor_similarity_scaled[user_id])
    
    # Sort and get the top N recommended movie IDs
    top_movie_indices = np.argsort(hybrid_scores)[::-1][:top_n]
    top_movies = movies_metadata_df.iloc[top_movie_indices]
    
    # Display the recommendations
    print("Top Recommended Movies:")
    for index, row in top_movies.iterrows():
        print(f"Title: {row['title']}, Hybrid Score: {hybrid_scores[index]}")

# Example recommendation for a specific user
hybrid_recommend(user_id=1, top_n=5)

NameError: name 'genre_similarity' is not defined