In [31]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Read every CSV table of the dataset into its own DataFrame.
# low_memory=False asks pandas to parse each file in a single pass instead of
# in chunks, which avoids mixed-dtype inference on large columns.
(
    crew_df,
    actors_df,
    genres_df,
    languages_df,
    movies_df,
    posters_df,
    ratings_df,
) = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../dataset/crew.csv',
        '../dataset/actors.csv',
        '../dataset/genres.csv',
        '../dataset/languages.csv',
        '../dataset/movies.csv',
        '../dataset/posters.csv',
        '../dataset/ratings.csv',
    )
)

In [36]:
# Inspect which columns movies_df provides.
print(movies_df.columns)

Index(['id', 'name', 'date', 'tagline', 'description', 'minute', 'rating'], dtype='object')


# Ratings-Based Recommender

In [None]:
# Ratings-Based Recommendation
# Fixed seed so that the train/test split and the SVD factor initialisation
# are reproducible across "Restart Kernel -> Run All".
RANDOM_SEED = 42

# Load and prepare data: the Reader is given the rating range, taken here from
# the observed minimum and maximum of ratings_df['rating'].
reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split data (20% held out as the test set) and train the SVD model
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)
svd = SVD(random_state=RANDOM_SEED)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x140884d10>

In [52]:
import ast

user_id = 3
TOP_N = 5  # number of recommendations to collect and print

# movies_df stores the title in 'name' (see the printed column list); it has no
# 'title', 'genres' or 'poster_path' column, so genre, cast and poster details
# are looked up in genres_df, actors_df and posters_df instead.
TITLE_COLUMN = 'name'
# Candidate column names, tried in order: the name the original code used
# first, then a fallback.
# NOTE(review): the schemas of genres_df, actors_df and posters_df are not
# shown in this notebook, so the fallback names are assumptions -- confirm with
# `<frame>.columns`. If no candidate exists the detail is left empty.
GENRE_COLUMNS = ('genres', 'genre')
CAST_COLUMNS = ('cast', 'name')
POSTER_COLUMNS = ('poster_path', 'link')


def _values_by_movie(frame, movie_keys, candidates):
    """Collect the non-null values of one detail column for the given movies.

    Args:
        frame: DataFrame with an 'id' column identifying the movie.
        movie_keys: movie ids as strings.
        candidates: column names to try; the first one present in `frame` is used.

    Returns:
        dict mapping str(movie id) -> list of values. Movies without matching
        rows are absent; the dict is empty when no candidate column exists.
    """
    column = next((name for name in candidates if name in frame.columns), None)
    if column is None or 'id' not in frame.columns or not movie_keys:
        return {}
    # Compare ids as strings so integer and string id columns both match.
    keys = frame['id'].astype(str)
    mask = keys.isin(movie_keys) & frame[column].notna()
    values = {}
    for key, value in zip(keys[mask], frame.loc[mask, column]):
        values.setdefault(key, []).append(value)
    return values


def _genre_names(raw_genres):
    """Flatten raw genre values into a flat list of genre names.

    Accepts plain names as well as stringified lists of {'name': ...} dicts,
    which is the format the original parsing code expected.
    """
    names = []
    for raw in raw_genres:
        try:
            parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
        except (ValueError, SyntaxError):
            parsed = raw  # not a Python literal: treat it as a plain genre name
        if isinstance(parsed, (list, tuple)):
            names.extend(
                item.get('name', item) if isinstance(item, dict) else item
                for item in parsed
            )
        else:
            names.append(parsed)
    return names


if not ratings_df.empty:
    # Get all movie IDs in the dataset
    all_movie_ids = ratings_df['movieId'].unique()

    # Filter out movies the user has already rated (set membership keeps this linear)
    rated_movies = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId']
    rated_movie_ids = set(rated_movies)
    unrated_movies = [movie for movie in all_movie_ids if movie not in rated_movie_ids]

    # Predict ratings for each unrated movie
    predictions = [(movie_id, svd.predict(user_id, movie_id).est) for movie_id in unrated_movies]
    sorted_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

    predicted_ratings_df = pd.DataFrame(predictions, columns=['movie_id', 'rating']).set_index('movie_id')

    # Build the id -> title lookup once instead of re-filtering movies_df for
    # every candidate; rows without a title are dropped, first duplicate wins.
    title_by_id = pd.Series(
        movies_df[TITLE_COLUMN].to_numpy(), index=movies_df['id'].astype(str)
    ).dropna()
    title_by_id = title_by_id[~title_by_id.index.duplicated(keep='first')]

    # Walk the predictions best-first and keep the first TOP_N with a known title
    top_candidates = []
    for movie_id, predicted_rating in sorted_predictions:
        movie_key = str(movie_id)
        if movie_key in title_by_id.index:
            top_candidates.append((movie_id, movie_key, title_by_id.at[movie_key], predicted_rating))
        if len(top_candidates) == TOP_N:
            break

    # Fetch details only for the selected movies: one pass over each large frame
    top_keys = [movie_key for _, movie_key, _, _ in top_candidates]
    genres_by_movie = _values_by_movie(genres_df, top_keys, GENRE_COLUMNS)
    cast_by_movie = _values_by_movie(actors_df, top_keys, CAST_COLUMNS)
    posters_by_movie = _values_by_movie(posters_df, top_keys, POSTER_COLUMNS)

    top_5_recommendations = [
        (
            movie_id,
            title,
            predicted_rating,
            _genre_names(genres_by_movie.get(movie_key, [])),
            cast_by_movie.get(movie_key, []),
            posters_by_movie.get(movie_key, []),
        )
        for movie_id, movie_key, title, predicted_rating in top_candidates
    ]

    # Display recommended movie IDs, titles, and predicted ratings
    print("Top 5 Recommended Movies (movie_id, title, predicted_rating):")
    for movie_id, title, predicted_rating, genres, cast, poster in top_5_recommendations:
        print(f"Movie ID: {movie_id}, Movie Name: {title}, Predicted Rating: {round(predicted_rating, 2)}, Genres: {genres}")
        print(f"Cast: {cast}, Poster: {poster}")
else:
    # No ratings at all: fall back to a random sample of movies. The sample
    # size is capped so a catalogue with fewer than TOP_N rows does not raise.
    fallback_movies = movies_df.sample(n=min(TOP_N, len(movies_df)))
    top_5_recommendations = fallback_movies.values.tolist()
    # Address columns by name rather than by position in the row list.
    for _, movie in fallback_movies.iterrows():
        print(f"Movie ID: {movie['id']}, Title: {movie['name']}, Rating: {movie['rating']}")

Movie ID: 1179808, Title: Accidents, Blunders and Calamities, Rating: nan
Movie ID: 1043156, Title: Outfoxed: Rupert Murdoch's War on Journalism, Rating: 3.36
Movie ID: 1442438, Title: Il Diario di Sisifo, Rating: nan
Movie ID: 1128365, Title: REM, Rating: nan
Movie ID: 1515400, Title: Sweet Tooth, Rating: nan


# Combining Collaborative and Content-Based Scores

In [6]:
from sklearn.preprocessing import StandardScaler

# Put the three content-based similarity matrices on a comparable scale before
# mixing them. A single scaler is re-fitted on each matrix in turn, so after
# this cell `scaler` holds the statistics of the last matrix only.
# NOTE(review): genre_similarity, keyword_similarity and actor_similarity are
# not defined in the visible cells (the recorded output is a NameError) --
# they must be computed before this cell can run.
scaler = StandardScaler()
genre_similarity_scaled, keyword_similarity_scaled, actor_similarity_scaled = (
    scaler.fit_transform(similarity_matrix)
    for similarity_matrix in (genre_similarity, keyword_similarity, actor_similarity)
)

# Contribution of each signal to the hybrid score (the weights sum to 1.0)
weight_collab = 0.5   # collaborative-filtering prediction
weight_genre = 0.2    # genre similarity
weight_keyword = 0.2  # keyword similarity
weight_actor = 0.1    # actor similarity

# Create a final recommendation score for each movie by combining weighted scores
def hybrid_recommend(user_id, top_n=5):
    """Print the `top_n` movies with the highest weighted hybrid score.

    The score is the weighted sum of the collaborative-filtering predictions
    in ``predicted_ratings_df['rating']`` and row ``user_id`` of each scaled
    similarity matrix, using the notebook-level ``weight_*`` values.

    NOTE(review): ``predicted_ratings_df`` is read as-is; this function does
    not recompute predictions for the ``user_id`` passed in -- confirm that
    the frame was built for the same user.
    NOTE(review): ``user_id`` is also used as a row index into the similarity
    matrices; presumably those rows correspond to movies, not users -- verify.
    NOTE(review): assumes the predictions, the similarity rows and the rows of
    ``movies_metadata_df`` have the same length and positional order; that
    frame is not defined in the visible cells.

    Args:
        user_id: row index into the scaled similarity matrices.
        top_n: number of movies to print.

    Returns:
        None. The recommendations are printed.
    """
    # Retrieve collaborative filtering predictions for this user
    user_predictions = predicted_ratings_df['rating']

    # Compute weighted hybrid scores for each movie
    hybrid_scores = (weight_collab * user_predictions +
                     weight_genre * genre_similarity_scaled[user_id] +
                     weight_keyword * keyword_similarity_scaled[user_id] +
                     weight_actor * actor_similarity_scaled[user_id])

    # Work on a plain array: hybrid_scores is a Series labelled by movie_id,
    # while rows are selected by position below, so looking a score up by the
    # metadata frame's label would return the wrong entry (or raise KeyError).
    score_values = np.asarray(hybrid_scores, dtype=float)

    # Sort descending by negating the scores; NaN scores sort last this way
    # instead of landing at the top of a reversed ascending order.
    top_movie_indices = np.argsort(-score_values)[:top_n]
    top_movies = movies_metadata_df.iloc[top_movie_indices]

    # Display the recommendations
    print("Top Recommended Movies:")
    for position, (_, row) in zip(top_movie_indices, top_movies.iterrows()):
        print(f"Title: {row['title']}, Hybrid Score: {score_values[position]}")

# Example: print the top 5 hybrid recommendations for user_id=1
hybrid_recommend(user_id=1, top_n=5)

NameError: name 'genre_similarity' is not defined