In [1]:
from copy import deepcopy
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Only the ratings and the movie catalogue are loaded in this notebook;
# links.csv and tags.csv are deliberately left unread.
ratings_df = pd.read_csv('Dataset/ratings.csv')
movies_df = pd.read_csv('Dataset/movies.csv')


In [3]:
# Keep a single rating per (user, movie) pair: order each pair's ratings
# newest-first, retain the first row of every pair, then discard the timestamp.
ratings_df = (
    ratings_df
    .sort_values(by=['userId', 'movieId', 'timestamp'], ascending=[True, True, False])
    .drop_duplicates(subset=['userId', 'movieId'], keep='first')
    .drop(columns=['timestamp'])
)

In [4]:
# tags_df_altered = deepcopy(tags_df)
# tags_df_altered['tag'] = tags_df_altered['tag'].str.upper()
# tags_df_altered = tags_df_altered.groupby('movieId')['tag'].apply(lambda x: '|'.join(pd.unique(x))).reset_index()
# tags_df_altered.columns = ['movieId', 'tags']

In [5]:
# Preview the first four rows of the movie catalogue.
movies_df.head(n=4)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [6]:
# Preview the first four rows of the de-duplicated ratings.
ratings_df.head(n=4)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0


In [7]:
# Attach each movie's genre string to its ratings (inner join on movieId);
# the title column is dropped because nothing below reads it.
df_final = (
    ratings_df
    .merge(movies_df, on="movieId", how="inner")
    .drop(columns=['title'])
)

df_final.head(n=4)

Unnamed: 0,userId,movieId,rating,genres
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Comedy|Romance
2,1,6,4.0,Action|Crime|Thriller
3,1,47,5.0,Mystery|Thriller


In [8]:
# Per-movie mean rating (a plain, unweighted mean despite the column name),
# restricted to movies whose mean is at least 3.5.
weighted_avg = (
    df_final
    .groupby('movieId')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'weighted_average_rating'})
)
weighted_avg = weighted_avg.loc[weighted_avg['weighted_average_rating'] >= 3.5]

In [9]:
# Work on an independent deep copy so later cells cannot alter df_final.
df = df_final.copy(deep=True)

In [None]:
# Hold out 30% of the rating rows; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.3)

# User x movie rating table over every rating; pairs without a rating become 0.
user_item_matrix = df.pivot(columns="movieId", index="userId", values="rating").fillna(0)

# The same kind of table built from the training rows only.
train_matrix = train_data.pivot(columns="movieId", index="userId", values="rating").fillna(0)

# Column / row labels of the training table, used later to translate
# movie and user ids into matrix positions.
movie_ids = train_matrix.columns
user_ids = train_matrix.index

In [11]:
from scipy.sparse.linalg import svds

# Matrix factorization using Singular Value Decomposition (SVD)
def matrix_factorization(matrix, num_features=10):
    """Truncated SVD of ``matrix`` keeping ``num_features`` singular triplets.

    Args:
        matrix: 2-D array (or anything ``scipy.sparse.linalg.svds`` accepts).
        num_features: number of singular values/vectors to compute (``k``).

    Returns:
        ``(U, sigma, Vt)`` where ``sigma`` is the square diagonal matrix built
        from the singular values, so ``U @ sigma @ Vt`` is the rank-``k``
        approximation of ``matrix``.
    """
    left_vectors, singular_values, right_vectors_t = svds(matrix, k=num_features)
    return left_vectors, np.diag(singular_values), right_vectors_t

# Factorise the training matrix, then multiply the factors back together to
# obtain a dense low-rank score for every (user, movie) cell.
U, sigma, Vt = matrix_factorization(train_matrix.to_numpy())
predicted_ratings = (U @ sigma) @ Vt

In [12]:
from scipy.sparse import csr_matrix

# Optimized Genre Similarity Calculation
def calculate_genre_similarity(df):
    """Build a sparse genre count matrix with one row per row of ``df``.

    Despite its name, this function does not compute any similarity: it only
    vectorises the pipe-separated ``genres`` strings. Similarities are derived
    from the returned matrix elsewhere.

    Args:
        df: DataFrame with a ``genres`` column of ``'A|B|C'``-style strings.

    Returns:
        A ``scipy.sparse.csr_matrix`` whose i-th row holds the genre token
        counts of the i-th row of ``df`` (positional alignment).
    """
    count_vectorizer = CountVectorizer(
        tokenizer=lambda genres: genres.split('|'),
        # A custom tokenizer makes the default token_pattern unused; setting it
        # to None avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
    )
    genre_matrix = count_vectorizer.fit_transform(df['genres'])
    return csr_matrix(genre_matrix)

# Vectorise the genres of every row of df into a sparse matrix.
sparse_genre_matrix = calculate_genre_similarity(df=df)



In [13]:
# Function to calculate similarity for specific rows on-demand
def get_genre_similarity(sparse_matrix, target_index):
    """Cosine similarity between the selected row(s) and every row of ``sparse_matrix``.

    Args:
        sparse_matrix: matrix whose rows are the vectors to compare.
        target_index: row selector applied as ``sparse_matrix[target_index]``.

    Returns:
        The similarity values flattened into a 1-D array.
    """
    scores = cosine_similarity(sparse_matrix[target_index], sparse_matrix)
    return scores.flatten()

In [14]:
# # Hybrid Recommendation Function
# def hybrid_recommend_movies(user_id, df, user_item_matrix, predicted_ratings, sparse_genre_matrix, num_recommendations=5):
#     user_idx = np.where(user_ids == user_id)[0][0]

#     # Predicted collaborative filtering scores for the user
#     predicted_user_ratings = predicted_ratings[user_idx]

#     # Get movies the user has already rated
#     rated_movies = df[df["userId"] == user_id]["movieId"].values

#     # Filter out movies the user has already rated
#     unrated_movies = [movie for movie in movie_ids if movie not in rated_movies]
    
#     # Filter out the movies with low average score
#     unrated_movies = weighted_avg [weighted_avg ['movieId'].isin(unrated_movies)]['movieId'].tolist()

#     recommendations = []
#     for movie_id in unrated_movies:
#         # Collaborative filtering score
#         movie_idx = np.where(movie_ids == movie_id)[0][0]
#         predicted_rating = predicted_user_ratings[movie_idx]

#         # Content-based score (average similarity with rated movies)
#         rated_movie_indices = [df[df["movieId"] == movie].index[0] for movie in rated_movies]
#         current_movie_index = df[df["movieId"] == movie_id].index[0]
#         genre_similarities = get_genre_similarity(sparse_genre_matrix, current_movie_index)
#         content_score = np.mean([genre_similarities[i] for i in rated_movie_indices])

#         # Combine scores (weighted hybrid)
#         hybrid_score = 0.7 * predicted_rating + 0.3 * content_score
#         recommendations.append((movie_id, hybrid_score))

#     # Sort by hybrid score
#     recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

#     # Get the top N recommendations
#     top_recommendations = recommendations[:num_recommendations]
#     return top_recommendations

# # Example usage: Recommend movies for user 1
# recommendations = hybrid_recommend_movies(1, df, user_item_matrix, predicted_ratings, sparse_genre_matrix)

# print("Top Recommendations for User 1:")
# for movie_id, hybrid_score in recommendations:
#     print(f"Movie ID: {movie_id}, Hybrid Score: {hybrid_score:.2f}")

In [15]:
# import numpy as np

# def hybrid_recommend_movies(user_id, df, user_item_matrix, predicted_ratings, sparse_genre_matrix, num_recommendations=5):
#     user_idx = np.where(user_ids == user_id)[0][0]
#     predicted_user_ratings = predicted_ratings[user_idx]
    
#     # Get movie indices
#     movie_ids_np = np.array(movie_ids)  # Convert to numpy for faster indexing
#     unrated_movies_mask = ~np.isin(movie_ids_np, df[df["userId"] == user_id]["movieId"].values)
#     unrated_movie_ids = movie_ids_np[unrated_movies_mask]

#     # Collaborative filtering score (predicted ratings for unrated movies)
#     predicted_ratings_unrated = predicted_user_ratings[np.isin(movie_ids_np, unrated_movie_ids)]

#     # Calculate content-based score for all unrated movies
#     genre_similarities = np.array([
#         get_genre_similarity(sparse_genre_matrix, df[df["movieId"] == movie_id].index[0])
#         for movie_id in unrated_movie_ids
#     ])
    
#     # Get genre similarities for all rated movies
#     rated_movie_indices = [df[df["movieId"] == movie].index[0] for movie in df[df["userId"] == user_id]["movieId"].values]
    
#     # Compute the average genre similarity for each unrated movie against the rated ones
#     content_scores = np.mean(genre_similarities[:, rated_movie_indices], axis=1)
    
#     # Combine scores (weighted hybrid)
#     hybrid_scores = 0.7 * predicted_ratings_unrated + 0.3 * content_scores

#     # Combine movie ids and hybrid scores, sort by score, and get top recommendations
#     recommendations = list(zip(unrated_movie_ids, hybrid_scores))
#     recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

#     # Return the top N recommendations
#     return recommendations[:num_recommendations]

# # Example usage: Recommend movies for user 1
# recommendations = hybrid_recommend_movies(1, df, user_item_matrix, predicted_ratings, sparse_genre_matrix)
# for movie_id, hybrid_score in recommendations:
#     print(f"Movie ID: {movie_id}, Hybrid Score: {hybrid_score:.2f}")


In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed

def _genre_unit_vectors(sparse_genre_matrix, catalogue, first_rows, wanted_movie_ids):
    """Return dense, L2-normalised genre vectors, one row per wanted movie id."""
    wanted_movie_ids = np.asarray(wanted_movie_ids)
    slots = catalogue.get_indexer(wanted_movie_ids)
    if (slots < 0).any():
        missing = wanted_movie_ids[slots < 0]
        # IndexError matches what the previous `.index[0]` lookup raised.
        raise IndexError(f"movieId(s) {missing[:5].tolist()} not present in df")
    rows = sparse_genre_matrix[first_rows[slots]]
    dense = rows.toarray() if hasattr(rows, "toarray") else np.asarray(rows)
    dense = dense.astype(float)
    norms = np.linalg.norm(dense, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero genre rows stay zero instead of dividing by 0
    return dense / norms


def hybrid_recommend_movies(user_id, df, predicted_ratings, sparse_genre_matrix, num_recommendations=5, n_jobs=-1,
                            user_ids=None, movie_ids=None, cf_weight=0.5):
    """Rank a user's unrated movies by a blend of collaborative and genre scores.

    For every movie in ``movie_ids`` that the user has not rated in ``df``:

    * collaborative score: the user's entry in ``predicted_ratings``;
    * content score: mean cosine similarity between the movie's genre vector
      and the genre vectors of the movies the user has rated (0 when the user
      has rated nothing).

    The two scores are combined as
    ``cf_weight * collaborative + (1 - cf_weight) * content``. Note that they
    are on different scales (rating-like values vs. cosine in [0, 1]); tune
    ``cf_weight`` accordingly.

    Only the genre rows that are actually needed are read, so no
    (movies x rating-rows) similarity array is materialised.

    Args:
        user_id: id of the user to recommend for.
        df: ratings frame with ``userId`` and ``movieId`` columns; row ``i`` of
            ``sparse_genre_matrix`` must describe row ``i`` (by position) of ``df``.
        predicted_ratings: 2-D array indexed ``[user position, movie position]``,
            aligned with ``user_ids`` / ``movie_ids``.
        sparse_genre_matrix: genre count matrix (sparse or dense), one row per
            row of ``df``.
        num_recommendations: number of (movie, score) pairs to return.
        n_jobs: unused; kept so existing calls that pass it keep working.
        user_ids: labels of the rows of ``predicted_ratings``. Defaults to the
            module-level ``user_ids``.
        movie_ids: labels of the columns of ``predicted_ratings``. Defaults to
            the module-level ``movie_ids``.
        cf_weight: weight of the collaborative score in the blend.

    Returns:
        List of ``(movie_id, hybrid_score)`` tuples, best first; empty when the
        user has no unrated movie.

    Raises:
        IndexError: if ``user_id`` is not in ``user_ids`` or a required movie
            id does not occur in ``df``.
    """
    # Fall back to the notebook-level labels when they are not passed in.
    if user_ids is None:
        user_ids = globals()["user_ids"]
    if movie_ids is None:
        movie_ids = globals()["movie_ids"]
    user_ids_np = np.asarray(user_ids)
    movie_ids_np = np.asarray(movie_ids)

    # Locate the user's row in the prediction matrix.
    user_hits = np.flatnonzero(user_ids_np == user_id)
    if user_hits.size == 0:
        raise IndexError(f"user_id {user_id!r} not found in user_ids")
    predicted_user_ratings = np.asarray(predicted_ratings)[user_hits[0]]

    # Split the candidate movies into already-rated and unrated.
    rated_movies = df.loc[df["userId"] == user_id, "movieId"].to_numpy()
    unrated_mask = ~np.isin(movie_ids_np, rated_movies)
    unrated_movie_ids = movie_ids_np[unrated_mask]
    if unrated_movie_ids.size == 0:
        return []
    predicted_ratings_unrated = predicted_user_ratings[unrated_mask]

    # First *positional* row of each distinct movie in df; positions (not index
    # labels) are what address rows of the genre matrix.
    catalogue_ids, first_rows = np.unique(df["movieId"].to_numpy(), return_index=True)
    catalogue = pd.Index(catalogue_ids)

    unrated_vectors = _genre_unit_vectors(sparse_genre_matrix, catalogue, first_rows, unrated_movie_ids)
    if rated_movies.size:
        rated_vectors = _genre_unit_vectors(sparse_genre_matrix, catalogue, first_rows, rated_movies)
        # Mean of cosine similarities == dot product with the mean unit vector.
        content_scores = unrated_vectors @ rated_vectors.mean(axis=0)
    else:
        content_scores = np.zeros(unrated_movie_ids.size)

    hybrid_scores = cf_weight * predicted_ratings_unrated + (1 - cf_weight) * content_scores

    # Stable descending order, so ties keep their movie_ids order.
    ranking = np.argsort(-hybrid_scores, kind="stable")[:num_recommendations]
    return list(zip(unrated_movie_ids[ranking], hybrid_scores[ranking]))

# Demo: print the top hybrid recommendations for user 1.
recommendations = hybrid_recommend_movies(
    user_id=1,
    df=df,
    predicted_ratings=predicted_ratings,
    sparse_genre_matrix=sparse_genre_matrix,
)
for movie_id, hybrid_score in recommendations:
    print(f"Movie ID: {movie_id}, Hybrid Score: {hybrid_score:.2f}")


Movie ID: 589, Hybrid Score: 2.34
Movie ID: 858, Hybrid Score: 2.02
Movie ID: 1200, Hybrid Score: 1.85
Movie ID: 318, Hybrid Score: 1.82
Movie ID: 1221, Hybrid Score: 1.80
