In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

# Download NLTK data for sentiment analysis
nltk.download('vader_lexicon')

# Set data path
data_path = r'C:\Users\jahna\Movie_Recommendations\data\movielens-20m-dataset'

# Load datasets
movies = pd.read_csv(os.path.join(data_path, 'movie.csv'))
ratings = pd.read_csv(os.path.join(data_path, 'rating.csv'))
tags = pd.read_csv(os.path.join(data_path, 'tag.csv'))
genome_scores = pd.read_csv(os.path.join(data_path, 'genome_scores.csv'))
genome_tags = pd.read_csv(os.path.join(data_path, 'genome_tags.csv'))

# Sample a subset for faster processing (optional, remove for full dataset)
ratings = ratings.sample(n=1000000, random_state=42)  # Sample 1M ratings
movies = movies[movies['movieId'].isin(ratings['movieId'].unique())]
tags = tags[tags['movieId'].isin(ratings['movieId'].unique())]
genome_scores = genome_scores[genome_scores['movieId'].isin(ratings['movieId'].unique())]

# Clean data
# Remove movies with missing titles
movies = movies.dropna(subset=['title'])

# Process genres
movies['genres'] = movies['genres'].replace('|', ' ', regex=True)
movies['genres'] = movies['genres'].replace('(no genres listed)', '')

# Process tags
# Aggregate user tags per movie
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies = movies.merge(movie_tags, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres and tags for content-based filtering
movies['content'] = movies['genres'] + ' ' + movies['tag']

# Merge ratings with movie titles
ratings = ratings.merge(movies[['movieId', 'title', 'content']], left_on='movieId', right_on='movieId', how='left')

# Display data
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
print("Tags shape:", tags.shape)
print("Genome scores shape:", genome_scores.shape)
print(ratings.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jahna\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Ratings shape: (1000000, 6)
Movies shape: (15374, 5)
Tags shape: (446487, 4)
Genome scores shape: (11489808, 3)
   userId  movieId  rating            timestamp  \
0  122270     8360     3.5  2012-04-22 01:07:04   
1   49018       32     2.0  2001-09-11 07:50:36   
2   89527   109374     3.5  2015-01-06 09:26:40   
3  106704     1060     3.0  2000-01-22 21:27:57   
4   47791     1732     2.0  2006-01-19 15:48:23   

                                       title  \
0                             Shrek 2 (2004)   
1  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
2           Grand Budapest Hotel, The (2014)   
3                            Swingers (1996)   
4                   Big Lebowski, The (1998)   

                                             content  
0   A d v e n t u r e | A n i m a t i o n | C h i...  
1   M y s t e r y | S c i - F i | T h r i l l e r...  
2   C o m e d y | D r a m a  amazing storytelling...  
3   C o m e d y | D r a m a  funny Vince Vaughn f...  
4   C o m e d y |

In [5]:
# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
svd = SVD(n_factors=100, random_state=42)
svd.fit(trainset)

# Function to get collaborative filtering predictions
def get_collaborative_recommendations(user_id, n=5):
    # Get all movie IDs
    movie_ids = movies['movieId'].unique()
    # Predict ratings for all movies
    predictions = [svd.predict(user_id, movie_id) for movie_id in movie_ids]
    # Sort by predicted rating
    predictions.sort(key=lambda x: x.est, reverse=True)
    # Get top N movie IDs
    top_movie_ids = [pred.iid for pred in predictions[:n]]
    # Get movie titles
    top_movies = movies[movies['movieId'].isin(top_movie_ids)][['movieId', 'title', 'content']]
    return top_movies

# Test collaborative filtering
user_id = ratings['userId'].iloc[0]  # Example user
collab_recs = get_collaborative_recommendations(user_id)
print(f"Collaborative Filtering Recommendations for User {user_id}:")
print(collab_recs)

Collaborative Filtering Recommendations for User 122270:
      movieId                             title  \
312       318  Shawshank Redemption, The (1994)   
1029     1069           Murder, My Sweet (1944)   
1101     1147         When We Were Kings (1996)   
1171     1221    Godfather: Part II, The (1974)   
2426     2571                Matrix, The (1999)   

                                                content  
312    C r i m e | D r a m a  friendship masterplan ...  
1029   C r i m e | F i l m - N o i r | T h r i l l e...  
1101   D o c u m e n t a r y  character based on rea...  
1171   C r i m e | D r a m a  complex characters maf...  
2426   A c t i o n | S c i - F i | T h r i l l e r  ...  


In [6]:
# Create TF-IDF matrix for content (genres + tags)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies['content'])

# Compute cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get content-based recommendations
def get_content_recommendations(title, n=5):
    # Get movie index
    idx = movies[movies['title'] == title].index
    if len(idx) == 0:
        return pd.DataFrame()
    idx = idx[0]
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get top N similar movies
    sim_scores = sim_scores[1:n+1]  # Exclude the movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['movieId', 'title', 'content']]

# Test content-based filtering
movie_title = movies['title'].iloc[0]  # Example movie
content_recs = get_content_recommendations(movie_title)
print(f"Content-Based Recommendations for {movie_title}:")
print(content_recs)

Content-Based Recommendations for Toy Story (1995):
      movieId                  title  \
2954     3114     Toy Story 2 (1999)   
2214     2355   Bug's Life, A (1998)   
4650     4886  Monsters, Inc. (2001)   
4964     5218         Ice Age (2002)   
6024     6377    Finding Nemo (2003)   

                                                content  
2954   A d v e n t u r e | A n i m a t i o n | C h i...  
2214   A d v e n t u r e | A n i m a t i o n | C h i...  
4650   A d v e n t u r e | A n i m a t i o n | C h i...  
4964   A d v e n t u r e | A n i m a t i o n | C h i...  
6024   A d v e n t u r e | A n i m a t i o n | C h i...  


In [7]:
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Compute sentiment for tags
tags['sentiment'] = tags['tag'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Aggregate sentiment per movie
movie_sentiment = tags.groupby('movieId')['sentiment'].mean().reset_index()

# Function to filter recommendations by sentiment
def filter_by_sentiment(recommendations, min_sentiment=0.1):
    # Merge recommendations with sentiment
    recs_with_sentiment = recommendations.merge(movie_sentiment, on='movieId', how='left')
    # Fill missing sentiment with neutral (0)
    recs_with_sentiment['sentiment'] = recs_with_sentiment['sentiment'].fillna(0)
    # Filter by positive sentiment
    return recs_with_sentiment[recs_with_sentiment['sentiment'] >= min_sentiment]

# Test sentiment filtering on collaborative recommendations
collab_recs_sentiment = filter_by_sentiment(collab_recs)
print("Collaborative Recommendations with Positive Sentiment:")
print(collab_recs_sentiment)

Collaborative Recommendations with Positive Sentiment:
Empty DataFrame
Columns: [movieId, title, content, sentiment]
Index: []


In [8]:
# Function for hybrid recommendations
def get_hybrid_recommendations(user_id, movie_title, n=5, use_sentiment=True):
    # Get collaborative recommendations
    collab_recs = get_collaborative_recommendations(user_id, n=10)
    # Get content-based recommendations
    content_recs = get_content_recommendations(movie_title, n=10)
    # Combine recommendations
    combined = pd.concat([collab_recs, content_recs]).drop_duplicates(subset=['movieId'])
    # Apply sentiment filtering if enabled
    if use_sentiment:
        combined = filter_by_sentiment(combined)
    # Return top N
    return combined.head(n)

# Test hybrid recommendations
user_id = ratings['userId'].iloc[0]
movie_title = movies['title'].iloc[0]
hybrid_recs = get_hybrid_recommendations(user_id, movie_title)
print(f"Hybrid Recommendations for User {user_id} and Movie {movie_title}:")
print(hybrid_recs)

Hybrid Recommendations for User 122270 and Movie Toy Story (1995):
    movieId                           title  \
1       898  Philadelphia Story, The (1940)   
12     4886           Monsters, Inc. (2001)   

                                              content  sentiment  
1    C o m e d y | D r a m a | R o m a n c e  scre...   0.193295  
12   A d v e n t u r e | A n i m a t i o n | C h i...   0.136745  
