In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pickle
import os

ML-based recommendation engine using content-based filtering (TF-IDF + cosine similarity).

In [3]:
# Load the preprocessed movie table.
# The file is opened in a `with` block so the handle is closed as soon as the
# data has been read, instead of being left open until garbage collection.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that were produced by a trusted preprocessing run.
with open("../Data/processed_movies.pkl", "rb") as movies_file:
    movies = pickle.load(movies_file)
movies.head()


Unnamed: 0,id,title,overview,genres,vote_average,vote_count,popularity,release_date,runtime,director,mood_happy_score,mood_sad_score,mood_excited_score,mood_scared_score,mood_romantic_score,mood_thoughtful_score,mood_adventurous_score,mood_relaxed_score,mood_mysterious_score,mood_inspired_score
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,2009-12-10,162.0,James Cameron,1.94,1.94,10.94,1.94,3.44,3.44,12.44,1.94,1.94,1.94
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",6.9,4500,139.082615,2007-05-19,169.0,Gore Verbinski,3.38,1.88,7.88,1.88,3.38,3.38,10.88,1.88,1.88,1.88
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]",6.3,4466,107.376788,2015-10-26,148.0,Sam Mendes,1.76,1.76,7.76,1.76,1.76,1.76,7.76,1.76,6.26,1.76
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]",7.6,9106,112.31295,2012-07-16,165.0,Christopher Nolan,2.02,5.02,5.02,5.02,5.02,5.02,5.02,2.02,9.52,2.02
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]",6.1,2124,43.926995,2012-03-07,132.0,Andrew Stanton,1.72,1.72,10.72,1.72,1.72,1.72,9.22,1.72,1.72,1.72


In [4]:
# Missing values per column of the freshly loaded table.
movies.isnull().sum()

id                        0
title                     0
overview                  0
genres                    0
vote_average              0
vote_count                0
popularity                0
release_date              0
runtime                   0
director                  0
mood_happy_score          0
mood_sad_score            0
mood_excited_score        0
mood_scared_score         0
mood_romantic_score       0
mood_thoughtful_score     0
mood_adventurous_score    0
mood_relaxed_score        0
mood_mysterious_score     0
mood_inspired_score       0
dtype: int64

In [5]:
# This cell rebinds the name `TfidfVectorizer` from the imported class to a
# configured *instance*; later cells rely on that (they call
# `TfidfVectorizer.fit_transform(...)` and store the object in the export
# dict). Resolving the class from whichever of the two the name currently
# holds keeps the cell re-runnable: without it, a second execution would try
# to call the instance and fail with "'TfidfVectorizer' object is not callable".
# NOTE(review): giving the instance its own name (e.g. `tfidf_vectorizer`) in
# every cell that uses it would be cleaner than shadowing the class.
_tfidf_vectorizer_class = (
    TfidfVectorizer if isinstance(TfidfVectorizer, type) else type(TfidfVectorizer)
)

TfidfVectorizer = _tfidf_vectorizer_class(
    stop_words="english",   # drop common English stop words
    max_features=5000,      # cap the vocabulary size
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=2,               # ignore terms seen in fewer than 2 documents
)

# Placeholders; both are assigned by later cells once the text is vectorized.
content_matrix = None
cosine_sim_matrix = None

# Notebook-level scaler instance (get_mood_based_recommendations builds its own).
scaler = MinMaxScaler()


Prepare to convert text into numerical vectors.
Build content-based features from the movie metadata.

Combine the features — genres, cast (actors), director, and plot overview — into one text field per movie, so that similarity reflects all of them.

In [6]:
def _blank_text_series():
    """Return an all-empty string Series aligned with ``movies``."""
    return pd.Series([''] * len(movies), index=movies.index)


def _joined_list_column(column, limit=None):
    """Space-join the list stored in each row of ``movies[column]``.

    Rows whose value is not a list become ''. ``limit`` keeps only the first
    ``limit`` items of each list (``None`` keeps all of them).
    """
    return movies[column].apply(
        lambda items: ' '.join(items[:limit]) if isinstance(items, list) else ''
    ).fillna('').astype(str)


# Plot overview and genres are required columns.
overview = movies['overview'].fillna('').astype(str)
genres = _joined_list_column('genres')

# Keywords, director and cast are optional: a missing column contributes
# empty text instead of raising.
keywords = (
    _joined_list_column('keywords')
    if 'keywords' in movies.columns
    else _blank_text_series()
)

if 'director' in movies.columns:
    director = movies['director'].fillna('').astype(str)
else:
    director = _blank_text_series()

# Only the first three cast names are used.
cast = (
    _joined_list_column('cast_names', limit=3)
    if 'cast_names' in movies.columns
    else _blank_text_series()
)

# One space-separated text blob per movie; this is the input to TF-IDF.
movies['combined_features'] = (
    overview + ' ' + genres + ' ' + keywords + ' ' + director + ' ' + cast
)


In [7]:
# Inspect the combined text of the movie labelled 0.
movies.loc[0, 'combined_features']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction  James Cameron '

In [8]:
# Re-check for missing values now that `combined_features` has been added.
movies.isnull().sum()

id                        0
title                     0
overview                  0
genres                    0
vote_average              0
vote_count                0
popularity                0
release_date              0
runtime                   0
director                  0
mood_happy_score          0
mood_sad_score            0
mood_excited_score        0
mood_scared_score         0
mood_romantic_score       0
mood_thoughtful_score     0
mood_adventurous_score    0
mood_relaxed_score        0
mood_mysterious_score     0
mood_inspired_score       0
combined_features         0
dtype: int64

This converts all movies into a large matrix
Rows = movies
Columns = words
Values = importance score


In [9]:
# Create TF-IDF matrix: fit the vectorizer on the combined text of every movie
# and keep the transformed result.
# Note: `TfidfVectorizer` here is the configured instance created earlier in
# this notebook, not the scikit-learn class of the same name.
content_matrix = TfidfVectorizer.fit_transform(movies['combined_features'])

How similar is Movie A to Movie B?

In [10]:
# Compute the cosine similarity matrix. Only one argument is passed, so the
# rows of `content_matrix` are compared with each other; later cells read the
# result as cosine_sim_matrix[i][j] for a pair of movies.
cosine_sim_matrix = cosine_similarity(content_matrix)

From top candidates, pick movies that are not too similar to each other.

In [11]:
def select_diverse_movies(candidates_df, cosine_sim_matrix, n=10, diversity_factor=0.3):
    """Greedily pick up to ``n`` candidates, trading rank against diversity.

    The first (best-ranked) candidate is always taken. Each following pick is
    the remaining candidate with the highest

        (1 - diversity_factor) * position_score + diversity_factor * diversity_score

    where ``position_score = 1 - position / len(candidates_df)`` and
    ``diversity_score = 1 - mean similarity to the movies already picked``.
    Ties go to the better-ranked candidate.

    Parameters
    ----------
    candidates_df : pandas.DataFrame
        Candidates, already sorted best-first. Its index labels are used as
        row/column positions in ``cosine_sim_matrix``, so they must be the
        integer positions the matrix was built from.
    cosine_sim_matrix : 2-D indexable (``m[i][j]``)
        Pairwise similarity between movies.
    n : int
        Maximum number of movies to select.
    diversity_factor : float
        Weight of the diversity term (0 = pure ranking, 1 = pure diversity).

    Returns
    -------
    list[int]
        Positions (for ``candidates_df.iloc``) in selection order. Empty when
        ``n <= 0``.
    """
    total = len(candidates_df)

    # Nothing requested: the original returned one item (or raised on an
    # empty frame) in this case.
    if n <= 0:
        return []

    if total <= n:
        return list(range(total))

    # Look the index labels up once instead of materialising a row Series via
    # `.iloc[...].name` inside the nested loops.
    labels = list(candidates_df.index)
    matrix_size = len(cosine_sim_matrix)
    # A label outside the matrix contributes no similarity; negative labels are
    # rejected too so they cannot silently wrap around to the wrong movie.
    in_matrix = [0 <= label < matrix_size for label in labels]

    # Always select the top movie first.
    selected_indices = [0]
    remaining_indices = list(range(1, total))

    while len(selected_indices) < n and remaining_indices:
        best_idx = None
        best_score = -np.inf

        for idx in remaining_indices:
            if in_matrix[idx]:
                similarities = [
                    cosine_sim_matrix[labels[idx]][labels[chosen]]
                    for chosen in selected_indices
                    if in_matrix[chosen]
                ]
            else:
                similarities = []

            avg_similarity = np.mean(similarities) if similarities else 0
            diversity_score = 1 - avg_similarity
            position_score = 1 - (idx / total)

            combined_score = (
                (1 - diversity_factor) * position_score +
                diversity_factor * diversity_score
            )

            if combined_score > best_score:
                best_score = combined_score
                best_idx = idx

        # No candidate compared greater (e.g. every score is NaN): fall back to
        # rank order. Without this the loop would never make progress.
        if best_idx is None:
            best_idx = remaining_indices[0]

        selected_indices.append(best_idx)
        remaining_indices.remove(best_idx)

    return selected_indices


In [12]:
def get_surprise_recommendations(n=10, movies_df=None, sim_matrix=None, random_state=None):
    """Pick up to ``n`` well-rated, little-known movies that differ from each other.

    "Hidden gems" are movies with ``vote_average >= 7.0``, between 100 and
    1000 votes, and ``popularity < 20``. If that pool has ``n`` or fewer
    movies, every movie with ``vote_average >= 7.0`` is used instead. The
    first pick is random; each later pick is the candidate with the lowest
    mean similarity to the movies already chosen.

    Parameters
    ----------
    n : int
        Maximum number of movies to return.
    movies_df : pandas.DataFrame, optional
        Movie table; defaults to the notebook-level ``movies``. Its index
        labels are used as positions in ``sim_matrix``.
    sim_matrix : 2-D indexable (``m[i][j]``), optional
        Pairwise similarity; defaults to the notebook-level ``cosine_sim_matrix``.
    random_state : int or None, optional
        Seed for a reproducible first pick. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    list
        Index labels of the selected movies, in selection order. Empty when
        ``n <= 0`` or when no movie qualifies.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if movies_df is None:
        movies_df = movies
    if sim_matrix is None:
        sim_matrix = cosine_sim_matrix

    if n <= 0:
        return []

    hidden_gems = movies_df[
        (movies_df['vote_average'] >= 7.0) &
        (movies_df['vote_count'] >= 100) &
        (movies_df['vote_count'] <= 1000) &
        (movies_df['popularity'] < 20)
    ]

    # Too few hidden gems: widen the pool to every well-rated movie.
    if len(hidden_gems) <= n:
        hidden_gems = movies_df[movies_df['vote_average'] >= 7.0]

    candidates = list(hidden_gems.index)
    if not candidates:
        # np.random.choice raises on an empty sequence.
        return []

    # Start with a random movie.
    if random_state is None:
        first_idx = np.random.choice(candidates)
    else:
        first_idx = np.random.default_rng(random_state).choice(candidates)
    selected = [first_idx]
    candidates.remove(first_idx)

    while len(selected) < n and candidates:
        best_candidate = None
        max_diversity = -np.inf
        for candidate in candidates:
            avg_sim = np.mean([sim_matrix[candidate][sel] for sel in selected])
            diversity = 1 - avg_sim
            if diversity > max_diversity:
                max_diversity = diversity
                best_candidate = candidate

        # Every diversity was NaN: take the next candidate in table order so
        # the loop still makes progress instead of spinning forever.
        if best_candidate is None:
            best_candidate = candidates[0]

        selected.append(best_candidate)
        candidates.remove(best_candidate)

    return selected


In [13]:
def get_mood_based_recommendations(
    movies,
    cosine_sim_matrix,
    mood,
    n=10,
    diversity_factor=0.3
):
    """
    Recommend movies for a mood, blending relevance with variety.

    Pipeline:
    1. Take the ``n * 5`` movies with the highest ``mood_<mood>_score``.
    2. Min-max scale mood score, rating and popularity within that pool and
       combine them as 0.5 / 0.3 / 0.2 into ``composite_score``.
    3. Sort by ``composite_score`` and let ``select_diverse_movies`` choose
       ``n`` rows that are not too similar to each other.

    Returns the chosen rows of the candidate frame (including the added
    ``*_norm`` and ``composite_score`` columns), or an empty list when there
    are no candidates.

    Raises ValueError when the dataset has no column for ``mood``.
    """
    mood_col = f'mood_{mood}_score'
    if mood_col not in movies.columns:
        raise ValueError(f"Mood '{mood}' not found in dataset")

    # Candidate pool: the strongest matches for the requested mood.
    pool = movies.nlargest(n * 5, mood_col).copy()
    if not len(pool):
        return []

    # Bring each signal onto [0, 1] within the pool so the weights are comparable.
    scaler = MinMaxScaler()
    for source_col, scaled_col in (
        (mood_col, 'mood_score_norm'),
        ('vote_average', 'rating_score_norm'),
        ('popularity', 'popularity_score_norm'),
    ):
        pool[scaled_col] = scaler.fit_transform(pool[[source_col]])

    # Weighted blend: mood dominates, then rating, then popularity.
    pool['composite_score'] = (
        0.5 * pool['mood_score_norm'] +
        0.3 * pool['rating_score_norm'] +
        0.2 * pool['popularity_score_norm']
    )

    ranked = pool.sort_values('composite_score', ascending=False)

    # Positions (into `ranked`) of the final, diversified selection.
    chosen_positions = select_diverse_movies(
        ranked,
        cosine_sim_matrix,
        n=n,
        diversity_factor=diversity_factor
    )

    return ranked.iloc[chosen_positions]


In [14]:
# Demo: ten "scared" recommendations with a slightly stronger diversity weight.
recommendations = get_mood_based_recommendations(
    movies, cosine_sim_matrix, mood="scared", n=10, diversity_factor=0.4
)

# Show only the columns needed to judge the result.
recommendations.loc[:, ['title', 'vote_average', 'composite_score']]


Unnamed: 0,title,vote_average,composite_score
2096,The Conjuring,7.4,0.828887
3554,Insidious: Chapter 2,6.5,0.7057
2170,Psycho,8.2,0.634742
3676,The Awakening,6.3,0.630905
346,What Lies Beneath,6.3,0.611145
1766,Case 39,6.1,0.601383
217,Ghost Rider,5.2,0.52937
3455,A Nightmare on Elm Street 5: The Dream Child,5.5,0.497973
2842,Stir of Echoes,6.5,0.487132
2808,The Exorcist,7.5,0.446588


In [15]:
import joblib

# Write the bundle to the same location the loading cell of this notebook
# reads ('../Data/moodflix.pkl'). Previously the file was written to the
# working directory, so the load step picked up a missing or stale file.
MODEL_PATH = '../Data/moodflix.pkl'

# Everything needed to serve recommendations, bundled in a single object.
model_data = {
    'movies_df': movies,                     # movies table incl. combined_features
    'tfidf_vectorizer': TfidfVectorizer,     # the fitted vectorizer instance
    'cosine_sim_matrix': cosine_sim_matrix,  # pairwise movie similarity
}

# Save to one compressed file.
joblib.dump(model_data, MODEL_PATH, compress=3)

print(f"Movie recommender saved in one file using joblib: '{MODEL_PATH}'")


Movie recommender saved in one file using joblib: 'moodflix.pkl'


In [16]:
import joblib

# Read the saved bundle back from the data directory.
loaded_data = joblib.load('../Data/moodflix.pkl')

# Unpack the three stored objects. Note that `movies` is rebound to the
# loaded table here.
movies, loaded_vectorizer, loaded_cosine_sim = (
    loaded_data[key]
    for key in ('movies_df', 'tfidf_vectorizer', 'cosine_sim_matrix')
)

print("Movie recommender loaded successfully!")


Movie recommender loaded successfully!
