In [1]:
# Imports and environment setup
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

Our dataset:

https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies

In [2]:
df = pd.read_csv("titles_cleaned.csv")

In [3]:
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_cols

Index(['release_year', 'runtime', 'seasons', 'imdb_score', 'imdb_votes',
       'tmdb_popularity', 'tmdb_score'],
      dtype='object')

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

df['genres_list'] = df['genres'].str.split(', ')

mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df['genres_list']),
                             columns=mlb.classes_,
                             index=df.index)

df = pd.concat([df, genre_dummies], axis=1)

In [5]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_description = tfidf.fit_transform(df['description'])

from scipy.sparse import hstack
combined_features = hstack([tfidf_description, genre_dummies])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(combined_features)

In [12]:
def top_rated_by_genres_all(genres, df, n=10):
    
    if isinstance(genres, str):
        genres = [g.strip().lower() for g in genres.split(",")]
    else:
        genres = [g.lower() for g in genres]
    
    
    col = df['genre_string'].fillna("").astype(str).str.lower()

    
    mask = col.apply(lambda x: all(g in x for g in genres))
    
    genre_df = df[mask]

    if genre_df.empty:
        return f"No titles found containing ALL genres: {genres}"

    top = genre_df.sort_values(by='imdb_score', ascending=False)

    return top[['title', 'genres', 'imdb_score', 'description']].head(n)


In [18]:
top_rated_by_genres_all("Action", df)

Unnamed: 0,title,genres,imdb_score,description
233,Avatar: The Last Airbender,"['scifi', 'animation', 'action', 'family', 'fa...",9.3,"In a war-torn world of elemental magic, a youn..."
4770,Major,"['action', 'drama']",9.1,Based on the life of real-life Hero Major Sand...
4717,Arcane,"['scifi', 'action', 'drama', 'animation', 'fan...",9.0,Amid the stark discord of twin cities Piltover...
835,Attack on Titan,"['action', 'scifi', 'animation', 'horror', 'dr...",9.0,"Several hundred years ago, humans were nearly ..."
653,Hunter x Hunter,"['action', 'animation', 'comedy', 'fantasy']",9.0,Twelve-year-old Gon Freecss one day discovers ...
103,Cowboy Bebop,"['animation', 'action', 'scifi', 'western', 'c...",8.9,"In 2071, roughly fifty years after an accident..."
3217,The Untamed,"['action', 'drama', 'thriller', 'romance', 'co...",8.8,"In a land dominated by the mighty Wen clan, a ..."
3048,Vinland Saga,"['animation', 'action', 'drama', 'war', 'histo...",8.8,"For a thousand years, the Vikings have made qu..."
100,One Piece,"['animation', 'action', 'comedy', 'drama', 'fa...",8.8,"Years ago, the fearsome Pirate King, Gol D. Ro..."
237,Inception,"['action', 'scifi', 'music', 'thriller']",8.8,"Cobb, a skilled thief who commits corporate es..."


In [7]:
def recommend(title, df, similarity_matrix, n=10):
   
    title = title.lower()
    
   
    matches = df.index[df['title_lower'] == title]

    if len(matches) == 0:
        return f"Title '{title}' not found."

    idx = matches[0]

    
    scores = list(enumerate(similarity_matrix[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    
    top_indices = [i for i, score in scores[1:n+1]]

    return df.iloc[top_indices][['title', 'genres', 'description']]

df['title_lower'] = df['title'].str.lower()

In [50]:
recommend("Stranger Things", df, similarity_matrix)

Unnamed: 0,title,genres,description
3208,The Order,"['scifi', 'drama', 'thriller', 'fantasy', 'hor...","Out to avenge his mother's death, a college st..."
2191,GHOUL,"['scifi', 'drama', 'thriller', 'fantasy', 'hor...",A newly minted military interrogator arrives a...
5132,The Privilege,"['scifi', 'thriller', 'drama', 'horror']",A wealthy teen and his friends attending an el...
1188,Glitch,"['scifi', 'thriller', 'drama', 'horror']",A police officer and a doctor face an emotiona...
4846,Choose or Die,"['scifi', 'thriller', 'drama', 'horror']","In pursuit of an unclaimed $125,000 prize, a b..."
836,The Originals,"['scifi', 'drama', 'fantasy', 'horror']",A spin-off from The Vampire Diaries and set in...
1855,Nightflyers,"['scifi', 'thriller', 'drama', 'horror']","In the year 2093, a team of scientists aboard ..."
1939,The Titan,"['scifi', 'thriller', 'fantasy', 'drama', 'rom...","On a bleak future Earth, a soldier endures a r..."
3092,Black Summer,"['scifi', 'thriller', 'action', 'drama', 'horr...","In the dark, early days of a zombie apocalypse..."
3196,The Call,"['scifi', 'thriller', 'drama', 'crime', 'horror']",Connected by phone in the same home but 20 yea...


In [15]:
def hybrid_weighted_recommend(title, df, similarity_matrix, n=10):
    if 'title_lower' not in df.columns:
        df['title_lower'] = df['title'].str.lower()
    
    title = title.lower()
    matches = df.index[df['title_lower'] == title]
    if len(matches) == 0:
        return f"Title '{title}' not found."
    
    idx = matches[0]
    sim_scores = similarity_matrix[idx]
    
    R = df['imdb_score'].fillna(0)
    v = df['imdb_votes'].fillna(0)
    C = R.mean()
    m = np.percentile(v, 80)
    
    weighted_imdb = (v/(v + m)) * R + (m/(v + m)) * C
    weighted_norm = (weighted_imdb - weighted_imdb.min()) / (weighted_imdb.max() - weighted_imdb.min())
    
    hybrid_scores = 0.7 * sim_scores + 0.3 * weighted_norm
    
    scores = list(enumerate(hybrid_scores))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, s in scores[1:n+1]]

    return df.iloc[top_indices][['title', 'genres', 'imdb_score', 'imdb_votes', 'description']]
    

In [49]:
hybrid_weighted_recommend("Stranger Things", df, similarity_matrix, n=10)

Unnamed: 0,title,genres,imdb_score,imdb_votes,description
836,The Originals,"['scifi', 'drama', 'fantasy', 'horror']",8.3,133406.0,A spin-off from The Vampire Diaries and set in...
3208,The Order,"['scifi', 'drama', 'thriller', 'fantasy', 'hor...",6.7,27943.0,"Out to avenge his mother's death, a college st..."
2191,GHOUL,"['scifi', 'drama', 'thriller', 'fantasy', 'hor...",8.0,6.0,A newly minted military interrogator arrives a...
1188,Glitch,"['scifi', 'thriller', 'drama', 'horror']",7.3,17240.0,A police officer and a doctor face an emotiona...
5132,The Privilege,"['scifi', 'thriller', 'drama', 'horror']",4.6,4293.0,A wealthy teen and his friends attending an el...
3196,The Call,"['scifi', 'thriller', 'drama', 'crime', 'horror']",7.1,30772.0,Connected by phone in the same home but 20 yea...
1855,Nightflyers,"['scifi', 'thriller', 'drama', 'horror']",5.8,13222.0,"In the year 2093, a team of scientists aboard ..."
3046,"Love, Death & Robots","['scifi', 'animation', 'thriller', 'horror', '...",8.4,131937.0,"Terrifying creatures, wicked surprises and dar..."
3092,Black Summer,"['scifi', 'thriller', 'action', 'drama', 'horr...",6.5,29318.0,"In the dark, early days of a zombie apocalypse..."
219,Supernatural,"['scifi', 'horror', 'thriller', 'drama', 'fant...",8.4,434081.0,"When they were boys, Sam and Dean Winchester l..."


In [19]:
def top_weighted_by_year(year, df, n=10):
    year_df = df[df['release_year'] == year]
    if year_df.empty:
        return f"No titles found for year {year}."
    
    R = year_df['imdb_score'].fillna(0)
    v = year_df['imdb_votes'].fillna(0)
    C = R.mean()
    m = np.percentile(year_df['imdb_votes'].fillna(0), 80)
    
    weighted_imdb = (v/(v + m)) * R + (m/(v + m)) * C
    year_df = year_df.copy()
    year_df['weighted_score'] = weighted_imdb
    
    top = year_df.sort_values(by='weighted_score', ascending=False)
    
    return top[['title', 'genres', 'release_year', 'imdb_score', 'imdb_votes', 'weighted_score', 'description']].head(n)

In [45]:
top_weighted_by_year(2008, df, n=10)

Unnamed: 0,title,genres,release_year,imdb_score,imdb_votes,weighted_score,description
216,Breaking Bad,"['drama', 'crime', 'thriller']",2008,9.5,1775990.0,9.444545,"When Walter White, a New Mexico chemistry teac..."
298,Ip Man,"['drama', 'fantasy', 'comedy', 'history', 'act...",2008,8.0,222372.0,7.78293,"A semi-biographical account of Yip Man, the fi..."
284,Merlin,"['action', 'drama', 'scifi', 'fantasy', 'europ...",2008,7.9,80927.0,7.433881,"The unlikely friendship between Merlin, a youn..."
256,The Hurt Locker,"['thriller', 'drama', 'war']",2008,7.5,446904.0,7.416752,"During the Iraq War, a Sergeant recently assig..."
280,Rambo,"['action', 'thriller', 'war', 'european']",2008,7.0,230333.0,6.905551,When governments fail to act on behalf of capt...
238,The Spectacular Spider-Man,"['action', 'animation', 'family', 'scifi', 'co...",2008,8.3,14996.0,6.886898,Having spent the summer engaging common crimin...
412,Jodhaa Akbar,"['romance', 'history', 'war', 'action', 'drama']",2008,7.5,32440.0,6.866122,Jodhaa Akbar is a sixteenth century love story...
495,Rock On!!,"['drama', 'music']",2008,7.7,22092.0,6.825196,"Aditya, Joe, Kedar and Rob form a rock band, b..."
334,Toradora!,"['animation', 'comedy', 'drama', 'romance']",2008,8.0,14828.0,6.782883,Ryūji Takasu is a gentle high school student w...
472,Jaane Tu... Ya Jaane Na,"['drama', 'comedy', 'romance']",2008,7.4,26894.0,6.757485,Two best friends being convinced that they are...
