In [1]:
# Imports and environment setup
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns

# Fix the seeds of NumPy's global RNG and Python's `random` module so that
# any stochastic step that draws from them gives the same result on re-run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

Our dataset:

https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies

In [2]:
# Load the titles table from a CSV in the working directory.
# NOTE(review): presumably a preprocessed copy of the Kaggle dataset linked
# above; the cleaning step is not shown in this notebook — confirm provenance.
df = pd.read_csv("titles_cleaned.csv")

In [3]:
# Collect the names of all numeric-dtype columns and display them.
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_cols

Index(['release_year', 'runtime', 'seasons', 'imdb_score', 'imdb_votes',
       'tmdb_popularity', 'tmdb_score'],
      dtype='object')

In [4]:
import ast

from sklearn.preprocessing import MultiLabelBinarizer


def _parse_genre_cell(raw):
    """Turn one `genres` cell into a clean list of genre names.

    The CSV stores genres as a stringified Python list such as
    "['action', 'drama']". Splitting that text on ', ' leaves brackets and
    quotes attached to the tokens ("['action'" vs "'action'"), so one genre
    ends up in several different indicator columns depending on its position.
    Parsing the literal avoids that.

    Parameters
    ----------
    raw : str, list, tuple, set, or missing value
        Content of a single `genres` cell.

    Returns
    -------
    list of str
        Genre names with surrounding whitespace removed; empty for missing
        or unparseable cells.
    """
    if isinstance(raw, (list, tuple, set)):
        return [str(g).strip() for g in raw if str(g).strip()]
    if not isinstance(raw, str):
        # NaN / None: no genre information for this title.
        return []

    text = raw.strip()
    if text.startswith('['):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return [str(g).strip() for g in parsed if str(g).strip()]

    # Fallback for plain comma-separated text or a malformed list literal:
    # split on commas and peel off any leftover brackets/quotes.
    tokens = (tok.strip(" '\"[]") for tok in text.split(','))
    return [tok for tok in tokens if tok]


# One list of genre names per title.
df['genres_list'] = df['genres'].apply(_parse_genre_cell)

# One 0/1 indicator column per distinct genre, aligned to df's index.
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df['genres_list']),
                             columns=mlb.classes_,
                             index=df.index)

# NOTE: this rebinding is not idempotent — re-running the cell appends the
# indicator columns again. Re-run from the data-loading cell instead.
df = pd.concat([df, genre_dummies], axis=1)

In [5]:
from scipy.sparse import csr_matrix, hstack

# TF-IDF representation of each title's description: English stop words
# removed, vocabulary capped at the 5000 highest-frequency terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer rejects NaN documents, so treat a missing description as
# empty text (an all-zero TF-IDF row) instead of failing the whole fit.
descriptions = df['description'].fillna('').astype(str)
tfidf_description = tfidf.fit_transform(descriptions)

# Append the genre indicator columns to the text features. The dense frame is
# converted explicitly and the result is requested as CSR so that downstream
# code can slice rows efficiently.
combined_features = hstack(
    [tfidf_description, csr_matrix(genre_dummies.to_numpy())],
    format='csr',
)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of combined_features;
# row i is later read as "similarity of title i to every title".
# NOTE(review): with default arguments this is expected to be a dense
# n_titles x n_titles array, so memory grows quadratically — confirm this is
# acceptable for the dataset size.
similarity_matrix = cosine_similarity(combined_features)

In [7]:
def top_rated_by_genres_all(genres, df, n=10):
    """Return the `n` best-rated titles that carry every requested genre.

    Parameters
    ----------
    genres : str or iterable of str
        Either a comma-separated string ("Action, Drama") or a collection of
        genre names. Matching is case-insensitive.
    df : pandas.DataFrame
        Must provide the columns `genre_string`, `title`, `genres`,
        `imdb_score` and `description`.
    n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        The matching rows ordered by `imdb_score` (highest first), limited to
        `n` rows, or a message string when nothing matches.

    Notes
    -----
    A genre counts as present when its name occurs as a substring of the
    lower-cased `genre_string` value; missing values are treated as "".
    """
    # Normalise the request to lowercase names (string input is also split
    # on commas and trimmed).
    if not isinstance(genres, str):
        genres = [name.lower() for name in genres]
    else:
        genres = [piece.strip().lower() for piece in genres.split(",")]

    haystack = df['genre_string'].fillna("").astype(str).str.lower()

    def _mentions_every_genre(text):
        # True unless at least one requested genre is absent from the text.
        return not any(name not in text for name in genres)

    keep = haystack.map(_mentions_every_genre)
    matched = df[keep]

    if matched.empty:
        return f"No titles found containing ALL genres: {genres}"

    ranked = matched.sort_values(by='imdb_score', ascending=False)
    wanted_columns = ['title', 'genres', 'imdb_score', 'description']
    return ranked[wanted_columns].head(n)


In [8]:
# Example: the 10 highest-rated titles tagged with both action and drama.
top_rated_by_genres_all("Action, Drama", df)

Unnamed: 0,title,genres,imdb_score,description
4770,Major,"['action', 'drama']",9.1,Based on the life of real-life Hero Major Sand...
4717,Arcane,"['scifi', 'action', 'drama', 'animation', 'fan...",9.0,Amid the stark discord of twin cities Piltover...
835,Attack on Titan,"['action', 'scifi', 'animation', 'horror', 'dr...",9.0,"Several hundred years ago, humans were nearly ..."
103,Cowboy Bebop,"['animation', 'action', 'scifi', 'western', 'c...",8.9,"In 2071, roughly fifty years after an accident..."
100,One Piece,"['animation', 'action', 'comedy', 'drama', 'fa...",8.8,"Years ago, the fearsome Pirate King, Gol D. Ro..."
3048,Vinland Saga,"['animation', 'action', 'drama', 'war', 'histo...",8.8,"For a thousand years, the Vikings have made qu..."
3217,The Untamed,"['action', 'drama', 'thriller', 'romance', 'co...",8.8,"In a land dominated by the mighty Wen clan, a ..."
290,Code Geass: Lelouch of the Rebellion,"['scifi', 'action', 'animation', 'drama', 'war...",8.7,Japan has been invaded and conquered by the Br...
1904,Mr. Sunshine,"['drama', 'history', 'romance', 'war', 'action']",8.7,"Set in the early 1900s, this drama tells the s..."
1126,One-Punch Man,"['action', 'animation', 'comedy', 'fantasy', '...",8.7,Saitama is a hero who only became a hero for f...


In [9]:
def recommend(title, df, similarity_matrix, n=10):
    """Recommend the `n` titles most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up (case-insensitive, exact match). If several rows
        share the title, the first one is used.
    df : pandas.DataFrame
        Must provide `title`, `genres` and `description`. A precomputed
        `title_lower` column is used when present; otherwise the lowercase
        titles are derived from `title` on the fly.
    similarity_matrix : array-like
        Indexed by row *position* in `df`: row i is read as the similarity
        of the i-th title to every title.
    n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to `n` rows (`title`, `genres`, `description`) ordered from most
        to least similar, or a message string when the title is unknown.
    """
    title = title.lower()

    if 'title_lower' in df.columns:
        lowered = df['title_lower']
    else:
        lowered = df['title'].str.lower()

    # Work with row positions, not index labels: the similarity matrix and
    # the final .iloc lookup are both positional, so labels would only be
    # correct for a default RangeIndex.
    positions = np.flatnonzero((lowered == title).to_numpy())
    if positions.size == 0:
        return f"Title '{title}' not found."
    query_pos = int(positions[0])

    sims = np.asarray(similarity_matrix[query_pos], dtype=float).ravel()

    # Stable descending order keeps ties in their original row order.
    order = np.argsort(-sims, kind='stable')

    # Remove the query explicitly. Skipping "the first hit" is wrong whenever
    # another title ties with (or beats) the query's self-similarity.
    order = order[order != query_pos][:max(n, 0)]

    return df.iloc[order][['title', 'genres', 'description']]

# Lowercase copy of every title, used for the case-insensitive title lookup
# in the recommendation functions.
df['title_lower'] = df['title'].str.lower()

In [10]:
# Example: the 10 titles most similar to "Breaking Bad" by content features.
recommend("Breaking Bad", df, similarity_matrix)

Unnamed: 0,title,genres,description
1708,Ozark,"['drama', 'crime', 'thriller']",A financial adviser drags his family from Chic...
2300,The Inmate,"['drama', 'crime', 'thriller']","A former U.S. Marine, Lázaro Mendoza, enters a..."
4954,The Billion Dollar Code,"['drama', 'crime', 'thriller']","In 1990s Berlin, an artist and a hacker invent..."
5275,Elite Short Stories: Guzmán Caye Rebe,"['drama', 'crime', 'thriller']",Rebe hosts an intimate house warming party for...
4876,Who Killed Sara?,"['drama', 'crime', 'thriller']",Hell-bent on exacting revenge and proving he w...
5418,Blood Sisters,"['drama', 'crime', 'thriller']","Connected by a dangerous secret, best friends ..."
5058,Collision,"['drama', 'crime', 'thriller']","Over the course of one fateful day, a corrupt ..."
1721,Mindhunter,"['drama', 'crime', 'thriller']",An agent in the FBI's Elite Serial Crime Unit ...
141,Menace II Society,"['drama', 'crime', 'thriller']",A young street hustler attempts to escape the ...
1337,Case,"['drama', 'crime', 'thriller']",A dramatic thriller about a broken lawyer who ...


In [11]:
def hybrid_weighted_recommend(title, df, similarity_matrix, n=10,
                              similarity_weight=0.7, rating_weight=0.3,
                              vote_percentile=80):
    """Recommend titles by blending content similarity with a vote-weighted rating.

    Each candidate is scored as::

        similarity_weight * similarity + rating_weight * normalised_rating

    where the rating is the vote-weighted mean
    ``v/(v+m) * R + m/(v+m) * C`` (R = title rating, v = vote count,
    C = mean rating over all rated titles, m = the `vote_percentile`-th
    percentile of vote counts), min-max scaled to [0, 1].

    Parameters
    ----------
    title : str
        Title to look up (case-insensitive, exact match; first hit wins).
    df : pandas.DataFrame
        Must provide `title`, `genres`, `imdb_score`, `imdb_votes` and
        `description`. `title_lower` is used when present. The frame is not
        modified.
    similarity_matrix : array-like
        Indexed by row *position* in `df`.
    n : int, default 10
        Number of recommendations to return.
    similarity_weight, rating_weight : float, default 0.7 / 0.3
        Blend weights for the two score components.
    vote_percentile : float, default 80
        Percentile of vote counts used as the confidence threshold `m`.

    Returns
    -------
    pandas.DataFrame or str
        Up to `n` rows ordered by hybrid score (best first), or a message
        string when the title is unknown.
    """
    title = title.lower()

    # Derive lowercase titles locally rather than adding a column to the
    # caller's frame.
    if 'title_lower' in df.columns:
        lowered = df['title_lower']
    else:
        lowered = df['title'].str.lower()

    # Row positions (not index labels) are what the matrix and .iloc expect.
    positions = np.flatnonzero((lowered == title).to_numpy())
    if positions.size == 0:
        return f"Title '{title}' not found."
    query_pos = int(positions[0])

    sims = np.asarray(similarity_matrix[query_pos], dtype=float).ravel()

    votes = df['imdb_votes'].fillna(0)

    # Prior mean over titles that actually have a rating. Counting missing
    # ratings as 0 would drag the prior (and every low-vote title) down.
    prior_mean = df['imdb_score'].mean()
    if np.isnan(prior_mean):
        prior_mean = 0.0
    ratings = df['imdb_score'].fillna(prior_mean)

    min_votes = np.percentile(votes, vote_percentile)

    # Confidence in a title's own rating. When v + m == 0 there is no
    # evidence at all, so fall back entirely to the prior instead of 0/0.
    denom = votes + min_votes
    vote_weight = (votes / denom.where(denom > 0)).fillna(0.0)
    weighted = vote_weight * ratings + (1.0 - vote_weight) * prior_mean

    # Min-max scale so the rating term is comparable to cosine similarity;
    # a constant column contributes nothing rather than NaN.
    spread = weighted.max() - weighted.min()
    if spread > 0:
        weighted_norm = ((weighted - weighted.min()) / spread).to_numpy()
    else:
        weighted_norm = np.zeros(len(weighted))

    hybrid = similarity_weight * sims + rating_weight * weighted_norm

    # Stable descending order, then drop the query explicitly. The query is
    # not guaranteed to rank first here: a similar, better-rated title can
    # outscore it, so skipping the top hit would discard the best result and
    # recommend the query back to the user.
    order = np.argsort(-hybrid, kind='stable')
    order = order[order != query_pos][:max(n, 0)]

    return df.iloc[order][['title', 'genres', 'imdb_score', 'imdb_votes', 'description']]
    

In [13]:
# Example: hybrid (similarity + weighted rating) recommendations for "Breaking Bad".
hybrid_weighted_recommend("Breaking Bad", df, similarity_matrix, n=10)

Unnamed: 0,title,genres,imdb_score,imdb_votes,description
1708,Ozark,"['drama', 'crime', 'thriller']",8.5,300454.0,A financial adviser drags his family from Chic...
1721,Mindhunter,"['drama', 'crime', 'thriller']",8.6,261429.0,An agent in the FBI's Elite Serial Crime Unit ...
2133,The Mechanism,"['drama', 'crime', 'thriller']",8.0,36136.0,A scandal erupts in Brazil during an investiga...
141,Menace II Society,"['drama', 'crime', 'thriller']",7.5,58179.0,A young street hustler attempts to escape the ...
4954,The Billion Dollar Code,"['drama', 'crime', 'thriller']",8.0,8164.0,"In 1990s Berlin, an artist and a hacker invent..."
3229,Undercover,"['drama', 'crime', 'thriller']",7.8,17005.0,Undercover agents infiltrate a drug kingpin's ...
3292,Quicksand,"['drama', 'crime', 'thriller']",7.5,21276.0,After a tragedy at a school sends shock waves ...
2300,The Inmate,"['drama', 'crime', 'thriller']",6.9,2714.0,"A former U.S. Marine, Lázaro Mendoza, enters a..."
5275,Elite Short Stories: Guzmán Caye Rebe,"['drama', 'crime', 'thriller']",7.4,2319.0,Rebe hosts an intimate house warming party for...
2080,Black Earth Rising,"['drama', 'crime', 'thriller']",7.4,4917.0,"As a child, Kate Ashby was rescued from the ho..."


In [14]:
def top_weighted_by_year(year, df, n=10):
    """Return the `n` best titles released in `year`, ranked by vote-weighted rating.

    The score is ``v/(v+m) * R + m/(v+m) * C`` computed within the year:
    R = title rating, v = vote count (missing counts as 0), C = mean rating
    of that year's rated titles, m = 80th percentile of that year's vote
    counts.

    Parameters
    ----------
    year : int
        Value matched against `release_year`.
    df : pandas.DataFrame
        Must provide `release_year`, `title`, `genres`, `imdb_score`,
        `imdb_votes` and `description`. The frame is not modified.
    n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        The top rows with an added `weighted_score` column, or a message
        string when the year has no titles.
    """
    # Copy up front so adding the score column never touches the caller's frame.
    year_df = df[df['release_year'] == year].copy()
    if year_df.empty:
        return f"No titles found for year {year}."

    votes = year_df['imdb_votes'].fillna(0)

    # Prior mean over titles that actually have a rating. Counting missing
    # ratings as 0 would pull the prior, and every low-vote title, downward.
    prior_mean = year_df['imdb_score'].mean()
    if np.isnan(prior_mean):
        prior_mean = 0.0
    ratings = year_df['imdb_score'].fillna(prior_mean)

    min_votes = np.percentile(votes, 80)

    # Confidence in a title's own rating. In sparse years the threshold can
    # be 0; a title with no votes then has v + m == 0, so fall back to the
    # prior instead of producing 0/0 = NaN.
    denom = votes + min_votes
    vote_weight = (votes / denom.where(denom > 0)).fillna(0.0)
    year_df['weighted_score'] = vote_weight * ratings + (1.0 - vote_weight) * prior_mean

    top = year_df.sort_values(by='weighted_score', ascending=False)

    return top[['title', 'genres', 'release_year', 'imdb_score', 'imdb_votes', 'weighted_score', 'description']].head(n)

In [15]:
# Example: the 10 best titles released in 2008 by vote-weighted IMDb rating.
top_weighted_by_year(2008, df, n=10)

Unnamed: 0,title,genres,release_year,imdb_score,imdb_votes,weighted_score,description
216,Breaking Bad,"['drama', 'crime', 'thriller']",2008,9.5,1775990.0,9.444545,"When Walter White, a New Mexico chemistry teac..."
298,Ip Man,"['drama', 'fantasy', 'comedy', 'history', 'act...",2008,8.0,222372.0,7.78293,"A semi-biographical account of Yip Man, the fi..."
284,Merlin,"['action', 'drama', 'scifi', 'fantasy', 'europ...",2008,7.9,80927.0,7.433881,"The unlikely friendship between Merlin, a youn..."
256,The Hurt Locker,"['thriller', 'drama', 'war']",2008,7.5,446904.0,7.416752,"During the Iraq War, a Sergeant recently assig..."
280,Rambo,"['action', 'thriller', 'war', 'european']",2008,7.0,230333.0,6.905551,When governments fail to act on behalf of capt...
238,The Spectacular Spider-Man,"['action', 'animation', 'family', 'scifi', 'co...",2008,8.3,14996.0,6.886898,Having spent the summer engaging common crimin...
412,Jodhaa Akbar,"['romance', 'history', 'war', 'action', 'drama']",2008,7.5,32440.0,6.866122,Jodhaa Akbar is a sixteenth century love story...
495,Rock On!!,"['drama', 'music']",2008,7.7,22092.0,6.825196,"Aditya, Joe, Kedar and Rob form a rock band, b..."
334,Toradora!,"['animation', 'comedy', 'drama', 'romance']",2008,8.0,14828.0,6.782883,Ryūji Takasu is a gentle high school student w...
472,Jaane Tu... Ya Jaane Na,"['drama', 'comedy', 'romance']",2008,7.4,26894.0,6.757485,Two best friends being convinced that they are...


### Impact 

Recommender systems are widely used across online platforms and have the potential for substantial impact on users. When designed responsibly, they can help people make informed purchases, discover educational resources, and explore new media. However, these same systems can also trap people in algorithmic echo chambers and reinforce addictive engagement patterns. Because of these considerations, creating transparent and beneficial recommender systems is essential.

With our recommender model, we aim to achieve three main goals. First, we have made an effort to create a fully transparent model, with each aspect of the model explained and adjustable by the user. This contrasts with the majority of commercial recommender systems, which function as black boxes. Second, we hope that users will be able to use the model to find movies and shows that are similar to those they already enjoy. Third, we hope that viewing this model will provide educational value, giving the viewer insight into how real-world recommender systems function.

Several negative impacts must also be considered. Like many media recommender systems, the model can easily fall into an 'echo chamber' in which users are consistently recommended highly rated, popular movies, which in turn helps those movies remain popular. A user may be less likely to watch an indie film or a new show because of the model's bias. Scaled to large platforms like Netflix, this can harm the success of smaller producers. Our model suffers from the same bias. If we were to publish a model for production, changes to incorporate less popular movies and shows would likely need to be implemented. Another consideration is that our model considers genre and rating but does not capture aspects such as pacing and cinematography. This may lead to users having an unfulfilling experience with our model and not finding their desired media.

Overall, while our recommender system provides transparency and educational value, it also reveals the challenges of creating an unbiased model. Acknowledging these aspects is essential to creating responsible, user-centered recommender systems. 