In [1]:
# Imports and environment setup
import ast
import random

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns

# Seed both NumPy's global generator and Python's `random` module with the
# same constant so any randomised step gives the same result on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

Our dataset: Netflix TV Shows and Movies (Kaggle). The cells below load it from `titles_cleaned.csv`.

https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies

In [8]:
# Load the titles table. The path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("titles_cleaned.csv")

In [9]:
# Collect the names of every numeric-dtype column; the bare name on the last
# line displays them as the cell output.
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_cols

Index(['release_year', 'runtime', 'seasons', 'imdb_score', 'imdb_votes',
       'tmdb_popularity', 'tmdb_score'],
      dtype='object')

In [10]:
from sklearn.preprocessing import MultiLabelBinarizer


def _parse_genres(raw):
    """Turn one raw `genres` cell into a list of clean genre names.

    The outputs further down show `genres` cells as list-literal text such
    as "['drama', 'crime']". Splitting that text on ', ' leaves brackets and
    quotes glued to the names ("['drama'", "'crime']"), so one genre ends up
    under several labels. Bracketed cells are therefore parsed as Python
    list literals; plain "a, b" text is still split on commas; anything that
    is not a string (e.g. a missing value) yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    text = raw.strip()
    if text.startswith('['):
        try:
            return [str(g).strip() for g in ast.literal_eval(text)]
        except (ValueError, SyntaxError):
            # Malformed literal: fall back to comma splitting below.
            text = text.strip('[]')
    tokens = (g.strip(" '\"") for g in text.split(','))
    return [g for g in tokens if g]


df['genres_list'] = df['genres'].apply(_parse_genres)

# One 0/1 indicator column per distinct genre, aligned to df's index.
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df['genres_list']),
                             columns=mlb.classes_,
                             index=df.index)

# Drop indicator columns left behind by an earlier run of this cell so that
# re-running it does not create duplicate column names.
# NOTE(review): assumes no genre shares its name with an original column.
df = pd.concat([df.drop(columns=genre_dummies.columns, errors='ignore'),
                genre_dummies], axis=1)

In [11]:
# TF-IDF over the description text: English stop words removed, vocabulary
# capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# The vectorizer raises on NaN documents, so a missing description is
# treated as an empty document (an all-zero TF-IDF row).
tfidf_description = tfidf.fit_transform(df['description'].fillna(''))

from scipy.sparse import hstack
# Put the TF-IDF columns and the genre indicator columns side by side, one
# row per title.
combined_features = hstack([tfidf_description, genre_dummies])

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of `combined_features`:
# entry [i, j] scores row i against row j. The result is a dense square
# array, so its memory use grows quadratically with the number of rows.
similarity_matrix = cosine_similarity(combined_features)

In [34]:
def recommend(title, df, similarity_matrix, n=10):
    """Return up to `n` titles most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up. Matching ignores case and leading/trailing
        whitespace in the query.
    df : pandas.DataFrame
        Table with `title`, `genres` and `description` columns. Row i of
        `df` must correspond to row/column i of `similarity_matrix`. A
        pre-computed `title_lower` column is used when present; otherwise
        it is derived from `title` on the fly.
    similarity_matrix : array-like, shape (len(df), len(df))
        Pairwise similarity scores; larger means more similar.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The `title`, `genres` and `description` of the most similar rows,
        best first, or a "not found" message when no row matches `title`.
        If several rows share the title, the first one is used.
    """
    query = title.strip().lower()

    if 'title_lower' in df.columns:
        lowered = df['title_lower']
    else:
        lowered = df['title'].str.lower()

    # Work with row *positions*, not index labels: both the similarity
    # matrix and `df.iloc` are positional, and labels only coincide with
    # positions for a default RangeIndex.
    hits = np.flatnonzero((lowered == query).to_numpy())
    if len(hits) == 0:
        return f"Title '{query}' not found."
    pos = hits[0]

    scores = np.asarray(similarity_matrix[pos]).ravel()
    # Stable sort keeps the original row order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried row explicitly; with ties or duplicate entries it
    # is not guaranteed to be the first element of the ranking.
    ranked = ranked[ranked != pos][:max(n, 0)]

    return df.iloc[ranked][['title', 'genres', 'description']]

# Lower-cased copy of every title; `recommend` compares the lower-cased
# query against this column.
df['title_lower'] = df['title'].str.lower()

In [47]:
recommend("Baby Driver", df, similarity_matrix)

"Title 'baby driver' not found."

In [49]:
def top_rated_by_genre(genre, df, n=15):
    """Return the `n` highest-rated titles whose genre text contains `genre`.

    Parameters
    ----------
    genre : str
        Genre to search for. Matched as a literal, case-insensitive
        substring; surrounding whitespace is ignored.
    df : pandas.DataFrame
        Table with `title`, `genres`, `imdb_score` and `description`
        columns. The genre text is read from `genre_string` when that
        column exists, otherwise from `genres`.
    n : int, default 15
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        Matching rows sorted by `imdb_score`, highest first, or a message
        when nothing matches.
    """
    genre = genre.strip().lower()

    # No visible cell of this notebook creates `genre_string`, so fall back
    # to the raw `genres` text rather than fail with a KeyError.
    genre_col = 'genre_string' if 'genre_string' in df.columns else 'genres'

    # regex=False: the query is user text, not a pattern. Otherwise "dram."
    # would match "drama" and input such as "c++" would raise.
    mask = df[genre_col].str.lower().str.contains(genre, na=False, regex=False)
    genre_df = df[mask]

    if genre_df.empty:
        return f"No titles found for genre '{genre}'."

    top = genre_df.sort_values(by='imdb_score', ascending=False)

    return top[['title', 'genres', 'imdb_score', 'description']].head(n)

In [53]:
top_rated_by_genre("drama", df)

Unnamed: 0,title,genres,imdb_score,description
216,Breaking Bad,"['drama', 'crime', 'thriller']",9.5,"When Walter White, a New Mexico chemistry teac..."
1258,Reply 1988,"['comedy', 'drama', 'family', 'romance']",9.2,A nostalgic trip back to the late 1980s throug...
3499,Kota Factory,"['drama', 'comedy']",9.1,"Dedicated to Shrimati SL Loney ji, Shri Irodov..."
4770,Major,"['action', 'drama']",9.1,Based on the life of real-life Hero Major Sand...
1885,My Mister,"['drama', 'family']",9.1,"In a world that is less than kind, a young wom..."
165,Okupas,"['drama', 'crime']",9.0,"During the year 2000, Ricardo, Pollo, Walter a..."
835,Attack on Titan,"['action', 'scifi', 'animation', 'horror', 'dr...",9.0,"Several hundred years ago, humans were nearly ..."
4717,Arcane,"['scifi', 'action', 'drama', 'animation', 'fan...",9.0,Amid the stark discord of twin cities Piltover...
243,DEATH NOTE,"['animation', 'thriller', 'scifi', 'drama', 'f...",9.0,Light Yagami is an ace student with great pros...
103,Cowboy Bebop,"['animation', 'action', 'scifi', 'western', 'c...",8.9,"In 2071, roughly fifty years after an accident..."


In [58]:
def top_rated_by_genres_all(genres, df, n=10):
    """Return the `n` highest-rated titles that contain ALL requested genres.

    Parameters
    ----------
    genres : str or iterable of str
        Either a comma-separated string ("Action, Comedy") or a collection
        of genre names. Each name is stripped and lower-cased, then matched
        as a literal substring of the title's genre text.
    df : pandas.DataFrame
        Table with `title`, `genres`, `imdb_score` and `description`
        columns. The genre text is read from `genre_string` when that
        column exists, otherwise from `genres`.
    n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        Matching rows sorted by `imdb_score`, highest first, or a message
        when no title contains every requested genre.
    """
    if isinstance(genres, str):
        genres = genres.split(",")
    # Normalise both input forms the same way (the list form used to skip
    # the whitespace strip, so [" Action"] never matched).
    genres = [g.strip().lower() for g in genres]

    # No visible cell of this notebook creates `genre_string`, so fall back
    # to the raw `genres` text rather than fail with a KeyError.
    genre_col = 'genre_string' if 'genre_string' in df.columns else 'genres'
    col = df[genre_col].fillna("").astype(str).str.lower()

    # AND together one literal substring test per requested genre.
    mask = pd.Series(True, index=df.index)
    for wanted in genres:
        mask &= col.str.contains(wanted, regex=False)

    genre_df = df[mask]

    if genre_df.empty:
        return f"No titles found containing ALL genres: {genres}"

    top = genre_df.sort_values(by='imdb_score', ascending=False)

    return top[['title', 'genres', 'imdb_score', 'description']].head(n)


In [60]:
top_rated_by_genres_all("Action, Comedy", df)

Unnamed: 0,title,genres,imdb_score,description
653,Hunter x Hunter,"['action', 'animation', 'comedy', 'fantasy']",9.0,Twelve-year-old Gon Freecss one day discovers ...
103,Cowboy Bebop,"['animation', 'action', 'scifi', 'western', 'c...",8.9,"In 2071, roughly fifty years after an accident..."
100,One Piece,"['animation', 'action', 'comedy', 'drama', 'fa...",8.8,"Years ago, the fearsome Pirate King, Gol D. Ro..."
3217,The Untamed,"['action', 'drama', 'thriller', 'romance', 'co...",8.8,"In a land dominated by the mighty Wen clan, a ..."
1126,One-Punch Man,"['action', 'animation', 'comedy', 'fantasy', '...",8.7,Saitama is a hero who only became a hero for f...
4738,Alchemy of Souls,"['drama', 'action', 'thriller', 'scifi', 'come...",8.6,A powerful sorceress in a blind woman's body e...
2062,Scissor Seven,"['comedy', 'drama', 'action', 'animation', 'fa...",8.6,"Seeking to recover his memory, a scissor-wield..."
1719,Cobra Kai,"['action', 'drama', 'sport', 'comedy']",8.5,This Karate Kid sequel series picks up 30 year...
5415,The Creature Cases,"['animation', 'comedy', 'family', 'action']",8.5,"Follows Sam Snow and Kit Casey, two brilliant ..."
5014,Vincenzo,"['action', 'drama', 'comedy', 'crime', 'romance']",8.4,Vincenzo Cassano is an Italian lawyer and Mafi...
