In [29]:
import pandas as pd

In [30]:
# Load the raw title catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv("netflix_titles.csv")

In [31]:
# Remove, in one call, the four columns that no later cell in this notebook reads,
# then renumber the rows from 0.
df = df.drop(columns=["date_added", "cast", "rating", "duration"]).reset_index(drop=True)

In [32]:
# Replace missing director / country values with the literal placeholder 'Unknown'.
df[['director', 'country']] = df[['director', 'country']].fillna('Unknown')

In [33]:
# Keep only the rows whose type is 'TV Show'. reset_index() is called without
# drop=True, so rows are renumbered from 0 and the previous row labels are kept
# in a new 'index' column.
df = df.loc[df['type'].eq('TV Show')].reset_index()

In [34]:
df.shape

(2676, 9)

In [35]:
df.isnull().sum()

index           0
show_id         0
type            0
title           0
director        0
country         0
release_year    0
listed_in       0
description     0
dtype: int64

In [36]:
df.head()

Unnamed: 0,index,show_id,type,title,director,country,release_year,listed_in,description
0,1,s2,TV Show,Blood & Water,Unknown,South Africa,2021,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
1,2,s3,TV Show,Ganglands,Julien Leclercq,Unknown,2021,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2,3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,2021,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
3,4,s5,TV Show,Kota Factory,Unknown,India,2021,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
4,5,s6,TV Show,Midnight Mass,Mike Flanagan,Unknown,2021,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...


In [37]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [38]:
# Lemmatizer used by preprocess() to reduce each token to its lemma.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop-word list, stored as a set for the membership test in preprocess().
stop_words = set(stopwords.words('english'))

In [39]:
def preprocess(text):
    """Normalise a free-text description into a space-joined string of lemmas.

    Missing values (anything ``pd.isna`` flags) yield an empty string.
    Otherwise the text is lower-cased, every character other than ``a-z``,
    ``0-9``, whitespace and ``-`` is deleted, tokens that are stop words or
    have two or fewer characters are discarded, and each remaining token is
    lemmatized.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    stripped = re.sub(r'[^a-z0-9\s-]', '', lowered)

    lemmas = []
    for token in stripped.split():
        # Skip stop words and very short tokens (two characters or fewer).
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

In [40]:
def clean_genres(genres):
    """Turn a comma-separated genre string into space-separated genre tokens.

    Each genre is lower-cased and reduced to its letters only, so a multi-word
    genre collapses into a single token ("TV Dramas" -> "tvdramas"). The commas
    that separate genres become single spaces.

    Args:
        genres: Raw genre value, e.g. "Docuseries, Reality TV".

    Returns:
        The token string, or "" when ``genres`` is missing (``pd.isna``).
    """
    if pd.isna(genres):
        return ""

    # Keep letters and commas only. Whitespace is intentionally NOT kept:
    # dropping it is what fuses the words of one genre into one token.
    letters_and_commas = re.sub(r'[^a-z,]', '', str(genres).lower())

    # Commas delimit genres; skip empty pieces left by stray or doubled commas.
    return " ".join(token for token in letters_and_commas.split(",") if token)

In [41]:
# Clean both text columns. The cleaned text replaces the raw values under the
# same column names, so re-running this cell re-processes already-cleaned text.
df = df.assign(
    description=df['description'].map(preprocess),
    listed_in=df['listed_in'].map(clean_genres),
)

In [42]:
# Build the text the vectorizer sees for each show. Repetition acts as a weight:
# the space-padded genre string appears 10 times, the space-padded description
# 3 times, and director and country once each.
df['combined_features'] = (
    (' ' + df['listed_in'] + ' ').str.repeat(10)
    + ' '
    + (' ' + df['description'] + ' ').str.repeat(3)
    + ' '
    + df['director']
    + ' '
    + df['country']
)

In [43]:
df.head()

Unnamed: 0,index,show_id,type,title,director,country,release_year,listed_in,description,combined_features
0,1,s2,TV Show,Blood & Water,Unknown,South Africa,2021,internationaltvshows tvdramas tvmysteries,crossing path party cape town teen set prove w...,internationaltvshows tvdramas tvmysteries in...
1,2,s3,TV Show,Ganglands,Julien Leclercq,Unknown,2021,crimetvshows internationaltvshows tvactionadve...,protect family powerful drug lord skilled thie...,crimetvshows internationaltvshows tvactionadv...
2,3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,2021,docuseries realitytv,feud flirtation toilet talk among incarcerated...,docuseries realitytv docuseries realitytv d...
3,4,s5,TV Show,Kota Factory,Unknown,India,2021,internationaltvshows romantictvshows tvcomedies,city coaching center known train india finest ...,internationaltvshows romantictvshows tvcomedi...
4,5,s6,TV Show,Midnight Mass,Mike Flanagan,Unknown,2021,tvdramas tvhorror tvmysteries,arrival charismatic young priest brings glorio...,tvdramas tvhorror tvmysteries tvdramas tvhor...


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# TF-IDF vectorizer working on word tokens, configured with the 'english' stop-word option.
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [46]:
# Fit the vectorizer on the combined feature text and transform it in one step.
matrix = tfid.fit_transform(df['combined_features'])

In [47]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix;
# the result is a square array with one row and one column per show.
similarity_matrix = cosine_similarity(matrix)

In [48]:
similarity_matrix

array([[1.00000000e+00, 3.59474336e-02, 2.92925437e-04, ...,
        2.83536853e-02, 1.10750316e-01, 2.82615872e-02],
       [3.59474336e-02, 1.00000000e+00, 2.82678967e-04, ...,
        1.61893919e-04, 3.83621993e-02, 3.11791762e-04],
       [2.92925437e-04, 2.82678967e-04, 1.00000000e+00, ...,
        3.26029017e-04, 3.12602679e-04, 6.27899814e-04],
       ...,
       [2.83536853e-02, 1.61893919e-04, 3.26029017e-04, ...,
        1.00000000e+00, 1.79031617e-04, 1.20988761e-01],
       [1.10750316e-01, 3.83621993e-02, 3.12602679e-04, ...,
        1.79031617e-04, 1.00000000e+00, 3.44797283e-04],
       [2.82615872e-02, 3.11791762e-04, 6.27899814e-04, ...,
        1.20988761e-01, 3.44797283e-04, 1.00000000e+00]],
      shape=(2676, 2676))

In [49]:
similarity_matrix.shape

(2676, 2676)

In [50]:
def get_genres(title):
    """Return the ``listed_in`` value of the first show whose title matches.

    Titles are compared case-insensitively against ``df['title']``. When no
    row matches, the string "Show not found in database" is returned instead.
    """
    try:
        wanted = title.lower()
        matches = df.loc[df['title'].str.lower() == wanted]
        return matches.iloc[0]['listed_in']
    except IndexError:
        # .iloc[0] on an empty selection means no title matched.
        return "Show not found in database"

In [51]:
get_genres("Breaking Bad")

'crimetvshows tvdramas tvthrillers'

In [None]:
def recommend(title, top_n=19):
    """Return the shows most similar to ``title`` according to ``similarity_matrix``.

    Args:
        title: Show title to look up; compared case-insensitively against
            ``df['title']``. If several rows share the title, the first is used.
        top_n: Maximum number of recommendations to return. Defaults to 19,
            the number of results this function has always produced.

    Returns:
        A list of ``(title, genres)`` tuples ordered from most to least
        similar, or ``[("Title not found", "")]`` when no row matches.
    """
    # Find the show by row *position*, not index label: similarity_matrix rows
    # follow the row order of df, and df.iloc below is positional as well.
    is_match = (df['title'].str.lower() == title.lower()).to_numpy()
    if not is_match.any():
        return [("Title not found", "")]
    position = int(is_match.argmax())  # argmax on booleans gives the first True

    # Rank every *other* show by similarity. The queried show is excluded by
    # position rather than by assuming it sorts first, which would be wrong
    # whenever another row ties with it at similarity 1.0.
    ranked = sorted(
        (
            (other, score)
            for other, score in enumerate(similarity_matrix[position])
            if other != position
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recs = []
    for other, _ in ranked[:max(top_n, 0)]:
        # Read title and genres from the same row so that duplicate titles
        # cannot be paired with another row's genres.
        row = df.iloc[other]
        recs.append((row['title'], row['listed_in']))

    return recs



In [53]:
recommend("squid game")

[('Kakegurui', 'internationaltvshows tvdramas tvthrillers'),
 ('Gonul', 'internationaltvshows tvdramas tvthrillers'),
 ('Children of Adam', 'internationaltvshows tvdramas tvthrillers'),
 ('American Odyssey', 'tvdramas tvthrillers'),
 ('Nobel', 'internationaltvshows tvdramas tvthrillers'),
 ('50M2', 'internationaltvshows tvdramas tvthrillers'),
 ('House of Cards', 'tvdramas tvthrillers'),
 ('Darwin’s Game', 'animeseries internationaltvshows tvthrillers'),
 ('Marseille', 'internationaltvshows tvdramas tvthrillers'),
 ('The Spy', 'internationaltvshows tvdramas tvthrillers'),
 ('Winter Sun', 'internationaltvshows tvdramas tvthrillers'),
 ('Fatma', 'internationaltvshows tvdramas tvthrillers'),
 ('Breaking Bad', 'crimetvshows tvdramas tvthrillers'),
 ('High-Rise Invasion', 'animeseries internationaltvshows tvthrillers'),
 ('Containment', 'tvdramas tvthrillers'),
 ('The Lizzie Borden Chronicles', 'crimetvshows tvdramas tvthrillers'),
 ('Occupied', 'internationaltvshows tvdramas tvthrillers'),

In [54]:
import pickle

In [55]:
# Persist the similarity matrix. The with-block closes the file as soon as the
# dump finishes, so the data is flushed to disk and the handle is not leaked.
with open("similar.pkl", "wb") as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)

In [56]:
# Persist the processed DataFrame. The with-block closes the file as soon as
# the dump finishes, so the data is flushed to disk and the handle is not leaked.
with open("df.pkl", "wb") as frame_file:
    pickle.dump(df, frame_file)