In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("netflix_titles.csv")

In [3]:
df = df.drop("date_added", axis=1).reset_index(drop=True)
df = df.drop("cast", axis=1).reset_index(drop=True)
df = df.drop("rating", axis=1).reset_index(drop=True)
df = df.drop("duration", axis=1).reset_index(drop=True)

In [4]:
df['director'] = df['director'].fillna('Unknown')
df['country'] = df['country'].fillna('Unknown')

In [5]:
df = df[df['type'] == 'TV Show'].reset_index()

In [6]:
df.shape

(2676, 9)

In [7]:
df.isnull().sum()

index           0
show_id         0
type            0
title           0
director        0
country         0
release_year    0
listed_in       0
description     0
dtype: int64

In [8]:
df.head()

Unnamed: 0,index,show_id,type,title,director,country,release_year,listed_in,description
0,1,s2,TV Show,Blood & Water,Unknown,South Africa,2021,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
1,2,s3,TV Show,Ganglands,Julien Leclercq,Unknown,2021,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2,3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,2021,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
3,4,s5,TV Show,Kota Factory,Unknown,India,2021,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
4,5,s6,TV Show,Midnight Mass,Mike Flanagan,Unknown,2021,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess(text):
    """
    3-step cleaning:
    1. Basic normalization
    2. Stopword removal
    3. Lemmatization
    """
    if pd.isna(text):
        return ""
    
    # Step 1: Normalize
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s-]', '', text)  # Keep alphanumeric+hyphens
    
    # Step 2: Tokenize and remove stopwords
    tokens = [word for word in text.split() 
              if word not in stop_words and len(word) > 2]
    
    # Step 3: Lemmatize (gentler than stemming)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)

In [15]:
df['description'] = df['description'].apply(preprocess)
df['listed_in'] = df['listed_in'].apply(preprocess)

In [16]:
df['combined_features'] = 5 * (' ' + df['listed_in'] + ' ') + ' ' + 4 * (' ' + df['description'] + ' ') + ' ' + 2 * (' ' + df['director'] + ' ') + ' ' + df['country']

In [17]:
df.head()

Unnamed: 0,index,show_id,type,title,director,country,release_year,listed_in,description,combined_features
0,1,s2,TV Show,Blood & Water,Unknown,South Africa,2021,international show drama mystery,crossing path party cape town teen set prove w...,international show drama mystery internation...
1,2,s3,TV Show,Ganglands,Julien Leclercq,Unknown,2021,crime show international show action adventure,protect family powerful drug lord skilled thie...,crime show international show action adventur...
2,3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,2021,docuseries reality,feud flirtation toilet talk among incarcerated...,docuseries reality docuseries reality docus...
3,4,s5,TV Show,Kota Factory,Unknown,India,2021,international show romantic show comedy,city coaching center known train india finest ...,international show romantic show comedy inte...
4,5,s6,TV Show,Midnight Mass,Mike Flanagan,Unknown,2021,drama horror mystery,arrival charismatic young priest brings glorio...,drama horror mystery drama horror mystery d...


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [19]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [20]:
matrix = tfid.fit_transform(df['combined_features'])

In [21]:
similarity_matrix = cosine_distances(matrix)

In [22]:
similarity_matrix.shape

(2676, 2676)

In [23]:
def recommend(title):
    indx = df[df['title'].str.lower() == title.lower()].index[0]

    distances = sorted(list(enumerate(similarity_matrix[indx])), reverse = True, key = lambda x: x[1])

    shows = []
    for i in distances[1:20]:
        shows.append(df.iloc[i[0]].title)
    return shows

In [25]:
recommend("narcos")

['Office Girls',
 'Sab Jholmaal Hai',
 "Let's Fight Ghost",
 'Men on a Mission',
 'Nevenka: Breaking the Silence',
 'The House Arrest of Us',
 'Anitta: Made In Honório',
 'Before 30',
 'Afronta! Facing It!',
 'The Devil Punisher',
 'Dating Around: Brazil',
 'Trailer Park Boys: The Animated Series',
 'Almost Happy',
 'Fary : Hexagone',
 'Velvet',
 'Six Windows in the Desert',
 'Bring It On, Ghost',
 'Cheese in the Trap',
 'Reply 1994']