In [46]:
# Data handling
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [47]:
# Load the Netflix catalogue; the CSV is read from the notebook's working directory
df = pd.read_csv("netflix_titles.csv")

In [48]:
# Preview the first rows to see which fields are available
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [49]:
# List the column labels of the loaded frame
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [50]:
# Count the missing entries in every column
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [51]:
# Replace missing entries with empty strings, one column at a time
for column in ('director', 'cast', 'country', 'date_added', 'rating', 'duration'):
    df[column] = df[column].fillna('')

In [52]:
# Cleaning text
# NOTE: an earlier variant also removed every space
# (x.lower().replace(" ", "")); it was disabled in favour of plain lower-casing.
def clean_text(x):
    """Return ``x`` in lower case; any non-string value becomes ''."""
    if not isinstance(x, str):
        return ''
    return x.lower()

# Combining the selected text columns into one space-separated string per row
df['combined_features'] = df['title'].str.cat(
    [df['director'], df['cast'], df['listed_in'], df['description']],
    sep=' ',
)

In [53]:
# Lower-case the combined text element by element (non-string entries become '')
df['combined_features'] = df['combined_features'].map(clean_text)

In [54]:
# Spot-check the cleaned, combined text for the first rows
df['combined_features'].head()

0    dick johnson is dead kirsten johnson  document...
1    blood & water  ama qamata, khosi ngema, gail m...
2    ganglands julien leclercq sami bouajila, tracy...
3    jailbirds new orleans   docuseries, reality tv...
4    kota factory  mayur more, jitendra kumar, ranj...
Name: combined_features, dtype: object

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer; common English stop words are excluded from the vocabulary
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and encode the combined features column in one pass
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Check shape of the resulting matrix: (number of rows in df, vocabulary size)
print(tfidf_matrix.shape)


(8807, 52938)


In [56]:
from sklearn.metrics.pairwise import cosine_similarity

print("Calculating cosine similarity:-")
# Called with a single argument, the matrix is compared against itself,
# producing the full row-by-row similarity table.
cosine_sim = cosine_similarity(tfidf_matrix)

print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

# Peek at how the first show scores against the first ten shows
print("\nSimilarity scores for the first show:")
print(cosine_sim[0, :10])

Calculating cosine similarity:-
Cosine similarity matrix shape: (8807, 8807)

Similarity scores for the first show:
[1.         0.         0.         0.         0.00751303 0.
 0.         0.         0.02262031 0.00741236]


In [69]:
# Map each lower-cased show title to its row label in df (for the default
# RangeIndex produced by read_csv this equals the row position).
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. Filter on the index
# labels instead and keep the first occurrence, so that every title lookup
# yields a single scalar rather than a Series.
indices = pd.Series(df.index, index=df['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

# Check it
print("\nIndex of 'Stranger Things':", indices.get('stranger things'))


Index of 'Stranger Things': 3685


In [75]:
def recommend(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Show or movie title; matched case-insensitively against ``indices``.
    cosine_sim : array-like
        Pairwise similarity matrix; row ``i`` holds the scores of row ``i``
        of ``df`` against every row.
    df : pandas.DataFrame
        Catalogue with a ``title`` column, in the same row order as
        ``cosine_sim``.
    indices : pandas.Series
        Lookup from lower-cased title to row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles. Empty when ``title`` is not a string, is not
        present in ``indices``, or ``top_n`` is not positive.
    """
    if not isinstance(title, str) or top_n <= 0:
        return []

    # Lower-case the query so matching is case-insensitive
    key = title.lower()
    if key not in indices:
        return []

    # A title that occurs more than once makes ``indices[key]`` a Series
    # instead of a scalar; flatten and keep the first occurrence so exactly
    # one row of the similarity matrix is selected below.
    index = int(np.ravel(indices[key])[0])

    scores = np.asarray(cosine_sim[index])

    # Stable descending sort: rows with equal scores keep their original order
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried show by position rather than assuming it sorts first:
    # another row with identical text also scores 1.0 and may precede it.
    top_indices = ranked[ranked != index][:top_n]

    return df['title'].iloc[top_indices].tolist()


In [77]:
# Example query: fetch the recommendations for one known title
recommendations = recommend("Stranger Things")

# Print them as a numbered list, counting from 1
print("\nRecommended Shows:")
for idx, show in enumerate(recommendations, start=1):
    print(f"{idx}. {show}")


Recommended Shows:
1. Beyond Stranger Things
2. Prank Encounters
3. Eli
4. Things Heard & Seen
5. Homefront
6. The Umbrella Academy
7. Kiss Me First
8. The Sinner
9. Safe Haven
10. Good Witch


In [79]:
import pickle

# Persist the three objects the recommender needs: the similarity matrix,
# the dataframe and the title lookup, each to its own file.
# Only unpickle these files from a trusted location: pickle.load can execute
# arbitrary code embedded in the file.
for filename, artifact in (
    ('cosine_sim.pkl', cosine_sim),
    ('df.pkl', df),
    ('indices.pkl', indices),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)