In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# Read the titles CSV. `data` keeps every row exactly as loaded from disk.
data = pd.read_csv("./netflix_titles.csv")

print(len(data))

# Keep only the rows where all three searchable text columns are present, then
# renumber the index so that row labels coincide with row positions (row i of
# any matrix fitted on a column of `df` corresponds to the i-th row of `df`).
# Built as a new frame instead of mutating in place, so re-running this cell
# always starts from the untouched `data`.
df = (
    data
    .dropna(subset=['director', 'title', 'description'])
    .reset_index(drop=True)
)

print(len(df))

df.head()

8807
6173


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
3,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
4,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."


In [5]:
# Build one independent TF-IDF model per searchable text column.
def _fit_tfidf(texts):
    """Fit a fresh TF-IDF vectorizer (English stop words removed) on ``texts``.

    Returns a ``(vectorizer, matrix)`` pair: the fitted vectorizer and the
    document-term matrix it produced for ``texts``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer, vectorizer.fit_transform(texts)


tfidf_vectorizer_title, tfidf_matrix_title = _fit_tfidf(df['title'])
tfidf_vectorizer_director, tfidf_matrix_director = _fit_tfidf(df['director'])
tfidf_vectorizer_description, tfidf_matrix_description = _fit_tfidf(df['description'])

In [8]:
# Search helpers: each returns the top-N matching records of `df` as a list of
# JSON-style dicts, every dict carrying an extra "similarity_score" entry.
def _top_matches(query, vectorizer, tfidf_matrix, n):
    """Rank the rows of ``df`` against ``query`` and return the best ``n``.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : fitted vectorizer
        The vectorizer that produced ``tfidf_matrix``; used to project the
        query into the same feature space.
    tfidf_matrix : matrix
        Document-term matrix fitted on one column of ``df``. Row ``i`` must
        describe the ``i``-th row of ``df``.
    n : int
        Maximum number of records to return.

    Returns
    -------
    list of dict
        Up to ``n`` records, best match first. Each dict holds the columns of
        the matching row plus ``"similarity_score"`` as a plain ``float``.
        Rows are not filtered by score, so entries with a score of ``0.0`` are
        returned when fewer than ``n`` rows share any term with the query.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    query_vector = vectorizer.transform([query])
    # linear_kernel is a plain dot product; with TfidfVectorizer's default L2
    # row normalisation that dot product is the cosine similarity.
    scores = linear_kernel(query_vector, tfidf_matrix).flatten()

    # Positions of the n highest scores, highest first.
    top_positions = scores.argsort()[::-1][:n]

    # argsort yields row *positions*, so select with iloc rather than by label;
    # this stays correct even if df's index is not a fresh 0..len-1 range.
    records = df.iloc[top_positions].to_dict(orient='records')
    for record, position in zip(records, top_positions):
        # Store a native float so the record serialises and prints cleanly.
        record["similarity_score"] = float(scores[position])
    return records


def search_by_title(query, n=5):
    """Return the ``n`` records whose ``title`` best matches ``query``.

    See ``_top_matches`` for the shape of the result and the error raised on
    a negative ``n``.
    """
    return _top_matches(query, tfidf_vectorizer_title, tfidf_matrix_title, n)


def search_by_director(query, n=5):
    """Return the ``n`` records whose ``director`` best matches ``query``.

    See ``_top_matches`` for the shape of the result and the error raised on
    a negative ``n``.
    """
    return _top_matches(query, tfidf_vectorizer_director, tfidf_matrix_director, n)


def search_by_description(query, n=5):
    """Return the ``n`` records whose ``description`` best matches ``query``.

    See ``_top_matches`` for the shape of the result and the error raised on
    a negative ``n``.
    """
    return _top_matches(query, tfidf_vectorizer_description, tfidf_matrix_description, n)

In [13]:
# Demo: the three catalogue entries whose title is closest to the query.
title_query = "Home alone"
top_n_results_title = search_by_title(query=title_query, n=3)
print("Top 3 Results for Search by Title:", top_n_results_title, sep="\n")

Top 3 Results for Search by Title:
[{'show_id': 's492', 'type': 'Movie', 'title': 'Home Again', 'director': 'Hallie Meyers-Shyer', 'cast': 'Reese Witherspoon, Michael Sheen, Candice Bergen, Nat Wolff, Jon Rudnitsky, Pico Alexander, Lake Bell, Reid Scott, Dolly Wells, Lola Flanery, Eden Grace Redfield, P.J. Byrne', 'country': 'United States', 'date_added': 'July 8, 2021', 'release_year': 2017, 'rating': 'PG-13', 'duration': '97 min', 'listed_in': 'Comedies, Dramas, Romantic Movies', 'description': 'A newly single mom takes in three young male filmmakers as boarders, but her estranged husband returns to complicate her new, unconventional life.', 'similarity_score': 1.0}, {'show_id': 's1598', 'type': 'Movie', 'title': 'You Are My Home', 'director': 'Amanda Raymond', 'cast': 'Alyssa Milano, Angel Parker, Cristián de la Fuente, Eva Ariel Binder, Joel Steingold', 'country': 'United States', 'date_added': 'December 4, 2020', 'release_year': 2020, 'rating': 'TV-PG', 'duration': '93 min', 'list

In [14]:
# Demo: the three catalogue entries whose director is closest to the query.
director_query = "Christopher Nolan"
top_n_results_director = search_by_director(query=director_query, n=3)
print("\nTop 3 Results for Search by Director:", top_n_results_director, sep="\n")


Top 3 Results for Search by Director:
[{'show_id': 's341', 'type': 'Movie', 'title': 'Inception', 'director': 'Christopher Nolan', 'cast': 'Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page, Tom Hardy, Ken Watanabe, Dileep Rao, Cillian Murphy, Tom Berenger, Marion Cotillard, Pete Postlethwaite, Michael Caine, Lukas Haas', 'country': 'United States, United Kingdom', 'date_added': 'August 1, 2021', 'release_year': 2010, 'rating': 'PG-13', 'duration': '148 min', 'listed_in': 'Action & Adventure, Sci-Fi & Fantasy, Thrillers', 'description': "A troubled thief who extracts secrets from people's dreams takes one last job: leading a dangerous mission to plant an idea in a target's subconscious.", 'similarity_score': 1.0}, {'show_id': 's5825', 'type': 'Movie', 'title': 'Interview with a Serial Killer', 'director': 'Christopher Martin', 'cast': 'Arthur Shawcross', 'country': 'United States', 'date_added': 'August 1, 2016', 'release_year': 2008, 'rating': 'TV-MA', 'duration': '45 min', 'liste

In [15]:
# Demo: the three catalogue entries whose description is closest to the query.
description_query = "Christmas special"
top_n_results_description = search_by_description(query=description_query, n=3)
print("\nTop 3 Results for Search by Description:", top_n_results_description, sep="\n")


Top 3 Results for Search by Description:
[{'show_id': 's6055', 'type': 'Movie', 'title': 'A Russell Peters Christmas', 'director': 'Henry Sarwer-Foner', 'cast': 'Russell Peters, Pamela Anderson, Michael Bublé, Jon Lovitz, Scott Thompson, Faizon Love, Goapele, Ted Lange', 'country': 'Canada', 'date_added': 'November 1, 2013', 'release_year': 2011, 'rating': 'NR', 'duration': '44 min', 'listed_in': 'Stand-Up Comedy', 'description': 'Inspired by the variety shows of the 1970s, "A Russell Peters Christmas" is a sweet, silly, sentimental and, most of all, funny Christmas special.', 'similarity_score': 0.4376471765520135}, {'show_id': 's7789', 'type': 'Movie', 'title': 'Power Rangers: Megaforce: The Robo Knight Before Christmas', 'director': 'James Barr', 'cast': 'Andrew M. Gray, Ciara Hanna, John Mark Loudermilk, Christina Masterson, Azim Rizk, Shailesh Prajapati, Ian Harcourt', 'country': 'United States', 'date_added': 'January 1, 2016', 'release_year': 2013, 'rating': 'TV-Y7', 'duration'