In [462]:
import pandas as pd
import networkx as nx

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [463]:
#Import dataset from CSV file
f = pd.read_csv('./netflix_titles.csv')

In [464]:
#Read CSV file into DataFrame
df = pd.DataFrame(f)
df = df.fillna('')
df['index'] = df.index

df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,index
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",1
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",3
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",8802
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",8803
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,8804
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",8805


In [465]:
#Combine all features to compare titles with
def combined_features(row):
    return row['cast']+" "+row['director']+", "+row['rating']+", "+row['listed_in']

df['combined_features'] = df.apply(combined_features, axis=1)

In [466]:
#Count number of features present in each title
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])
print("Count Matrix: ", count_matrix.toarray())

#Set  similarity metirc
cosine_sim = cosine_similarity(count_matrix)

Count Matrix:  [[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


# Query 1

In [467]:
#Set target movie
target_title = 'Grown Ups'

#Show target features
list(df[df['title']==target_title]['combined_features'])

['Adam Sandler, Kevin James, Chris Rock, David Spade, Rob Schneider, Salma Hayek, Maria Bello, Maya Rudolph, Colin Quinn, Tim Meadows, Joyce Van Patten Dennis Dugan, PG-13, Comedies']

In [468]:
#Get movie index
def get_index_from(title): 
    return df[df['title'] == title]['index'].values[0]


movie_index = get_index_from(target_title)

#Generate distances from target title to all other titles
similar_movies = list(enumerate(cosine_sim[movie_index]))

similar_movies

[(0, 0.1690308509457033),
 (1, 0.0),
 (2, 0.02786391062876764),
 (3, 0.0),
 (4, 0.029160592175990215),
 (5, 0.0),
 (6, 0.07018624063435963),
 (7, 0.0),
 (8, 0.0),
 (9, 0.21428571428571422),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.07142857142857141),
 (14, 0.0),
 (15, 0.034503277967117704),
 (16, 0.0),
 (17, 0.0),
 (18, 0.035714285714285705),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0975900072948533),
 (23, 0.0),
 (24, 0.04583492485141056),
 (25, 0.0),
 (26, 0.0890870806374748),
 (27, 0.9999999999999997),
 (28, 0.07273929674533079),
 (29, 0.07559289460184544),
 (30, 0.0),
 (31, 0.06900655593423541),
 (32, 0.03026137663344012),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.04029114820126901),
 (38, 0.07018624063435963),
 (39, 0.0),
 (40, 0.03026137663344012),
 (41, 0.035714285714285705),
 (42, 0.035714285714285705),
 (43, 0.07142857142857141),
 (44, 0.11118739749916517),
 (45, 0.0944911182523068),
 (46, 0.0),
 (47, 0.02699746235780194),
 (48, 0.0),
 (49, 0.02916059217599

In [469]:
#Sort to get most similar titles first
sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True)
sorted_similar_movies

[(27, 0.9999999999999997),
 (1879, 0.4125143236626951),
 (5533, 0.3335621924974955),
 (6018, 0.32732683535398854),
 (6303, 0.32142857142857134),
 (1353, 0.2760262237369417),
 (7008, 0.2760262237369417),
 (8790, 0.2594372608313854),
 (7014, 0.2593756879669057),
 (2540, 0.2545875386086578),
 (867, 0.253546276418555),
 (8742, 0.23385358667337128),
 (7316, 0.22237479499833035),
 (5227, 0.22047927592204916),
 (8253, 0.2182178902359924),
 (6582, 0.21428571428571425),
 (9, 0.21428571428571422),
 (1808, 0.21428571428571422),
 (8284, 0.21428571428571422),
 (1418, 0.20965696734438366),
 (6497, 0.20701966780270625),
 (8372, 0.20701966780270625),
 (1454, 0.20619652471058064),
 (5258, 0.20619652471058064),
 (8457, 0.2038588765750502),
 (7998, 0.2036532699906392),
 (6977, 0.20365326999063918),
 (3753, 0.20145574100634503),
 (168, 0.20044593143431827),
 (6164, 0.1973855084879307),
 (6583, 0.1973855084879307),
 (8158, 0.1973855084879307),
 (231, 0.19702760155977517),
 (5179, 0.19702760155977517),
 (91

In [470]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

#Print top ten most similar titles
i=0
for movies in sorted_similar_movies:
    print(get_title_from_index(movies[0]), movies[1])
    i = i+1;
    if i>10:
        break

Grown Ups 0.9999999999999997
Hubie Halloween 0.4125143236626951
Sandy Wexler 0.3335621924974955
50 First Dates 0.32732683535398854
Big Daddy 0.32142857142857134
Beverly Hills Ninja 0.2760262237369417
Hotel Transylvania 3: Summer Vacation 0.2760262237369417
You Don't Mess with the Zohan 0.2594372608313854
How to Be a Latin Lover 0.2593756879669057
The Wrong Missy 0.2545875386086578
The Last Days 0.253546276418555


# Query 2

In [471]:
#Set target show
target_title = 'The Flash'

#Show target features
list(df[df['title']==target_title]['combined_features'])

['Grant Gustin, Candice Patton, Danielle Panabaker, Carlos Valdes, Tom Cavanagh, Jesse L. Martin, Neil Sandilands, Britne Oldford, Danielle Nicolet, Keiynan Lonsdale, Kim Engelbrecht Glen Winter, TV-14, Crime TV Shows, TV Action & Adventure, TV Sci-Fi & Fantasy']

In [472]:
#Get movie index
def get_index_from(title): 
    return df[df['title'] == title]['index'].values[0]


movie_index = get_index_from(target_title)

#Generate distances from target title to all other titles
similar_movies = list(enumerate(cosine_sim[movie_index]))

similar_movies

[(0, 0.0),
 (1, 0.3053290134455174),
 (2, 0.43788026951985703),
 (3, 0.42761798705987897),
 (4, 0.3927922024247863),
 (5, 0.30237157840738177),
 (6, 0.0),
 (7, 0.1131370849898476),
 (8, 0.412837477233712),
 (9, 0.0),
 (10, 0.5144957554275265),
 (11, 0.33844564489065976),
 (12, 0.10504514628777804),
 (13, 0.10690449676496974),
 (14, 0.5144957554275265),
 (15, 0.30983866769659335),
 (16, 0.16329931618554522),
 (17, 0.37625606633113623),
 (18, 0.13363062095621217),
 (19, 0.4364357804719848),
 (20, 0.51910854761844),
 (21, 0.42163702135578396),
 (22, 0.1460593486680443),
 (23, 0.1414213562373095),
 (24, 0.17149858514250882),
 (25, 0.4949747468305833),
 (26, 0.13333333333333336),
 (27, 0.0),
 (28, 0.08164965809277261),
 (29, 0.0),
 (30, 0.12499999999999999),
 (31, 0.20655911179772887),
 (32, 0.4076197322920545),
 (33, 0.3286335345030997),
 (34, 0.29199855803537256),
 (35, 0.10690449676496974),
 (36, 0.15491933384829665),
 (37, 0.36181361349331637),
 (38, 0.05252257314388902),
 (39, 0.266666

In [473]:
#Sort to get most similar titles first
sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True)
sorted_similar_movies

[(380, 1.0000000000000004),
 (2190, 0.6487446070815475),
 (3707, 0.5883484054145521),
 (3053, 0.5773502691896258),
 (4250, 0.5773502691896258),
 (657, 0.5602794333886092),
 (6166, 0.5554920598635309),
 (749, 0.5547001962252291),
 (7184, 0.5547001962252291),
 (4015, 0.5499999999999999),
 (4494, 0.5499999999999999),
 (6513, 0.5499999999999999),
 (7081, 0.5499999999999999),
 (8755, 0.5499999999999999),
 (218, 0.5491251783869153),
 (224, 0.5491251783869153),
 (368, 0.5491251783869153),
 (549, 0.5491251783869153),
 (1060, 0.5491251783869153),
 (1383, 0.5491251783869153),
 (1623, 0.5491251783869153),
 (3726, 0.5491251783869153),
 (8199, 0.5491251783869153),
 (369, 0.5487954724560282),
 (1951, 0.5487954724560282),
 (2165, 0.5487954724560282),
 (5822, 0.5487954724560282),
 (7324, 0.5487954724560282),
 (7629, 0.5487954724560282),
 (8541, 0.5487954724560282),
 (366, 0.5484827557301445),
 (430, 0.5484827557301445),
 (1093, 0.5484827557301445),
 (2657, 0.5484827557301445),
 (2982, 0.54848275573014

In [474]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

#Print top ten most similar titles
i=0
for movies in sorted_similar_movies:
    print(get_title_from_index(movies[0]), movies[1])
    i = i+1;
    if i>10:
        break

The Flash 1.0000000000000004
The Umbrella Academy 0.6487446070815475
Motown Magic 0.5883484054145521
Border Security: America's Front Line 0.5773502691896258
Pioneers: First Women Filmmakers* 0.5773502691896258
The Mole 0.5602794333886092
Anjaan: Rural Myths 0.5554920598635309
L.A.’s Finest 0.5547001962252291
Khan: No. 1 Crime Hunter 0.5547001962252291
The Disappearance of Madeleine McCann 0.5499999999999999
Making a Murderer 0.5499999999999999


# Query 3

In [475]:
#Set target movie
target_title = 'Insidious'

#Show target features
list(df[df['title']==target_title]['combined_features'])

['Patrick Wilson, Rose Byrne, Lin Shaye, Ty Simpkins, Barbara Hershey, Leigh Whannell, Angus Sampson, Andrew Astor, Joseph Bishara James Wan, PG-13, Horror Movies, Thrillers']

In [476]:
#Get movie index
def get_index_from(title): 
    return df[df['title'] == title]['index'].values[0]


movie_index = get_index_from(target_title)

#Generate distances from target title to all other titles
similar_movies = list(enumerate(cosine_sim[movie_index]))

similar_movies

[(0, 0.17888543819998318),
 (1, 0.025400025400038103),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.026726124191242442),
 (6, 0.11141720290623112),
 (7, 0.08000000000000002),
 (8, 0.0),
 (9, 0.07559289460184544),
 (10, 0.0),
 (11, 0.0),
 (12, 0.037139067635410375),
 (13, 0.07559289460184544),
 (14, 0.0),
 (15, 0.03651483716701107),
 (16, 0.05773502691896259),
 (17, 0.0),
 (18, 0.03779644730092272),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.10327955589886445),
 (23, 0.05),
 (24, 0.09701425001453319),
 (25, 0.0),
 (26, 0.09428090415820635),
 (27, 0.11338934190276817),
 (28, 0.15396007178390023),
 (29, 0.12000000000000002),
 (30, 0.07071067811865475),
 (31, 0.0),
 (32, 0.0),
 (33, 0.025819888974716113),
 (34, 0.0),
 (35, 0.07559289460184544),
 (36, 0.03651483716701107),
 (37, 0.0),
 (38, 0.07427813527082075),
 (39, 0.0),
 (40, 0.0),
 (41, 0.07559289460184544),
 (42, 0.1889822365046136),
 (43, 0.15118578920369088),
 (44, 0.23533936216582085),
 (45, 0.1),
 (46, 0.04082482904638631),
 (47, 0.02

In [477]:
#Sort to get most similar titles first
sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True)
sorted_similar_movies

[(1118, 1.0000000000000007),
 (5903, 0.2910427500435996),
 (1283, 0.28),
 (8221, 0.27456258919345766),
 (867, 0.2683281572999748),
 (3102, 0.2683281572999748),
 (2097, 0.26666666666666666),
 (6180, 0.2618614682831909),
 (1284, 0.2514474228374849),
 (3450, 0.25021729686848976),
 (7061, 0.25021729686848976),
 (8384, 0.24370871833797697),
 (5110, 0.24253562503633297),
 (3764, 0.23570226039551587),
 (44, 0.23533936216582085),
 (1387, 0.23533936216582085),
 (5737, 0.22941573387056174),
 (283, 0.22677868380553634),
 (317, 0.22677868380553634),
 (560, 0.22677868380553634),
 (5240, 0.22677868380553634),
 (8275, 0.22677868380553634),
 (8403, 0.22677868380553634),
 (7009, 0.2267786838055363),
 (5042, 0.22360679774997896),
 (8722, 0.22283440581246222),
 (7168, 0.22188007849009167),
 (5091, 0.21821789023599242),
 (7210, 0.2155263624321299),
 (8548, 0.2155263624321299),
 (8211, 0.21320071635561044),
 (557, 0.21213203435596423),
 (5641, 0.21213203435596423),
 (6270, 0.2085144140570748),
 (7220, 0.20

In [478]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

#Print top ten most similar titles
i=0
for movies in sorted_similar_movies:
    print(get_title_from_index(movies[0]), movies[1])
    i = i+1;
    if i>10:
        break

Insidious 1.0000000000000007
Creep 0.2910427500435996
The Conjuring 0.28
The Boy 0.27456258919345766
The Last Days 0.2683281572999748
Sweetheart 0.2683281572999748
Rising Phoenix 0.26666666666666666
Apollo 18 0.2618614682831909
The Conjuring 2 0.2514474228374849
In the Tall Grass 0.25021729686848976
In The Deep 0.25021729686848976
