In [77]:
# Using TF-IDF Technique to create recommendation model with Sigmoid Kernal for movies and TV shows. The result helps in finding out similar movies and TV shows with the accuracy. 


# Import numpy and pandas

import numpy as np 
import pandas as pd

# Read the dataset

data = pd.read_csv("netflix_titles.csv")
print (data.shape)
data.head(5)

(7787, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [78]:
# Print the data columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


Content Based Sigmoid Kernel. Using the combined function to compute the similarity.

In [79]:
# Dropping the columns thata are not required

new_movies = data[data['type']=='Movie'].reset_index()
new_movies = new_movies.drop(columns = ['duration','country','date_added','release_year','show_id','type','index','listed_in'])

In [80]:
 # Searching if there is any null values

new_movies['director'] = new_movies['director'].fillna("")
new_movies['cast'] = new_movies['cast'].fillna("")

# Combining columns such as description, cast and directors

new_movies['combined'] = new_movies['description']+new_movies['cast']+movies['director']
new_movies.head(5)

Unnamed: 0,title,director,cast,rating,description,combined
0,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",TV-MA,After a devastating earthquake hits Mexico Cit...,After a devastating earthquake hits Mexico Cit...
1,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",R,"When an army recruit is found dead, his fellow...","When an army recruit is found dead, his fellow..."
2,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",PG-13,"In a postapocalyptic world, rag-doll robots hi...","In a postapocalyptic world, rag-doll robots hi..."
3,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",PG-13,A brilliant group of students become card-coun...,A brilliant group of students become card-coun...
4,122,Yasir Al Yasiri,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",TV-MA,"After an awful accident, a couple admitted to ...","After an awful accident, a couple admitted to ..."


In [81]:
# Importing TfidVectorizer to find out the best recommendation

from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(min_df = 3,max_features = None,analyzer = 'word',token_pattern = 'r\w{1,}', ngram_range = (1,3), stop_words = 'english')

In [82]:
movies['combined'] = movies['combined'].fillna("")

In [83]:
# Using Sigmoid Kernal and creating the matrix

tfv_matrix = tfv.fit_transform(new_movies['combined'])
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix,tfv_matrix)
sig[0]
indices = pd.Series(new_movies.index,index = new_movies['title']).drop_duplicates()
indices

  'stop_words.' % sorted(inconsistent))


title
7:19                                          0
23:59                                         1
9                                             2
21                                            3
122                                           4
                                           ... 
Zoom                                       5372
Zozo                                       5373
Zubaan                                     5374
Zulu Man in Japan                          5375
ZZ TOP: THAT LITTLE OL' BAND FROM TEXAS    5376
Length: 5377, dtype: int64

In [84]:
# Creating a recommendation model to find similar movies 

def recommend(title,sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores,key = lambda x:x[1], reverse = True)
    sig_scores = sig_scores[1:11]
    movies_indices = [i[0] for i in sig_scores]
    return new_movies['title'].iloc[movies_indices],sig_scores
  

In [85]:
recommend('Avengers: Infinity War')

(4895               Thor: Ragnarok
 3587     Resident Evil: Afterlife
 4140         Take Me Home Tonight
 1878                          Her
 4707     The Pursuit of Happyness
 605                 Before I Fall
 975     Chris D'Elia: Man on Fire
 179                 A Secret Love
 3449           Prescription Thugs
 5349         Yours, Mine and Ours
 Name: title, dtype: object,
 [(4895, 0.7616270181011502),
  (3587, 0.7616232966831555),
  (4140, 0.7616193098679381),
  (1878, 0.761617602870682),
  (4707, 0.7616143417188269),
  (605, 0.7616131942359952),
  (975, 0.7616130803234207),
  (179, 0.7616128661965993),
  (3449, 0.7616128656226869),
  (5349, 0.76161253836603)])

In [86]:
recommend('Tarzan')

(4158                               Tarif de nuit
 4334                                 The Command
 808     Brian Regan: Nunchucks and Flamethrowers
 4707                    The Pursuit of Happyness
 2788                                     Maynard
 2082          Indiana Jones and the Last Crusade
 4171                                Teach Us All
 2182                 Jeff Dunham: Beside Himself
 3700                            S Is for Stanley
 4898                          Thorne: Sleepyhead
 Name: title, dtype: object,
 [(4158, 0.7616181393619372),
  (4334, 0.7616172582464316),
  (808, 0.7616150096251881),
  (4707, 0.7616148951884403),
  (2788, 0.7616146430974647),
  (2082, 0.7616145351898341),
  (4171, 0.7616138606485937),
  (2182, 0.7616135532341477),
  (3700, 0.7616131948927722),
  (4898, 0.761612689658796)])

Performing Sigmoid Kernel on TV Shows

In [87]:
# Dropping the columns that are not required

tv_shows = data[data['type']=='TV Show'].reset_index()
tv_shows = tv_shows.drop(columns = ['duration','country','date_added','release_year','show_id','type','index','listed_in'])

In [88]:
 # Searching if there is any null values

tv_shows['director'] = tv_shows['director'].fillna("")
tv_shows['cast'] = tv_shows['cast'].fillna("")

# Combining columns such as description, cast and directors

tv_shows['combined'] = tv_shows['description']+tv_shows['cast']+tv_shows['director']

In [89]:
tv_shows.head()

Unnamed: 0,title,director,cast,rating,description,combined
0,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",TV-MA,In a future where the elite inhabit an island ...,In a future where the elite inhabit an island ...
1,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",TV-MA,A genetics professor experiments with a treatm...,A genetics professor experiments with a treatm...
2,1983,,"Robert Więckiewicz, Maciej Musiał, Michalina O...",TV-MA,"In this dark alt-history thriller, a naïve law...","In this dark alt-history thriller, a naïve law..."
3,1994,Diego Enrique Osorno,,TV-MA,Archival video and new interviews examine Mexi...,Archival video and new interviews examine Mexi...
4,Feb-09,,"Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-...",TV-14,"As a psychology professor faces Alzheimer's, h...","As a psychology professor faces Alzheimer's, h..."


In [90]:
# Importing TfidVectorizer to find out the best recommendation

from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df = 3,max_features = None,analyzer = 'word',token_pattern = 'r\w{1,}', ngram_range = (1,3), stop_words = 'english')
tv_shows['combined'] = tv_shows['combined'].fillna("")

In [91]:
# Using Sigmoid Kernal and creating the matrix

tfv_matrix_shows = tfv.fit_transform(tv_shows['combined'])

from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix_shows,tfv_matrix_shows)
sig[0]
indices = pd.Series(tv_shows.index,index = tv_shows['title']).drop_duplicates()
indices

  'stop_words.' % sorted(inconsistent))


title
3%                          0
46                          1
1983                        2
1994                        3
Feb-09                      4
                         ... 
Zindagi Gulzar Hai       2405
Zoids Wild               2406
Zombie Dumb              2407
Zona Rosa                2408
Zumbo's Just Desserts    2409
Length: 2410, dtype: int64

In [92]:
# Creating a recommendation model to find similar TV Shows
 
def recommend(title,sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores,key = lambda x:x[1], reverse = True)
    sig_scores = sig_scores[1:11]
    shows_indices = [i[0] for i in sig_scores]
    return tv_shows['title'].iloc[shows_indices], sig_scores

In [93]:
recommend('Iron Man: Armored Adventures')

(1977                                           The Hollow
 1923                                    The Dragon Prince
 383                       Cinderella and the Four Knights
 1076                  LEGO Jurassic World: Secret Exhibit
 2109                                        The Staircase
 1064    Learning Songs by Little Baby Bum: Nursery Rhy...
 1327             Mystery Science Theater 3000: The Return
 1159                                                Lupin
 1096               Little Baby Bum: Nursery Rhyme Friends
 759                                           Grand Hotel
 Name: title, dtype: object,
 [(1977, 0.761673673909367),
  (1923, 0.7616693914166767),
  (383, 0.7616568124636461),
  (1076, 0.7616491537416048),
  (2109, 0.7616452708075617),
  (1064, 0.7616438428587147),
  (1327, 0.7616435904246859),
  (1159, 0.7616431371729565),
  (1096, 0.7616400920896784),
  (759, 0.7616395289981868)])

In [94]:
recommend("Grey's Anatomy")

(186                           Battle Creek
 531                Do Do Sol Sol La La Sol
 33                      A Boy Name Flora A
 385                             Cinta Iris
 386               Cinta Si Wedding Planner
 1905                             The Crime
 1650                           Secret City
 1514                             President
 859                               Hormones
 1998    The Irregular at Magic High School
 Name: title, dtype: object,
 [(186, 0.7616726260963065),
  (531, 0.7616689331625517),
  (33, 0.7616669835173516),
  (385, 0.7616617394345969),
  (386, 0.7616576078500836),
  (1905, 0.761651856791308),
  (1650, 0.7616503959954495),
  (1514, 0.7616485062865156),
  (859, 0.7616475372370397),
  (1998, 0.761647517158233)])