In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from pandas.io.parsers.readers import read_csv
from pandas.io.excel import read_excel
# Load the Netflix catalogue; the path is relative to the working directory.
netflix_csv = 'netflix_titles.csv'
df_1 = pd.read_csv(netflix_csv)

In [None]:
# Preview the first five rows of df_1.
df_1.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Peek at the free-text descriptions that feed the TF-IDF vectorizer.
df_1.loc[:, 'description'].head()

0    As her father nears the end of his life, filmm...
1    After crossing paths at a party, a Cape Town t...
2    To protect his family from a powerful drug lor...
3    Feuds, flirtations and toilet talk go down amo...
4    In a city of coaching centers known to train I...
Name: description, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Missing descriptions become empty strings so the vectorizer only sees text.
descriptions = df_1['description'].fillna('')
df_1['description'] = descriptions

# TF-IDF weights over the descriptions, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(descriptions)
tfidf_matrix.shape

(8807, 18895)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between every pair of TF-IDF rows. With the second
# argument omitted, linear_kernel compares the matrix against itself.
cosine_sum = linear_kernel(tfidf_matrix)

In [None]:
# Map each title to its row position in df_1. Series.drop_duplicates() compares
# the *values* (row positions, which are all distinct) and therefore removes
# nothing, so repeated titles are filtered on the index instead, keeping the
# first occurrence. This guarantees indices[title] is a single position.
indices = pd.Series(df_1.index, index=df_1['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sum=cosine_sum, top_n=10):
  """Return the titles most similar to `title`.

  Args:
    title: Title to look up in the module-level `indices` Series.
    cosine_sum: Square similarity matrix whose rows/columns follow the row
      order of `df_1`. The default is bound when this cell is executed, so
      pass a matrix explicitly to use one that was created later.
    top_n: Maximum number of recommendations to return.

  Returns:
    A Series with up to `top_n` titles from `df_1`, most similar first.

  Raises:
    KeyError: If `title` is not present in `indices`.
  """
  idx = indices[title]
  # A title that occurs more than once yields a Series; use its first row.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # Exclude the queried row explicitly instead of assuming it sorts first:
  # when scores tie (for example an all-zero row) it is not guaranteed to be
  # at position 0, and slicing [1:] would then return the title itself.
  sim_scores = [(i, score) for i, score in enumerate(cosine_sum[idx]) if i != idx]
  # sorted() is stable, so tied scores keep their original row order.
  sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
  movie_indices = [i for i, _ in sim_scores[:top_n]]
  return df_1['title'].iloc[movie_indices]

In [None]:
# Recommendations from the description-based similarity matrix, passed explicitly.
get_recommendations('Blood & Water', cosine_sum=cosine_sum)

5344    Message from the King
1884      Walk Away from Love
4285                    Lilli
4271               Lion Pride
4209               Next Enti?
613                 Voiceless
108                 Dive Club
1905             Cold Harbour
6289                  Bewafaa
5485                Ram Jaane
Name: title, dtype: object

In [None]:
# Same query, relying on the function's default similarity matrix.
get_recommendations(title='Blood & Water')

5344    Message from the King
1884      Walk Away from Love
4285                    Lilli
4271               Lion Pride
4209               Next Enti?
613                 Voiceless
108                 Dive Club
1905             Cold Harbour
6289                  Bewafaa
5485                Ram Jaane
Name: title, dtype: object

In [None]:
def get_list(x, max_items=1000):
  """Normalise a multi-valued cell into a list of names.

  Args:
    x: One of
      * a comma-separated string such as "Ama Qamata, Khosi Ngema",
      * a list of dicts that each carry a 'name' key,
      * a list of plain values (each is converted with str()),
      * anything else (NaN, None, ...), which is treated as missing.
    max_items: Upper bound on the number of names returned.

  Returns:
    A list of at most `max_items` names; an empty list for missing input.

  Raises:
    KeyError: If a dict element has no 'name' key.
  """
  if isinstance(x, str):
    # Split on commas and discard the blanks left by stray separators.
    names = [part.strip() for part in x.split(',')]
    names = [name for name in names if name]
  elif isinstance(x, list):
    names = [i['name'] if isinstance(i, dict) else str(i) for i in x]
  else:
    return []
  return names[:max_items]

In [None]:

# `cast` and `listed_in` hold comma-separated strings (NaN when missing), so
# they are split into lists here. `description` is free text and is left as a
# plain string instead of being replaced by an empty list.
def _split_names(value):
  """Return the comma-separated entries of `value` as a list of strings."""
  if isinstance(value, list):
    return value  # already converted; keeps this cell safe to re-run
  if isinstance(value, str):
    return [part.strip() for part in value.split(',') if part.strip()]
  return []  # NaN / missing value

features = ['cast', 'listed_in']
for feature in features:
  df_1[feature] = df_1[feature].apply(_split_names)


In [None]:
# Show the first three rows of the columns that feed the metadata soup.
preview_columns = ['title', 'type', 'listed_in', 'cast', 'director']
df_1[preview_columns].head(3)

Unnamed: 0,title,type,listed_in,cast,director
0,Dick Johnson Is Dead,Movie,[],[],kirstenjohnson
1,Blood & Water,TV Show,[],[],
2,Ganglands,TV Show,[],[],julienleclercq


In [None]:
def clean_data(x):
    """Lower-case a value and strip every space from it.

    A list is processed element by element and returned as a new list; a
    string is returned as a single squashed string; any other value (such as
    a missing director) becomes the empty string.
    """
    def _squash(text):
        # "Kirsten Johnson" -> "kirstenjohnson"
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    # Neither a list nor a string: treat the value as missing.
    return ''

In [None]:
# Fuse multi-word names and genres into single tokens (e.g. "kirstenjohnson").
# `description` is deliberately not listed: clean_data deletes every space,
# which would collapse free text into one long, meaningless token.
features = ['cast', 'listed_in', 'director']

for feature in features:
    df_1[feature] = df_1[feature].apply(clean_data)

In [None]:
def create_soup(x):
    """Build the space-separated metadata string ("soup") for one title.

    Args:
        x: A row (Series or mapping) providing `listed_in`, `cast`,
            `director` and `description`. Each field may be a list of
            strings or a single string; any other value (e.g. NaN) is
            treated as empty.

    Returns:
        The four fields joined by single spaces, in the order
        listed_in, cast, director, description.
    """
    def _as_text(value):
        # str.join applied to a plain string would insert a space between
        # every character, so strings are passed through unchanged.
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return ' '.join(str(item) for item in value)
        return ''

    fields = ('listed_in', 'cast', 'director', 'description')
    return ' '.join(_as_text(x[field]) for field in fields)
# Build one metadata string per row and store it in the `soup` column.
soup_per_title = df_1.apply(create_soup, axis=1)
df_1['soup'] = soup_per_title

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts over the metadata soup, ignoring English stop words.
count = CountVectorizer(stop_words='english')
soup_documents = df_1['soup']
count_matrix = count.fit_transform(soup_documents)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of soup count vectors. With the second
# argument omitted, the matrix is compared against itself.
cosine_sim2 = cosine_similarity(count_matrix)

In [None]:
# Renumber the rows 0..n-1 so they line up with the similarity matrices.
# drop=True discards the old index instead of inserting it as a new column,
# which keeps this cell safe to run more than once.
df_1 = df_1.reset_index(drop=True)

# Title -> row position lookup. Repeated titles keep only their first row so
# that indices[title] is always a single position rather than a Series.
indices = pd.Series(df_1.index, index=df_1['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Recommendations from the metadata-soup similarity matrix.
get_recommendations('Midnight Mass', cosine_sum=cosine_sim2)


5091                       Before I Wake
5252                       Gerald's Game
5852                                Hush
0                   Dick Johnson Is Dead
1                          Blood & Water
2                              Ganglands
3                  Jailbirds New Orleans
4                           Kota Factory
6       My Little Pony: A New Generation
7                                Sankofa
Name: title, dtype: object

In [None]:
# Same title as in the TF-IDF query, now scored with the metadata-soup matrix.
get_recommendations('Blood & Water', cosine_sum=cosine_sim2)

1                           Blood & Water
2                               Ganglands
3                   Jailbirds New Orleans
4                            Kota Factory
5                           Midnight Mass
6        My Little Pony: A New Generation
7                                 Sankofa
8           The Great British Baking Show
9                            The Starling
10    Vendetta: Truth, Lies and The Mafia
Name: title, dtype: object