In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

In [2]:
import nltk
# The cleaning step needs two NLTK resources: the stop-word list and the
# WordNet corpus that WordNetLemmatizer reads. Without 'wordnet', the first
# lemmatize() call raises LookupError on a machine that has not fetched it.
nltk.download('stopwords')
nltk.download('wordnet')
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# NOTE(review): this hides every warning for the rest of the notebook,
# including pandas' chained-assignment warnings - consider narrowing it.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phutran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
## get the file path
# Pick the dataset by name. The original kept whichever file os.walk visited
# last, so the result depended on directory order and on what other files
# (e.g. .DS_Store, the notebook itself) happened to live in the folder.
DATA_FILENAME = "netflix_titles.csv"

file_path = None
for dirname, _, filenames in os.walk(os.getcwd()):
    for filename in filenames:
        candidate = os.path.join(dirname, filename)
        print(candidate)
        # Keep the first file with the expected name; keep walking only so the
        # listing above stays complete.
        if file_path is None and filename == DATA_FILENAME:
            file_path = candidate

if file_path is None:
    # Fail here with a clear message instead of later inside pd.read_csv.
    raise FileNotFoundError(f"{DATA_FILENAME} not found under {os.getcwd()}")
file_path

/Users/phutran/Documents/Coding/Practice Python/Netflix Movie Recommendation/.DS_Store
/Users/phutran/Documents/Coding/Practice Python/Netflix Movie Recommendation/netflix movie recommendation.ipynb
/Users/phutran/Documents/Coding/Practice Python/Netflix Movie Recommendation/netflix_titles.csv


'/Users/phutran/Documents/Coding/Practice Python/Netflix Movie Recommendation/netflix_titles.csv'

In [4]:
# another way to get file path
# df = pd.read_csv('../Netflix Movie Recommendation/netflix_titles.csv')

# Import data

In [5]:
# Read the CSV found at `file_path` into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Data Wrangling

In [6]:
# Share of missing values per column, as a percentage rounded to 3 decimals.
missing_pct = df.isna().sum() / len(df) * 100
missing_pct.round(3)

show_id          0.000
type             0.000
title            0.000
director        29.908
cast             9.368
country          9.436
date_added       0.114
release_year     0.000
rating           0.045
duration         0.034
listed_in        0.000
description      0.000
dtype: float64

Check for duplicate `show_id` values.

In [7]:
# Count rows whose show_id repeats an earlier row (0 means every id is unique).
df.duplicated(subset=["show_id"]).sum()

0

#### Six columns contain missing values. Since they are all text columns, they are hard to impute, so we simply drop the rows that contain missing values.

In [8]:
# Discard every row that has at least one missing value, then confirm none remain.
df = df.dropna(axis="index", how="any")
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [9]:
# Show the frame after the drop; the original row labels are kept, so the index now has gaps.
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...
24,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"September 21, 2021",1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...
...,...,...,...,...,...,...,...,...,...,...,...,...
8801,s8802,Movie,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...","United Arab Emirates, Jordan","March 9, 2016",2015,TV-MA,96 min,"Dramas, International Movies, Thrillers",Recovering alcoholic Talal wakes up inside a s...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


Since we have removed the rows with missing values, it is good practice to reset the index.

In [10]:
# Renumber the rows from 0 and discard the old index rather than keeping it as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
1,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
2,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
3,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...
4,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"September 21, 2021",1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...
...,...,...,...,...,...,...,...,...,...,...,...,...
5327,s8802,Movie,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...","United Arab Emirates, Jordan","March 9, 2016",2015,TV-MA,96 min,"Dramas, International Movies, Thrillers",Recovering alcoholic Talal wakes up inside a s...
5328,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
5329,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
5330,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


Let's keep only the movie data.

In [11]:
# Take an explicit copy of the movie rows. Later cells modify `movie_df`
# (reset its index, drop columns, add a `keywords` column); doing that on a
# frame derived from a filter of `df` triggers pandas' SettingWithCopyWarning.
movie_df = df[df["type"] == "Movie"].copy()

In [12]:
# Renumber the movie rows from 0 so that index labels and row positions coincide.
movie_df = movie_df.reset_index(drop=True)

# Keep useful columns

In [13]:
# Columns whose text feeds the recommender.
movie_columns = ["title", "director", "listed_in", "description", "country"]
# Select the wanted columns (keeping the frame's existing column order) instead
# of dropping the others in place. The original built its drop list from `df`,
# so re-running the cell raised KeyError once those columns were already gone.
movie_df = movie_df[[col for col in movie_df.columns if col in movie_columns]].copy()
movie_df.head()

Unnamed: 0,title,director,country,listed_in,description
0,Sankofa,Haile Gerima,"United States, Ghana, Burkina Faso, United Kin...","Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
1,The Starling,Theodore Melfi,United States,"Comedies, Dramas",A woman adjusting to life after a loss contend...
2,Je Suis Karl,Christian Schwochow,"Germany, Czech Republic","Dramas, International Movies",After most of her family is murdered in a terr...
3,Jeans,S. Shankar,India,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...
4,Grown Ups,Dennis Dugan,United States,Comedies,Mourning the loss of their beloved junior high...


# Cleaning Text Data

As good practice, we remove punctuation and other non-letter characters, remove stop words, and apply lemmatization (which reduces inflected words to their dictionary form, e.g. "movies" → "movie"). We also convert all letters to lower case, so that the same word written in different cases is matched against the stop-word list and counted as a single word by the count vectorizer.

In [14]:
# One text "document" per movie: every column value of the row, joined by spaces.
keywords = [
    ' '.join(row_values)
    for row_values in movie_df.itertuples(index=False, name=None)
]
keywords[:3]

['Sankofa Haile Gerima United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia Dramas, Independent Movies, International Movies On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.',
 "The Starling Theodore Melfi United States Comedies, Dramas A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward.",
 'Je Suis Karl Christian Schwochow Germany, Czech Republic Dramas, International Movies After most of her family is murdered in a terrorist bombing, a young woman is unknowingly lured into joining the very group that killed them.']

In [15]:
# Sanity check of the cleaning pattern on a tiny string: show the substituted
# text and the tokens obtained by splitting it on whitespace.
test = re.sub('[^a-zA-Z]', ' ', 'a basd')
print(test, test.split(), sep='\n')

a basd
['a', 'basd']


In [16]:
# Print the first five raw keyword strings, each followed by the same text
# with every non-letter character replaced by a space.
for i in range(5):
    raw_text = keywords[i]
    review = re.sub('[^a-zA-Z]', ' ', raw_text)
    print(raw_text, review, sep='\n')

Sankofa Haile Gerima United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia Dramas, Independent Movies, International Movies On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.
Sankofa Haile Gerima United States  Ghana  Burkina Faso  United Kingdom  Germany  Ethiopia Dramas  Independent Movies  International Movies On a photo shoot in Ghana  an American model slips back in time  becomes enslaved on a plantation and bears witness to the agony of her ancestral past 
The Starling Theodore Melfi United States Comedies, Dramas A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward.
The Starling Theodore Melfi United States Comedies  Dramas A woman adjusting to life after a loss contends with a feisty bird that s taken over her garden   and a husband who s struggling to find a way for

In [17]:
lem = WordNetLemmatizer()

# Build the stop-word set and compile the pattern once. The original rebuilt
# set(stopwords.words('english')) for every single token of every movie.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')
# NOTE(review): the pattern keeps ASCII letters only, so accented names are
# split at the accented character - confirm this is acceptable.


def clean_text(text):
    """Normalise one keyword string for vectorisation.

    Replaces every character outside a-z/A-Z with a space, lower-cases the
    text, splits it on whitespace, drops English stop words and lemmatises the
    remaining tokens.

    Args:
        text: Raw text of one movie.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    tokens = non_letters.sub(' ', text).lower().split()
    return ' '.join(lem.lemmatize(tok) for tok in tokens if tok not in stop_words)


corpus = [clean_text(text) for text in keywords]

In a regular-expression character set `[...]`, a leading `^` means "not in", so `[^a-zA-Z]` matches any character that is not an ASCII letter.

In [18]:
# Store the cleaned text alongside each movie and preview the result.
movie_df = movie_df.assign(keywords=corpus)
movie_df.head()

Unnamed: 0,title,director,country,listed_in,description,keywords
0,Sankofa,Haile Gerima,"United States, Ghana, Burkina Faso, United Kin...","Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",sankofa haile gerima united state ghana burkin...
1,The Starling,Theodore Melfi,United States,"Comedies, Dramas",A woman adjusting to life after a loss contend...,starling theodore melfi united state comedy dr...
2,Je Suis Karl,Christian Schwochow,"Germany, Czech Republic","Dramas, International Movies",After most of her family is murdered in a terr...,je suis karl christian schwochow germany czech...
3,Jeans,S. Shankar,India,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...,jean shankar india comedy international movie ...
4,Grown Ups,Dennis Dugan,United States,Comedies,Mourning the loss of their beloved junior high...,grown ups dennis dugan united state comedy mou...


# Featuring Count Matrix

I'm going to use count vectorizer for this task. However, you can use TF-IDF as well.

CountVectorizer is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text. ... The value of each cell is nothing but the count of the word in that particular text sample.

In [19]:
# Learn the vocabulary of the cleaned keywords and build the count matrix
# (one row per movie, one column per vocabulary term).
cv=CountVectorizer()
cvdf=cv.fit_transform(movie_df['keywords'])

In [20]:
# cvdf is a sparse matrix, so convert it to a dense array to display the counts.
# NOTE(review): this materialises the whole matrix just for display - consider slicing first.
cvdf.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Let's calculate Cosine similarity

We use cosine similarity to measure how similar two movies are (a linear kernel gives the same result when the vectors are L2-normalised, as they are with TF-IDF by default).

In [21]:
# Pairwise cosine similarity between all rows of the count matrix:
# cs[i][j] is the similarity between movie i and movie j.
cs = cosine_similarity(cvdf)
cs

array([[1.        , 0.13977131, 0.17069719, ..., 0.17069719, 0.16012815,
        0.16343011],
       [0.13977131, 1.        , 0.09304842, ..., 0.18609684, 0.17457431,
        0.08908708],
       [0.17069719, 0.09304842, 1.        , ..., 0.04545455, 0.08528029,
        0.17407766],
       ...,
       [0.17069719, 0.18609684, 0.04545455, ..., 1.        , 0.17056057,
        0.04351941],
       [0.16012815, 0.17457431, 0.08528029, ..., 0.17056057, 1.        ,
        0.08164966],
       [0.16343011, 0.08908708, 0.17407766, ..., 0.04351941, 0.08164966,
        1.        ]])

# Recommendations

In [22]:
#let's write a function to get recommendations for given movie
def movie_rec(title, top_n=10, movies=None, similarity=None):
    """Recommend the movies most similar to the one matching ``title``.

    The query is resolved in three steps, stopping at the first that matches:
    an exact title, then a case-sensitive substring (the original behaviour),
    then a case-insensitive substring. The first matching row is used.

    Args:
        title: Full or partial movie title to look up.
        top_n: Number of recommendations to return (default 10).
        movies: DataFrame with a ``title`` column; defaults to the notebook's
            ``movie_df``.
        similarity: Square similarity matrix whose row/column order follows
            the rows of ``movies``; defaults to the notebook's ``cs``.

    Returns:
        pandas.Series of up to ``top_n`` titles, most similar first, carrying
        the index labels of the corresponding rows in ``movies``.

    Raises:
        IndexError: If no title matches the query.
    """
    if movies is None:
        movies = movie_df
    if similarity is None:
        similarity = cs

    titles = movies["title"].astype(str)

    # Resolve the query to a row POSITION, because the similarity matrix is
    # positional and must not depend on the frame's index labels.
    positions = np.flatnonzero((titles == title).to_numpy(dtype=bool))
    if positions.size == 0:
        positions = np.flatnonzero(
            titles.str.contains(title, regex=False).to_numpy(dtype=bool)
        )
    if positions.size == 0:
        positions = np.flatnonzero(
            titles.str.lower().str.contains(title.lower(), regex=False).to_numpy(dtype=bool)
        )
    if positions.size == 0:
        raise IndexError(f"no movie title matches {title!r}")
    movie_pos = positions[0]

    # Stable sort on the negated scores: highest similarity first, ties kept
    # in ascending row order (same tie order as sorted(..., reverse=True)).
    scores = np.asarray(similarity[movie_pos])
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query movie explicitly. Skipping "the first entry" is wrong
    # when another movie ties with it at similarity 1.0.
    ranked = ranked[ranked != movie_pos][:top_n]

    return movies["title"].iloc[ranked]

In [23]:
# Look up the movie matching "mama" and list its ten closest titles.
movie_rec("mama")

681                      Leap!
716            Cemara's Family
4771           The Breadwinner
1746            Big Time Movie
1311             Fan of Amoory
2786    Ghost of the Mountains
2993                    Zapped
3647              Dear Dracula
2904                    Sahara
4894         The Little Prince
Name: title, dtype: object

According to the results above, we can recommend these 10 movies to people who have watched the movie matched by the query "mama".