In [1]:
# import dependencies

import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
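## Optional (Google Colab only): upload CSV files from the local drive.
## Uncomment the lines below to run the upload.
## from google.colab import files
## uploaded = files.upload()

## import io
## df2 = pd.read_csv(io.BytesIO(uploaded['Filename.csv']))

# Once the lines above are uncommented and run, the dataset is stored in a pandas DataFrame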

In [3]:
# Load the Netflix catalogue into a pandas DataFrame and preview the first rows.
# The CSV is read by path from the working directory, so this cell runs on a
# fresh kernel and does not depend on an `uploaded` dict that only exists after
# an interactive Colab upload (place netflixData.csv next to the notebook).

data = pd.read_csv('netflixData.csv')
data.head(10)

In [None]:
# Count the missing values in every column

print(data.isna().sum(axis=0))

In [37]:
# Keep only the columns the recommender needs: the title, its description,
# the content type (Movie / TV Show) and the genre list.

recommender_columns = ["Title", "Description", "Content Type", "Genres"]
data = data[recommender_columns]
print(data.head(n=5))

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [38]:
# Discard every row that has a missing value in any of the kept columns

data = data.dropna(axis=0, how="any")
data

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"
...,...,...,...,...
5962,الف مبروك,"On his wedding day, an arrogant, greedy accoun...",Movie,"Comedies, Dramas, International Movies"
5963,دفعة القاهرة,A group of women leaves Kuwait to attend unive...,TV Show,"International TV Shows, TV Dramas"
5964,海的儿子,"Two brothers start a new life in Singapore, wh...",TV Show,"International TV Shows, TV Dramas"
5965,반드시 잡는다,After people in his town start turning up dead...,Movie,"Dramas, International Movies, Thrillers"


In [39]:
# Now clean the Title column, as it needs some data preparation:

# Text-processing dependencies: regular expressions and the punctuation table
# from the standard library, the stop-word corpus and stemmer from NLTK.
import re
import string

import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus (reported as up-to-date when already present).
nltk.download('stopwords')

# English stop words as a set for fast membership tests, plus the stemmer
# used to reduce words to their root form.
stopword = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

def clean(text, stop_words=None, stem=None):
    """Normalise a piece of text into a lower-case, stemmed token string.

    Steps, in order: lower-case; strip ``[...]`` spans, URLs, ``<...>`` tags,
    punctuation, newlines and any word containing a digit; drop stop words;
    stem the remaining words; join them with single spaces.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``stopword`` set.
    stem : callable, optional
        Function mapping a word to its stem. Defaults to ``stemmer.stem``.

    Returns
    -------
    str
        The cleaned text, with no leading, trailing or repeated spaces.
    """
    # Resolve the defaults lazily so the module-level NLTK objects are only
    # required when the caller does not supply replacements.
    if stop_words is None:
        stop_words = stopword
    if stem is None:
        stem = stemmer.stem

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d ...) from being
    # parsed as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument collapses runs of whitespace, so removed
    # tokens (numbers, punctuation) do not leave empty words behind that
    # would show up as leading or doubled spaces in the result.
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(stem(word) for word in words)
# Replace every title with its cleaned form
data["Title"] = data["Title"].map(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Peek at a random handful of titles to see the effect of the cleaning

print(data["Title"].sample(n=10))

2511    ken jeong complet ho
1174          danc forti one
4993             letter king
4533                    take
4309              small chop
4563                    taye
2753               live name
2974    marlon wayan wokeish
76          danger anim asia
2716             life plan b
Name: Title, dtype: object


In [41]:
# Use the Genres column as the feature for recommending similar content:
# each title's genre string is turned into a TF-IDF vector, and the pairwise
# cosine similarity between those vectors measures how alike two titles are.

feature = data["Genres"].tolist()
# `input` selects how the documents are supplied ('filename', 'file' or
# 'content'), not the documents themselves; the default 'content' is what is
# needed here because the raw strings are passed to fit_transform below.
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

In [42]:
# Map each (cleaned) title to its row position so that content can be looked
# up by title. Positions, not index labels, are stored because the similarity
# matrix is addressed positionally and dropna() may have left gaps in
# data.index. When a title occurs more than once, only its first row is kept
# so that a lookup always yields a single position.

indices = pd.Series(np.arange(len(data)), index=data['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [43]:
# Function to recommend Movies and TV shows on Netflix:

def netFlix_recommendation(title, similarity = similarity, top_n = 10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up. It is tried as given first and, if not found, once
        more after passing it through ``clean`` (the Title column holds
        cleaned titles).
    similarity : 2-D array-like, optional
        Pairwise similarity scores, addressed by row position. Defaults to the
        module-level ``similarity`` matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``data['Title']``, most similar first; the
        queried title itself is never included.

    Raises
    ------
    KeyError
        If the title is found neither as given nor in cleaned form.
    """
    if title not in indices.index:
        title = clean(title)
    if title not in indices.index:
        raise KeyError(f"Title not found: {title!r}")

    index = indices[title]
    # A duplicated title makes the lookup return a Series; use its first entry.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    scores = np.asarray(similarity[index])
    # Stable sort on the negated scores: highest score first, ties kept in
    # their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Do not recommend the queried title to itself.
    ranked = ranked[ranked != index][:top_n]
    return data['Title'].iloc[ranked]

print(netFlix_recommendation("girlfriend"))

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object
