In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
df = pd.read_csv('/content/netflix_titles.csv') #read data
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...


In [3]:
df['description'] = df['description'].str.lower()
df['title'] = df['title'].str.lower()

def tokenization(token_colum):
  column_to_tokenize = token_colum['description']
  token = nltk.word_tokenize(column_to_tokenize)
  words = [w for w in token if w.isalpha()]
  return words

df['description_tokens'] = df.apply(tokenization,axis=1)
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,description_tokens
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",in a future where the elite inhabit an island ...,"[in, a, future, where, the, elite, inhabit, an..."


In [4]:
sw = set (stopwords.words('english'))

def stpWords_fun(stpWords):
  lst_stpw = stpWords['description_tokens']
  cleaned = [w for w in lst_stpw if not w in sw]
  return cleaned

df['stopwords_removed'] = df.apply(stpWords_fun,axis=1)
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,description_tokens,stopwords_removed
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",in a future where the elite inhabit an island ...,"[in, a, future, where, the, elite, inhabit, an...","[future, elite, inhabit, island, paradise, far..."


In [5]:
lemmatizer = WordNetLemmatizer()

def lem_fun(column):
  to_lem = column['stopwords_removed']
  lemmed = [lemmatizer.lemmatize(w) for w in to_lem]
  return lemmed

df['description_lem'] = df.apply(lem_fun, axis=1)
df['description_lem']

0       [future, elite, inhabit, island, paradise, far...
1       [devastating, earthquake, hit, mexico, city, t...
2       [army, recruit, found, dead, fellow, soldier, ...
3       [postapocalyptic, world, robot, hide, fear, da...
4       [brilliant, group, student, become, expert, in...
                              ...                        
7782    [lebanon, civil, war, deprives, zozo, family, ...
7783    [scrappy, poor, boy, worm, way, tycoon, dysfun...
7784    [documentary, south, african, rapper, nasty, c...
7785    [dessert, wizard, adriano, zumbo, look, next, ...
7786    [documentary, delf, mystique, behind, trio, ex...
Name: description_lem, Length: 7787, dtype: object

In [6]:
def rejoin_words(row):
    my_list = row['description_lem']
    joined_words = ( " ".join(my_list))
    return joined_words

df['description_lem_join'] = df.apply(rejoin_words, axis=1)
df['description_lem_join']

0       future elite inhabit island paradise far crowd...
1       devastating earthquake hit mexico city trapped...
2       army recruit found dead fellow soldier forced ...
3       postapocalyptic world robot hide fear dangerou...
4       brilliant group student become expert intent s...
                              ...                        
7782    lebanon civil war deprives zozo family left gr...
7783    scrappy poor boy worm way tycoon dysfunctional...
7784    documentary south african rapper nasty c hit s...
7785    dessert wizard adriano zumbo look next willy w...
7786    documentary delf mystique behind trio explores...
Name: description_lem_join, Length: 7787, dtype: object

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
X = vectorizer.fit_transform(df['description_lem_join'])
print(vectorizer.get_feature_names())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
from sklearn.metrics.pairwise import linear_kernel

linear_model = linear_kernel(X,X)

In [10]:
title_index = pd.Series(df.index, index=df['title']).drop_duplicates()

In [11]:
def get_recommendations(title, linear_model=linear_model):
  title = title.lower()
  index = title_index[title]
  scores = list(enumerate(linear_model[index]))
  scores = sorted(scores, key=lambda x: x[1], reverse=True)
  scores = scores[1:11]
  movie_index = [i[0] for i in scores]

  
  return df['title'].iloc[movie_index],scores
    


In [None]:
list (df.title)

In [None]:
get_recommendations( 'big')

In [None]:
#splitting dataframe in a particular size 
df2 = df.sample(frac=0.6,random_state=200) 
df2.reset_index()


In [None]:
df2.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'description_tokens', 'stopwords_removed', 'description_lem',
       'description_lem_join'],
      dtype='object')

In [None]:
df2 = df.drop(columns=['show_id','country', 'date_added','release_year','duration', 'description','description_tokens', 'stopwords_removed','description_lem'])

In [None]:
df.to_csv('processed_data.csv', index=False)

In [None]:
df2.head(1)

Unnamed: 0,type,title,director,cast,rating,listed_in,description_lem_join
0,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &...",future elite inhabit island paradise far crowd...


In [None]:
df2.isnull().sum()

type                       0
title                      0
director                2389
cast                     718
rating                     7
listed_in                  0
description_lem_join       0
dtype: int64

In [None]:
df2=df2.fillna('')
df2

Unnamed: 0,type,title,director,cast,rating,listed_in,description_lem_join
0,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &...",future elite inhabit island paradise far crowd...
1,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",TV-MA,"Dramas, International Movies",devastating earthquake hit mexico city trapped...
2,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",R,"Horror Movies, International Movies",army recruit found dead fellow soldier forced ...
3,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",PG-13,"Action & Adventure, Independent Movies, Sci-Fi...",postapocalyptic world robot hide fear dangerou...
4,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",PG-13,Dramas,brilliant group student become expert intent s...
...,...,...,...,...,...,...,...
7782,Movie,zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...",TV-MA,"Dramas, International Movies",lebanon civil war deprives zozo family left gr...
7783,Movie,zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",TV-14,"Dramas, International Movies, Music & Musicals",scrappy poor boy worm way tycoon dysfunctional...
7784,Movie,zulu man in japan,,Nasty C,TV-MA,"Documentaries, International Movies, Music & M...",documentary south african rapper nasty c hit s...
7785,TV Show,zumbo's just desserts,,"Adriano Zumbo, Rachel Khoo",TV-PG,"International TV Shows, Reality TV",dessert wizard adriano zumbo look next willy w...


In [None]:
df2['type'] = pd.factorize(df2.type)[0] #factorize category 
df2['rating'] = pd.factorize(df2.rating)[0] #factorize category 
df2['title_id'] = pd.factorize(df2.title)[0] #factorize category 
df2['listed_in'] = pd.factorize(df2.title)[0] #factorize category 
df2['cast'] = pd.factorize(df2.title)[0] #factorize category 



In [None]:
df2.head(10)

Unnamed: 0,type,title,director,cast,rating,listed_in,description_lem_join,title_id
0,0,3%,,0,0,0,future elite inhabit island paradise far crowd...,0
1,1,7:19,Jorge Michel Grau,1,0,1,devastating earthquake hit mexico city trapped...,1
2,1,23:59,Gilbert Chan,2,1,2,army recruit found dead fellow soldier forced ...,2
3,1,9,Shane Acker,3,2,3,postapocalyptic world robot hide fear dangerou...,3
4,1,21,Robert Luketic,4,2,4,brilliant group student become expert intent s...,4
5,0,46,Serdar Akar,5,0,5,genetics professor experiment treatment comato...,5
6,1,122,Yasir Al Yasiri,6,0,6,awful accident couple admitted grisly hospital...,6
7,1,187,Kevin Reynolds,7,1,7,one high school student attack dedicated teach...,7
8,1,706,Shravan Kumar,8,3,8,doctor go missing psychiatrist wife treat biza...,8
9,1,1920,Vikram Bhatt,9,0,9,architect wife move castle slated become luxur...,9


In [None]:
df2.to_csv('processed_data.csv', index=False)