In [1]:
!pip install rake-nltk # installing rake

# importing the required libraries and packages
from rake_nltk import Rake
import pandas as pd




In [21]:
# Load the Netflix catalogue, keep only the columns used to build the bag of
# words, and discard every row that is missing a value in any of them.
feature_columns = ['title', 'type', 'director', 'cast', 'description']
df = pd.read_csv('/content/netflix_titles.csv').loc[:, feature_columns].dropna()
df.head(3)  # preview the first three rows

Unnamed: 0,title,type,director,cast,description
1,7:19,Movie,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",After a devastating earthquake hits Mexico Cit...
2,23:59,Movie,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...","When an army recruit is found dead, his fellow..."
3,9,Movie,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...","In a postapocalyptic world, rag-doll robots hi..."


In [22]:
# Extract RAKE keywords from every description and store them, as a list, in a
# new 'keywords' column.
#
# The column is computed with Series.map and assigned back to the frame.
# Assigning into the rows yielded by DataFrame.iterrows() is not reliable:
# pandas documents that the row may be a copy, in which case the write is
# silently lost and 'keywords' would stay empty.

def extract_keywords(text):
    """Return the words RAKE scores for `text` (the keys of its word-degree mapping)."""
    rake = Rake()  # a fresh extractor per description, as in the original loop
    rake.extract_keywords_from_text(text)
    return list(rake.get_word_degrees().keys())


df['keywords'] = df['description'].map(extract_keywords)

# The raw description is no longer needed once its keywords are extracted.
# Reassigning (instead of inplace=True) keeps the cell free of in-place mutation.
df = df.drop(columns=['description'])
df.head(3)  # display the first 3 data points

Unnamed: 0,title,type,director,cast,keywords
1,7:19,Movie,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...","[trapped, survivors, trying, desperately, deva..."
2,23:59,Movie,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...","[army, recruit, terrifying, secret, found, dea..."
3,9,Movie,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...","[rag, brave, newcomer, joins, postapocalyptic,..."


In [24]:
# Turn each text column into a list of normalised tokens:
#   title    -> one token per space-separated word
#   cast     -> the first three comma-separated names
#   director -> one token per comma-separated name
#   type     -> one token per comma-separated value
# Every token is lower-cased and has its spaces removed, so a full name becomes
# a single token (e.g. "Shane Acker" -> "shaneacker").
#
# Each column is rebuilt with Series.map and assigned back to the frame.
# Assigning into the rows yielded by DataFrame.iterrows() may only modify a
# copy of the row, leaving the frame itself unchanged.

def normalise_tokens(tokens):
    """Lower-case every token and remove the spaces inside it."""
    return [token.lower().replace(' ', '') for token in tokens]


df['title'] = df['title'].map(lambda text: normalise_tokens(text.split(' ')))
df['cast'] = df['cast'].map(lambda text: normalise_tokens(text.split(',')[:3]))
df['director'] = df['director'].map(lambda text: normalise_tokens(text.split(',')))
df['type'] = df['type'].map(lambda text: normalise_tokens(text.split(',')))

df.head(3)  # looking at the first 3 data points

Unnamed: 0,title,type,director,cast,keywords
1,[7:19],[movie],[jorgemichelgrau],"[demiánbichir, héctorbonilla, oscarserrano]","[trapped, survivors, trying, desperately, deva..."
2,[23:59],[movie],[gilbertchan],"[teddchan, stellachung, henleyhii]","[army, recruit, terrifying, secret, found, dea..."
3,[9],[movie],[shaneacker],"[elijahwood, johnc.reilly, jenniferconnelly]","[rag, brave, newcomer, joins, postapocalyptic,..."


In [25]:
# Token-list columns that are merged into the bag of words, in this order.
columns = ['title', 'cast', 'director', 'type', 'keywords']

# Build the bag-of-words (BOW) string for each row: a single leading space,
# then the tokens of every column above, each column followed by one space.
# This is the same layout the earlier row-by-row loop produced.
#
# The strings are collected first and assigned as a whole column. Assigning
# into the rows yielded by DataFrame.iterrows() may only modify a copy of the
# row, which would leave 'BOW' empty.
df['BOW'] = [
    ' ' + ''.join(' '.join(tokens) + ' ' for tokens in row_tokens)
    for row_tokens in zip(*(df[col] for col in columns))
]

df2 = df[['BOW']]  # single-column dataframe holding only the bag of words

In [29]:
# Preview the first 50 bag-of-words rows.
df2.head(n=50)

Unnamed: 0,BOW
1,7:19 demiánbichir héctorbonilla oscarserrano ...
2,23:59 teddchan stellachung henleyhii gilbertc...
3,9 elijahwood johnc.reilly jenniferconnelly sh...
4,21 jimsturgess kevinspacey katebosworth rober...
5,46 erdalbeşikçioğlu yaseminallen melisbirkan ...
6,122 aminakhalil ahmeddawood tareklotfy yasira...
7,187 samuell.jackson johnheard kellyrowan kevi...
8,706 divyadutta atulkulkarni mohanagashe shrav...
9,1920 rajneeshduggal adahsharma indraneilsengu...
10,1922 thomasjane mollyparker dylanschmid zakhi...


In [30]:
# Reload the raw data to recover the original (untokenised) titles.
#
# Rows are dropped only when one of the columns used to build the bag of words
# is missing, which is the same rule applied when the data was first loaded.
# This frame therefore keeps the same rows and index labels as df2.
# A bare dropna() also discards rows that are missing a value in any other
# column; those rows still exist in df2 and would end up with a NaN title when
# the two frames are concatenated on their index.
df = pd.read_csv('/content/netflix_titles.csv')
df = df.dropna(subset=['title', 'type', 'director', 'cast', 'description'])
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."


In [31]:
# Put each original title next to its bag of words; the two Series are
# aligned on their index labels.
title_and_bow = [df['title'], df2['BOW']]
result = pd.concat(title_and_bow, axis=1)
result.head(3)

Unnamed: 0,title,BOW
1,7:19,7:19 demiánbichir héctorbonilla oscarserrano ...
2,23:59,23:59 teddchan stellachung henleyhii gilbertc...
3,9,9 elijahwood johnc.reilly jenniferconnelly sh...


In [34]:
# Inspect the full bag of words stored under index label 150.
result.loc[150, 'BOW']

' a billion colour story dhruvapadmakumar gauravsharma vasuki padmakumarnarasimhamurthy movie increasingly intolerant world curious child family faces financial strain idealistic interfaith parents observes '

In [36]:
# Save the title / bag-of-words pairs for the modelling step (index not written).
result.to_csv(path_or_buf='title_and_BOW.csv', index=False)