# 기본 설정

### 참고할 만한 사이트들
- Recommender using NLP: https://medium.com/@armandj.olivares/building-nlp-content-based-recommender-systems-b104a709c042  
- TfidfVectorizer 정리된 곳: https://chan-lab.tistory.com/27


In [155]:
from collections import Counter
import konlpy
from konlpy.tag import Mecab
from konlpy.tag import Okt

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import fasttext
from gensim.models import Word2Vec, FastText

In [24]:
pd.set_option("display.max_rows", 101)
pd.set_option("display.max_columns", 101)

### nltk가 돌아가지 않을 때 Tip!
``` python 
nltk.download()
```
- 이 코드를 돌려준다!  
<br>  

### 만약 [ SSL: certificate verify failed ] 에러가 뜬다면?!?!
- python 3.8 폴더로 들어간다!
- install Certificates.command를 실행시켜준다!
- 그 다음에 nltk.download()를 다시 실행시켜주면 된다!!


In [2]:
import nltk

In [8]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [9]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/junghyunwoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 시자아아악~~

In [3]:
from rake_nltk import Rake

In [14]:
df = pd.read_csv('movieData.csv')

df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [15]:
# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['Plot']
    
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column for the corresponding movie
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)

In [16]:
df

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","[years, eventual, redemption, two, imprisoned,..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...","[organized, crime, dynasty, transfers, control..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...","[tightens, grip, 1920s, new, york, son, michae..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","[mysterious, past, people, ability, wreaks, ha..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","[colleagues, forcing, justice, evidence, recon..."
...,...,...,...,...,...
245,The Lost Weekend,"Drama, Film-Noir",Billy Wilder,"Ray Milland, Jane Wyman, Phillip Terry, Howard...","[four, followed, day, drinking, bout, desperat..."
246,Short Term 12,Drama,Destin Daniel Cretton,"Brie Larson, John Gallagher Jr., Stephanie Bea...","[world, alongside, something, supervising, sta..."
247,His Girl Friday,"Comedy, Drama, Romance",Howard Hawks,"Cary Grant, Rosalind Russell, Ralph Bellamy, G...","[wife, book, newspaper, editor, uses, every, t..."
248,The Straight Story,"Biography, Drama",David Lynch,"Sissy Spacek, Jane Galloway Heitz, Joseph A. C...","[lawn, long, journey, mend, mover, tractor, il..."


In [19]:
# instantiating and generating the count matrix
count = CountVectorizer()

In [21]:
count_matrix = count.fit_transform(df[['Title', 'Genre', 'Director', 'Actors', 'Key_words']])

In [22]:
count_matrix

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [23]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [25]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use in the function to match the indexes
indices = pd.Series(df.index)

#  defining the function that takes in movie title 
# as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    # initializing the empty list of recommended movies
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
        
    return recommended_movies

In [28]:
recommendations('Fargo', cosine_sim=cosine_sim)

IndexError: index 0 is out of bounds for axis 0 with size 0

# NLP로 recommend system 만들기!
https://www.kdnuggets.com/2019/11/content-based-recommender-using-natural-language-processing-nlp.html

In [93]:
from rake_nltk import Rake
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('movieData.csv')
del df['Unnamed: 0']
df.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


## 줄거리 핵심만 남기기

In [94]:
r = Rake()
def rake_column(column):
    r.extract_keywords_from_text(column)
    key_words_dict_scores = r.get_word_degrees()
    return list( key_words_dict_scores.keys())

In [95]:
df['Key_words'] = df['Plot'].apply(rake_column)

In [96]:
df[['Key_words', 'Plot']]

Unnamed: 0,Key_words,Plot
0,"[two, imprisoned, men, bond, years, acts, find...",Two imprisoned men bond over a number of years...
1,"[clandestine, empire, aging, patriarch, organi...",The aging patriarch of an organized crime dyna...
2,"[grip, michael, career, tightens, family, crim...",The early life and career of Vito Corleone in ...
3,"[people, greatest, psychological, dark, knight...",When the menace known as the Joker emerges fro...
4,"[evidence, colleagues, jury, holdout, attempts...",A jury holdout attempts to prevent a miscarria...
...,...,...
245,"[followed, desperate, life, day, drinking, bou...",The desperate life of a chronic alcoholic is f...
246,"[troubled, waters, worker, longtime, boyfriend...",A 20-something supervising staff member of a r...
247,"[keep, ace, reporter, ex, newspaper, editor, u...",A newspaper editor uses every trick in the boo...
248,"[mover, tractor, old, man, makes, long, journe...",An old man makes a long journey by lawn-mover ...


In [97]:
df.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response,Key_words
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True,"[two, imprisoned, men, bond, years, acts, find..."
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True,"[clandestine, empire, aging, patriarch, organi..."
2,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True,"[grip, michael, career, tightens, family, crim..."
3,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True,"[people, greatest, psychological, dark, knight..."
4,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True,"[evidence, colleagues, jury, holdout, attempts..."


## 다른 column들 전처리해주기

In [131]:
def lower_sentences(column):
    content = column.lower().replace(' ','')
    content = content.replace(',', ' ')
    return content

In [132]:
temp = df.copy()

In [133]:
temp['Genre'] = temp['Genre'].apply(lower_sentences)
temp['Actors'] = temp['Actors'].apply(lower_sentences)
temp['Director'] = temp['Director'].apply(lower_sentences)

In [134]:
temp.head(2)

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response,Key_words
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,crime drama,frankdarabont,"Stephen King (short story ""Rita Hayworth and S...",timrobbins morganfreeman bobgunton williamsadler,Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True,"[two, imprisoned, men, bond, years, acts, find..."
1,The Godfather,1972,R,24 Mar 1972,175 min,crime drama,francisfordcoppola,"Mario Puzo (screenplay), Francis Ford Coppola ...",marlonbrando alpacino jamescaan richards.caste...,The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True,"[clandestine, empire, aging, patriarch, organi..."


In [136]:
def concat_columns(columns):
    content = ''
    for col in columns:
        content = content + col
        
    return content

In [135]:
temp['Genre'][0]

'crime drama'

In [144]:
temp['Bag_of_words'] = (temp['Genre'].astype(str) + ' ' + temp['Director'].astype(str) + ' ' +  temp['Actors'].astype(str) + ' ' + temp['Key_words'].astype(str))

In [145]:
temp

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response,Key_words,Bag_of_words
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,crime drama,frankdarabont,"Stephen King (short story ""Rita Hayworth and S...",timrobbins morganfreeman bobgunton williamsadler,Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True,"[two, imprisoned, men, bond, years, acts, find...",crime drama frankdarabont timrobbins morganfre...
1,The Godfather,1972,R,24 Mar 1972,175 min,crime drama,francisfordcoppola,"Mario Puzo (screenplay), Francis Ford Coppola ...",marlonbrando alpacino jamescaan richards.caste...,The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True,"[clandestine, empire, aging, patriarch, organi...",crime drama francisfordcoppola marlonbrando al...
2,The Godfather: Part II,1974,R,20 Dec 1974,202 min,crime drama,francisfordcoppola,"Francis Ford Coppola (screenplay), Mario Puzo ...",alpacino robertduvall dianekeaton robertdeniro,The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True,"[grip, michael, career, tightens, family, crim...",crime drama francisfordcoppola alpacino robert...
3,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,action crime drama,christophernolan,"Jonathan Nolan (screenplay), Christopher Nolan...",christianbale heathledger aaroneckhart michael...,When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True,"[people, greatest, psychological, dark, knight...",action crime drama christophernolan christianb...
4,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,crime drama,sidneylumet,"Reginald Rose (story), Reginald Rose (screenplay)",martinbalsam johnfiedler leej.cobb e.g.marshall,A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True,"[evidence, colleagues, jury, holdout, attempts...",crime drama sidneylumet martinbalsam johnfiedl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,The Lost Weekend,1945,NOT RATED,01 Jan 1946,101 min,drama film-noir,billywilder,"Charles R. Jackson (from the novel by), Charle...",raymilland janewyman phillipterry howarddasilva,The desperate life of a chronic alcoholic is f...,English,USA,Won 4 Oscars. Another 12 wins & 3 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.0/10,,8.0,26834,tt0037884,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/lost_weekend/,06 Feb 2001,,Paramount Pictures,,True,"[followed, desperate, life, day, drinking, bou...",drama film-noir billywilder raymilland janewym...
246,Short Term 12,2013,R,23 Aug 2013,96 min,drama,destindanielcretton,Destin Daniel Cretton,brielarson johngallagherjr. stephaniebeatriz r...,A 20-something supervising staff member of a r...,English,USA,35 wins & 70 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.0/10,82.0,8.0,62348,tt2370248,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/short_term_12_...,14 Jan 2014,,Cinedigm,http://shortterm12.com,True,"[troubled, waters, worker, longtime, boyfriend...",drama destindanielcretton brielarson johngalla...
247,His Girl Friday,1940,APPROVED,18 Jan 1940,92 min,comedy drama romance,howardhawks,"Charles Lederer (screen play), Ben Hecht (from...",carygrant rosalindrussell ralphbellamy geneloc...,A newspaper editor uses every trick in the boo...,"English, French",USA,2 wins.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.0/10,,8.0,43551,tt0032599,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/his_girl_friday/,28 Dec 2004,,Columbia Pictures,,True,"[keep, ace, reporter, ex, newspaper, editor, u...",comedy drama romance howardhawks carygrant ros...
248,The Straight Story,1999,G,03 Nov 1999,112 min,biography drama,davidlynch,"John Roach, Mary Sweeney",sissyspacek janegallowayheitz josepha.carpente...,An old man makes a long journey by lawn-mover ...,English,"France, UK, USA",Nominated for 1 Oscar. Another 14 wins & 37 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.0/10,86.0,8.0,67045,tt0166896,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/straight_story/,07 Nov 2000,,Buena Vista Pictures,http://disney.go.com/DisneyPictures/straightst...,True,"[mover, tractor, old, man, makes, long, journe...",biography drama davidlynch sissyspacek janegal...


In [146]:
df = temp[['Title','Bag_of_words']]

In [148]:
df

Unnamed: 0,Title,Bag_of_words
0,The Shawshank Redemption,crime drama frankdarabont timrobbins morganfre...
1,The Godfather,crime drama francisfordcoppola marlonbrando al...
2,The Godfather: Part II,crime drama francisfordcoppola alpacino robert...
3,The Dark Knight,action crime drama christophernolan christianb...
4,12 Angry Men,crime drama sidneylumet martinbalsam johnfiedl...
...,...,...
245,The Lost Weekend,drama film-noir billywilder raymilland janewym...
246,Short Term 12,drama destindanielcretton brielarson johngalla...
247,His Girl Friday,comedy drama romance howardhawks carygrant ros...
248,The Straight Story,biography drama davidlynch sissyspacek janegal...


In [156]:
tfid = TfidfVectorizer()
tfid_matrix = tfid.fit_transform(df['Bag_of_words'])
cosine_sim = cosine_similarity(tfid_matrix, tfid_matrix)
print(cosine_sim)

[[1.         0.02693012 0.02491061 ... 0.0036137  0.00358765 0.00349224]
 [0.02693012 1.         0.17513483 ... 0.00377687 0.00374964 0.00364992]
 [0.02491061 0.17513483 1.         ... 0.00349364 0.00346845 0.00337621]
 ...
 [0.0036137  0.00377687 0.00349364 ... 1.         0.00365226 0.00355513]
 [0.00358765 0.00374964 0.00346845 ... 0.00365226 1.         0.0035295 ]
 [0.00349224 0.00364992 0.00337621 ... 0.00355513 0.0035295  1.        ]]


In [165]:
sorted(tfid.vocabulary_.items())

[('000', 0),
 ('10', 1),
 ('100', 2),
 ('12', 3),
 ('17', 4),
 ('1820s', 5),
 ('18th', 6),
 ('1900', 7),
 ('1920s', 8),
 ('1936', 9),
 ('1940s', 10),
 ('1941', 11),
 ('1948', 12),
 ('1950s', 13),
 ('1951', 14),
 ('1954', 15),
 ('1960s', 16),
 ('1970s', 17),
 ('1972', 18),
 ('1979', 19),
 ('1980', 20),
 ('1984', 21),
 ('19th', 22),
 ('20', 23),
 ('2029', 24),
 ('24', 25),
 ('25', 26),
 ('30', 27),
 ('40', 28),
 ('7th', 29),
 ('9000', 30),
 ('aaroneckhart', 31),
 ('abagnale', 32),
 ('abandoned', 33),
 ('abandons', 34),
 ('abducted', 35),
 ('abhishekbharate', 36),
 ('ability', 37),
 ('able', 38),
 ('abrams', 39),
 ('accelerated', 40),
 ('accept', 41),
 ('accepted', 42),
 ('accepts', 43),
 ('accident', 44),
 ('accidentally', 45),
 ('according', 46),
 ('account', 47),
 ('accumulates', 48),
 ('accuracy', 49),
 ('accused', 50),
 ('accuses', 51),
 ('ace', 52),
 ('achieve', 53),
 ('acrophobia', 54),
 ('across', 55),
 ('action', 56),
 ('activities', 57),
 ('actress', 58),
 ('acts', 59),
 ('actua

In [163]:
tfid_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [157]:
indices = pd.Series(df['Title'])

In [158]:
def recommend(title, cosine_sim = cosine_sim):
    recommended_movies = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:11].index)
    
    for i in top_10_indices:
        recommended_movies.append(list(df['Title'])[i])
        
    return recommended_movies

In [159]:
indices[:50]

0                              The Shawshank Redemption
1                                         The Godfather
2                                The Godfather: Part II
3                                       The Dark Knight
4                                          12 Angry Men
5                                      Schindler's List
6         The Lord of the Rings: The Return of the King
7                                          Pulp Fiction
8                                            Fight Club
9     The Lord of the Rings: The Fellowship of the Ring
10                                         Forrest Gump
11       Star Wars: Episode V - The Empire Strikes Back
12                                            Inception
13                The Lord of the Rings: The Two Towers
14                      One Flew Over the Cuckoo's Nest
15                                           Goodfellas
16                                           The Matrix
17                   Star Wars: Episode IV - A N

In [160]:
recommend('The Lion King')

['Monsters, Inc.',
 'WALL·E',
 'The Nightmare Before Christmas',
 "One Flew Over the Cuckoo's Nest",
 'Butch Cassidy and the Sundance Kid',
 'Monty Python and the Holy Grail',
 'Die Hard',
 'Song of the Sea',
 'V for Vendetta',
 'Braveheart']

In [152]:
recommend('The Avengers')

['Guardians of the Galaxy Vol. 2',
 'Aliens',
 'Guardians of the Galaxy',
 'The Martian',
 'Interstellar',
 'Blade Runner',
 'Terminator 2: Judgment Day',
 'The Thing',
 'The Terminator',
 'Spider-Man: Homecoming']

In [154]:
recommend('The Godfather')

['The Godfather: Part II',
 'Scarface',
 'Fargo',
 'Rope',
 'On the Waterfront',
 'Goodfellas',
 'Baby Driver',
 'Cool Hand Luke',
 'Casino',
 'A Clockwork Orange']