In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from rake_nltk import Rake
import re

# Load the IMDB top-250 movie table and keep only the columns
# the recommendation engine is built from.
df = pd.read_csv(
    'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7'
).loc[:, ['Title', 'Genre', 'Director', 'Actors', 'Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [5]:
# Build the Key_words column: the key words RAKE extracts from each plot description.
def _plot_key_words(plot):
    """Return the key words of one plot description as a list of strings."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the plot's key words by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() gives a dictionary with words as keys and their scores
    # as values; only the words themselves are kept
    return list(r.get_word_degrees().keys())


# Assign the whole column at once. Writing to the `row` objects yielded by
# df.iterrows() is not guaranteed to reach the DataFrame (the iterator may hand
# out copies), which would silently leave Key_words empty.
df['Key_words'] = df['Plot'].apply(_plot_key_words)

# dropping the Plot column (rebinding instead of inplace mutation)
df = df.drop(columns = ['Plot'])

In [6]:
#creating bag of words to vectorize -- > genre + director + actors + plot key words
#we want director and actor names to be concatenated so it's person level
def _person_tokens(names):
    """Turn a comma-separated string of names into one lowercase token per person.

    Every non-word character (spaces, periods, hyphens, apostrophes) is removed
    from each name. Stripping only spaces would leave e.g. 'leej.cobb', which
    CountVectorizer's default tokenizer splits back into separate words and so
    loses the person-level token.
    """
    people = (re.sub(r'\W+', '', name) for name in names.split(','))
    # skip empty pieces produced by stray or trailing commas
    return ' '.join(person.lower() for person in people if person)


df['bag_of_words'] = (
    df['Genre'].apply(lambda x: re.sub(r'\W+', ' ', x).lower())
    + ' ' + df['Director'].apply(_person_tokens)
    + ' ' + df['Actors'].apply(_person_tokens)
    + ' ' + df['Key_words'].apply(lambda x: ' '.join(x).lower())
)

In [7]:
# preview the first rows of the frame after the Key_words and bag_of_words columns were added
df.head()


Unnamed: 0,Title,Genre,Director,Actors,Key_words,bag_of_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","[eventual, redemption, finding, solace, two, i...",crime drama frankdarabont timrobbins morganfre...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...","[clandestine, empire, aging, patriarch, reluct...",crime drama francisfordcoppola marlonbrando al...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...","[1920s, new, york, son, portrayed, expands, fa...",crime drama francisfordcoppola alpacino robert...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","[chaos, wreaks, havoc, menace, known, dark, kn...",action crime drama christophernolan christianb...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","[colleagues, reconsider, evidence, forcing, ju...",crime drama sidneylumet martinbalsam johnfiedl...


In [8]:
# Token-count matrix: one row per movie, built from its bag_of_words string
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Cosine similarity of every movie against every other movie; with a single
# argument the matrix is compared against itself
cosine_sim = cosine_similarity(count_matrix)

In [9]:
# Movie titles keyed by the DataFrame index, used to look a title up
# in the similarity matrix
indices = pd.Series(df['Title'], index=df.index)

In [10]:
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title as stored in ``indices``.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows follow the row order of ``df``.
        Defaults to the matrix that exists when this function is defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never part of the result.

    Raises
    ------
    IndexError
        If no movie with the given title exists.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row *position* of the first movie matching the title. The similarity
    # matrix is positional, so an index label must not be used to address it.
    matches = np.flatnonzero((indices == title).to_numpy())
    if matches.size == 0:
        # same exception type the bare `.index[0]` lookup raised, but with a usable message
        raise IndexError(f"no movie titled {title!r} in the data set")
    idx = int(matches[0])

    # similarity scores of every movie against the queried one, in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # Remove the queried movie explicitly instead of assuming it sorted first:
    # another movie with similarity 1.0 could otherwise push it into the results.
    score_series = score_series[score_series.index != idx]

    # positions of the top_n most similar movies
    top_indexes = list(score_series.iloc[:top_n].index)

    # map the positions back to titles
    return df['Title'].iloc[top_indexes].tolist()

In [11]:
recommendations('Fargo')

['No Country for Old Men',
 'The Departed',
 'Rope',
 'The Big Lebowski',
 'Reservoir Dogs',
 'The Godfather',
 'The Godfather: Part II',
 'On the Waterfront',
 'Goodfellas',
 'Arsenic and Old Lace']