# Movie Recommendation
## Text Mining - TF-IDF Score - Cosine Similarity

In [1]:
import pandas as pd
import re
import nltk
import ast
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### ETL Process - Create a Master Dataframe with all info needed

In [2]:
#  Read tmdb_5000_credits.csv into dataframe df_cre
df_cre = pd.read_csv('data/tmdb_5000_credits.csv')
df_cre.rename(columns = {'movie_id':'id'}, inplace = True) #rename for future merge
df_cre.head()

Unnamed: 0,id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [3]:
#  Read tmdb_5000_movies.csv into dataframe df_mov
df_mov = pd.read_csv('data/tmdb_5000_movies.csv')
df_mov.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
# Review shape of the datasets created above
print("Input credits data has {} rows and {} columns".format(len(df_cre), len(df_cre.columns)))
print("Input movies data has {} rows and {} columns".format(len(df_mov), len(df_mov.columns)))

Input credits data has 4803 rows and 4 columns
Input movies data has 4803 rows and 20 columns


#### Cast feature from Credits, and Genres from Movies

In [5]:
# Merging datasets with 1 feature from each - all_df
df_cre = df_cre[['id','title','cast']] # cast feature from credit
df_mov = df_mov[['id','title','genres']] # genres feature from movie
all_df = pd.merge(df_cre, df_mov, how='inner', on = ['id', 'title']) # all_df has cast & genres
all_df

Unnamed: 0,id,title,cast,genres
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
...,...,...,...,...
4798,9367,El Mariachi,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam..."
4799,72766,Newlyweds,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""..."
4800,231617,"Signed, Sealed, Delivered","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam..."
4801,126186,Shanghai Calling,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...",[]


In [6]:
# Check for NaN
all_df.isna().sum()

id        0
title     0
cast      0
genres    0
dtype: int64

In [7]:
# Function to extract cast name & genres

def get_name(name):
    ls = []
    for i in ast.literal_eval(name):
        ls.append(i['name'])  # dictionary key to extract genre name
    return ls

# Get Cast & Genres names
all_df['cast']=all_df['cast'].apply(get_name)
all_df['genres']=all_df['genres'].apply(get_name)
all_df

Unnamed: 0,id,title,cast,genres
0,19995,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Action, Adventure, Fantasy, Science Fiction]"
1,285,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Adventure, Fantasy, Action]"
2,206647,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[Action, Adventure, Crime]"
3,49026,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...","[Action, Crime, Drama, Thriller]"
4,49529,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[Action, Adventure, Science Fiction]"
...,...,...,...,...
4798,9367,El Mariachi,"[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...","[Action, Crime, Thriller]"
4799,72766,Newlyweds,"[Edward Burns, Kerry Bishé, Marsha Dietlein, C...","[Comedy, Romance]"
4800,231617,"Signed, Sealed, Delivered","[Eric Mabius, Kristin Booth, Crystal Lowe, Geo...","[Comedy, Drama, Romance, TV Movie]"
4801,126186,Shanghai Calling,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan...",[]


### Text Mining Process

In [8]:
# Function to perform Text Mining Process

# Lower Case, Remove Punctuation, Tokenization, Remove Stopwords, Lemmatizing (5 text mining techniques)
punc = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~' # exclude , and .
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

def clean_text_lemm(text):
    text = ", ".join([word.lower() for word in text if word not in punc])
    tokens = re.split(',', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

# Apply text mining process for cast & genres
all_df['cast'] = all_df['cast'].apply(lambda x: clean_text_lemm(x))
all_df['genres'] = all_df['genres'].apply(lambda x: clean_text_lemm(x))
all_df

Unnamed: 0,id,title,cast,genres
0,19995,Avatar,"[sam worthington, zoe saldana, sigourney wea...","[action, adventure, fantasy, science fiction]"
1,285,Pirates of the Caribbean: At World's End,"[johnny depp, orlando bloom, keira knightley...","[adventure, fantasy, action]"
2,206647,Spectre,"[daniel craig, christoph waltz, léa seydoux,...","[action, adventure, crime]"
3,49026,The Dark Knight Rises,"[christian bale, michael caine, gary oldman,...","[action, crime, drama, thriller]"
4,49529,John Carter,"[taylor kitsch, lynn collins, samantha morto...","[action, adventure, science fiction]"
...,...,...,...,...
4798,9367,El Mariachi,"[carlos gallardo, jaime de hoyos, peter marq...","[action, crime, thriller]"
4799,72766,Newlyweds,"[edward burns, kerry bishé, marsha dietlein,...","[comedy, romance]"
4800,231617,"Signed, Sealed, Delivered","[eric mabius, kristin booth, crystal lowe, ...","[comedy, drama, romance, tv movie]"
4801,126186,Shanghai Calling,"[daniel henney, eliza coupe, bill paxton, a...",[]


In [9]:
# Convert Cast & Genres back to string object for Similarity Measures
all_df['cast'] = all_df['cast'].apply(', '.join)
all_df['genres'] = all_df['genres'].apply(', '.join)
all_df

Unnamed: 0,id,title,cast,genres
0,19995,Avatar,"sam worthington, zoe saldana, sigourney weav...","action, adventure, fantasy, science fiction"
1,285,Pirates of the Caribbean: At World's End,"johnny depp, orlando bloom, keira knightley,...","adventure, fantasy, action"
2,206647,Spectre,"daniel craig, christoph waltz, léa seydoux, ...","action, adventure, crime"
3,49026,The Dark Knight Rises,"christian bale, michael caine, gary oldman, ...","action, crime, drama, thriller"
4,49529,John Carter,"taylor kitsch, lynn collins, samantha morton...","action, adventure, science fiction"
...,...,...,...,...
4798,9367,El Mariachi,"carlos gallardo, jaime de hoyos, peter marqu...","action, crime, thriller"
4799,72766,Newlyweds,"edward burns, kerry bishé, marsha dietlein, ...","comedy, romance"
4800,231617,"Signed, Sealed, Delivered","eric mabius, kristin booth, crystal lowe, g...","comedy, drama, romance, tv movie"
4801,126186,Shanghai Calling,"daniel henney, eliza coupe, bill paxton, al...",


### Recommendation System

In [10]:
# Calculate the TF-IDF score for each, word by word
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

# Create a variable where we store the matrix containing each word and its TF-IDF score
cast_matrix = tfidf.fit_transform(all_df['cast']) # with each cast
gen_matrix = tfidf.fit_transform(all_df['genres']) # with each genre

In [11]:
# Cast & Gen Cosine Similarities
cast_cosine_similarities = cosine_similarity(cast_matrix)
gen_cosine_similarities = cosine_similarity(gen_matrix)

gen_cosine_similarities.shape # same shape as cast_cosine_similarities

(4803, 4803)

In [12]:
# Store the names of the 10 most similar movies for each movie in our dataset

# Cast Similarities
cast_similarities = {}
for i in range(len(cast_cosine_similarities)):
    # Sort each element in cast_cosine_similarities and get the indexes of the 10 most similar movie
    similar_indices = cast_cosine_similarities[i].argsort()[:-10:-1]
    # Store in similarities each name of the 10 most similar movies
    cast_similarities[all_df['title'].iloc[i]] = [(cast_cosine_similarities[i][x], all_df['title'][x])
                                           for x in similar_indices][1:]

# Genres Similarities
gen_similarities = {}
for i in range(len(gen_cosine_similarities)):
    # Sort each element in gen_cosine_similarities and get the indexes of the 10 most similar movie
    similar_indices = gen_cosine_similarities[i].argsort()[:-10:-1]
    # Store in similarities each name of the 10 most similar movies
    gen_similarities[all_df['title'].iloc[i]] = [(gen_cosine_similarities[i][x], all_df['title'][x])
                                           for x in similar_indices][1:]

In [13]:
# Display the first element in each of similarity dictionary generated above
print("Cast Similarities" +"\n", list(cast_similarities.items())[0])
print("")
print("Genres Similarities" +"\n", list(gen_similarities.items())[0])

Cast Similarities
 ('Avatar', [(0.08721335985556876, 'The Royal Tenenbaums'), (0.08646560831844316, 'The Dark Knight'), (0.08557927364950112, "Charlie's Angels"), (0.08316447044071226, 'Idiocracy'), (0.08061549287320972, 'Bottle Rocket'), (0.08059803030942912, 'Enemy of the State'), (0.07983366028322471, 'Star Trek Into Darkness'), (0.0787113888057096, 'The SpongeBob Movie: Sponge Out of Water')])

Genres Similarities
 ('Avatar', [(1.0000000000000002, 'Superman Returns'), (1.0000000000000002, 'Beastmaster 2: Through the Portal of Time'), (1.0000000000000002, 'Superman II'), (1.0000000000000002, 'Superman'), (1.0000000000000002, 'Jupiter Ascending'), (1.0000000000000002, 'X-Men: Days of Future Past'), (1.0000000000000002, 'Man of Steel'), (1.0000000000000002, 'The Wolverine')])


In [14]:
# Use the similarity scores (generated above) to access the most similar movies and make recommendations
# Create Content based recommender class
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, mov, recom_mov):
        rec_items = len(recom_mov)
        print(f'The {rec_items} recommended movies for {mov} are:')
        for i in range(rec_items):
            print(f"{i+1}: {recom_mov[i][1]} with {round(recom_mov[i][0], 3)} similarity score") 
        
    def recommend(self, recommendation):
        # Get movie to find recommendations for
        mov = recommendation['mov']
        # Recommend 5 movies
        number_mov = 5
        # Get the number of movies most similars from matrix similarities
        recom_mov = self.matrix_similar[mov][:number_mov]
        # print each item
        self._print_message(mov=mov, recom_mov=recom_mov)

In [15]:
# Instantiate Content based recommender class with the precomputed similarities
cast_recommedations = ContentBasedRecommender(cast_similarities)
gen_recommedations = ContentBasedRecommender(gen_similarities)

In [16]:
# Recommend a movie with input from user

recommendation = {
    "mov": input() # input must be the exact match name in the data; for example "Avatar" or "Superman Returns"
}

print("")
print("Based on Cast Similarities")
try:
    cast_recommedations.recommend(recommendation)
except:
    print("The movie name you entered was not found!")

print("")
print("Based on Genres Similarities")
try:
    gen_recommedations.recommend(recommendation)
except:
    print("The movie name you entered was not found!")

Avatar

Based on Cast Similarities
The 5 recommended movies for Avatar are:
1: The Royal Tenenbaums with 0.087 similarity score
2: The Dark Knight with 0.086 similarity score
3: Charlie's Angels with 0.086 similarity score
4: Idiocracy with 0.083 similarity score
5: Bottle Rocket with 0.081 similarity score

Based on Genres Similarities
The 5 recommended movies for Avatar are:
1: Superman Returns with 1.0 similarity score
2: Beastmaster 2: Through the Portal of Time with 1.0 similarity score
3: Superman II with 1.0 similarity score
4: Superman with 1.0 similarity score
5: Jupiter Ascending with 1.0 similarity score
