<center> <h1 style="background-color:DarkSlateBlue; color:white" >Movie Recommendation System</h1> 

![Image](https://www.vshsolutions.com/wp-content/uploads/2020/02/recommender-system-for-movie-recommendation.jpg)

<center>
<br>    
<a id="top"></a>    
<div class="list-group" id="list-tab" role="tablist">
  <h3 class="list-group-item list-group-item-action active" style="background-color:DarkSlateBlue; color:white" data-toggle="list"  role="tab" aria-controls="home">Notebook Content!</h3>  
  <a class="list-group-item list-group-item-action" data-toggle="list" href="#Required libraries" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Required libraries<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">1</span></a>
   <a class="list-group-item list-group-item-action" data-toggle="list" href="#I/O" role="tab" aria-controls="profile" style="color:DarkSlateBlue">I/O<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">2</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Custom functions" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Custom functions<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">3</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Data loading" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Data loading<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">4</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Content based recommender" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Content based recommender<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">5</span></a>

<a id='Required libraries'></a>
<h1 style="color:DarkSlateBlue" >Required libraries</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [11]:
#################Libraries##############
#General libraries
import os
import sys

#Data analysis libraries
import pandas as pd
pd.options.display.max_colwidth = 1000
import numpy as np
import operator
import ast
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

#Visualization libraries
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
plotly.offline.init_notebook_mode (connected = True)
import ipywidgets as widgets

#Sklearn
from mlxtend.preprocessing import TransactionEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity

#Nltk
import nltk
#nltk.download('wordnet')
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

#Image processing
from PIL import Image
import requests
from io import BytesIO

#String similarity
from pyjarowinkler import distance as jaro_distance
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler

#Singular value decomposition
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

############################WARNINGS######################
import warnings
warnings.filterwarnings('ignore')

##################################DISPLAY###################################
# IPython.display is the public import location for these helpers;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML, clear_output, Markdown

# Inject CSS that removes the top padding of #notebook, widens .container
# to 90% of the page and removes the minimum height of .end_space.
display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } ' 
        '.container { width:90% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

#from itertools import combinations,product
#!pip install strsimpy
#from strsimpy.normalized_levenshtein import NormalizedLevenshtein

<a id='I/O'></a>
<h1 style="color:DarkSlateBlue" >I/O</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [2]:
# Relative locations of the three input CSV files; each one is read with
# pd.read_csv in the data-loading section.
# NOTE(review): paths are relative to the working directory of the kernel,
# so the notebook presumably has to be started next to the `dat` folder.
path_metadata = './dat/movies_metadata.csv'
path_credits = './dat/credits.csv'
path_keywords = './dat/keywords.csv'

<a id='Custom functions'></a>
<h1 style="color:DarkSlateBlue" >Custom functions</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [3]:
########################STEP 1###############################################
#Tokens
#def processing(df_input,column_identifier,column_similarities):
#    tokens = df_input[[column_identifier, column_similarities]].drop_duplicates(column_similarities).reset_index().drop(columns=['index'])
#    print("Number of different titles:",tokens.shape[0])
#    tokens[column_similarities] = tokens[column_similarities].fillna('NA')
#    indices = pd.Series(tokens.index, index=tokens[column_identifier]).drop_duplicates()
#    return tokens, indices

#########################STEP 2###############################################
#Let's create the vectorizer and the tfidf matrix
def tfidf(df_input, column_similarities, column_identifier):
    """Build the TF-IDF matrix for one text column of a DataFrame.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding one document per row.
    column_similarities : str
        Name of the text column that is vectorized.
    column_identifier : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform``, with one row
    per row of ``df_input`` (same order).
    """
    # English stop words are discarded; max_features=None keeps the full vocabulary.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=None)
    # A missing value is not a valid document for the vectorizer, so treat it
    # as an empty text instead of letting fit_transform fail.
    documents = df_input[column_similarities].fillna('')
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix

########################STEP 3################################################
#Let's create the cosine similarity matrix
def similarities(df_input, tfidf_matrix, column_identifier):
    """Return the pairwise cosine similarities of ``tfidf_matrix`` as a labelled frame.

    Both the rows and the columns of the result are labelled with
    ``df_input[column_identifier]``. The shape of the result is printed
    before it is returned.
    """
    labels = df_input[column_identifier]
    df_cosine_sim = pd.DataFrame(
        cosine_similarity(tfidf_matrix),
        index=labels,
        columns=labels,
    )
    print("Cosine similarity matrix shape:", df_cosine_sim.shape)
    return df_cosine_sim

In [4]:
def calculate_jaro_distance(*, selected_title, all_possible_titles, num_similarities):
    """Rank candidate titles by their Jaro distance score against ``selected_title``.

    Candidates whose string form is "nan" are skipped and the comparison is
    done on lower-cased text. Returns a DataFrame with the columns
    ``title`` and ``similarity`` holding the ``num_similarities`` best
    scoring titles, highest score first.
    """
    candidates = [title for title in all_possible_titles if str(title) != "nan"]
    # A dict keeps a single entry per distinct title.
    score_by_title = {
        candidate: jaro_distance.get_jaro_distance(selected_title.lower(), candidate.lower())
        for candidate in candidates
    }
    ranked = sorted(score_by_title.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked[:num_similarities], columns=['title', 'similarity'])

In [5]:
def get_most_likely_items_cosine_similarity(*,items,max_number_of_predictions,df_similarity):
    """Plot the titles most similar to a set of already chosen titles.

    Parameters
    ----------
    items : list
        Titles already chosen. Their rows are the ones summed, and their own
        columns are dropped so they cannot be recommended back.
    max_number_of_predictions : int
        Number of bars (recommended titles) shown in the chart.
    df_similarity : pandas.DataFrame
        Similarity matrix with a 'title' column plus one column per title.
        NOTE(review): the chart uses x="title", so the column axis of this
        frame is assumed to be named 'title' - confirm against the caller.

    Displays the head of the selected rows and a plotly bar chart; nothing
    is returned.
    """
    df_transactions_cosine = df_similarity[df_similarity['title'].isin(items)].drop(columns=items)
    display(df_transactions_cosine.head())
    # Total similarity of every remaining title to the chosen items, best first.
    df_most_similar_items = (
        df_transactions_cosine.drop(columns=['title'])
        .sum(axis = 0)
        .reset_index()
        .rename(columns={0:'similarity'})
        .sort_values(by="similarity",ascending=False)
    )
    fig = px.bar(df_most_similar_items.head(max_number_of_predictions), x="title",y="similarity",title="Recommended movies (using cosine similarities)", 
           labels={'similarity': "Similarity"}, height=500)
    fig.show()

<a id='Data loading'></a>
<h1 style="color:DarkSlateBlue" >Data loading</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [28]:
#Metadata dataset: ids as strings, revenue as float, 'tt' removed from imdb_id,
#then keep movies with revenue above 10 million and one row per title
df_metadata = (
    pd.read_csv(path_metadata, low_memory=False)
    .assign(
        id=lambda frame: frame['id'].astype('str'),
        revenue=lambda frame: frame['revenue'].astype('float'),
        imdb_id=lambda frame: frame['imdb_id'].str.replace('tt',''),
    )
    .loc[lambda frame: frame['revenue'] > 10_000_000]
    .drop_duplicates('title')
)
display(df_metadata.head(1))

#Credits dataset (id as string so it can be merged with the metadata)
df_credits = pd.read_csv(path_credits).assign(id=lambda frame: frame['id'].astype('str'))

#Keywords dataset (id as string so it can be merged with the metadata)
df_keywords = pd.read_csv(path_keywords).assign(id=lambda frame: frame['id'].astype('str'))

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [29]:
#Image that we will use in the notebook
#response = requests.get("https://wpamelia.com/wp-content/uploads/2019/06/loading1.jpg")
#image = Image.open(BytesIO(response.content)) 

<a id='Content based recommender'></a>
<h1 style="color:DarkSlateBlue"> Content based recommender </h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

Previously, we built a recommender that only took the title into consideration. Now we are going to build a content-based recommender that takes **genre**, **director**, **cast** and **keywords** into consideration.

From now on, to reduce the computational cost, we will analyse only the movies with **more than 10 million dollars in revenue**.

In [30]:
#################### Build the inputs of the content based recommender ####################
#Inner-join the movies with their credits and keywords on 'id', dropping exact
#duplicate rows after each merge and keeping a single row per title at the end
df_metadata1 = df_metadata.merge(df_credits, how="inner", on='id').drop_duplicates()
df_metadata2 = (
    df_metadata1
    .merge(df_keywords, how="inner", on='id')
    .drop_duplicates()
    .drop_duplicates('title')
)

print("Number of movies with revenues greater than 10 million dollars:", len(df_metadata2))

#The crew, cast and keywords columns hold Python literals stored as text: parse them
df_metadata2['crew'] = df_metadata2['crew'].apply(ast.literal_eval)
df_metadata2['cast'] = df_metadata2['cast'].apply(ast.literal_eval)
df_metadata2['keywords'] = df_metadata2['keywords'].apply(ast.literal_eval)

####################### GENRES ###########################
#Genre names of each movie (missing genres are parsed as an empty list)
df_metadata2['genre_formatted'] = (
    df_metadata2['genres']
    .fillna('[]')
    .apply(ast.literal_eval)
    .apply(lambda parsed: [genre['name'] for genre in parsed] if isinstance(parsed, list) else [])
)


######################DIRECTOR#########################
#Let's get the directors of each movie
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    ``x`` is an iterable of dicts with 'job' and 'name' keys. When no entry
    has the job 'Director', ``np.nan`` is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

df_metadata2['director'] = df_metadata2['crew'].apply(get_director) #get director from crew column
#Lower-case the name, remove white spaces and mention it twice to weigh it more.
#A movie without a director contributes no token at all: converting the missing
#value to text would give every such movie the same artificial 'nan' token.
df_metadata2['director'] = df_metadata2['director'].apply(
    lambda x: [] if pd.isna(x) else [str(x).replace(" ", "").lower()] * 2
)



#####################CAST##############################
#Cast member names -> lower case without white spaces -> first 3 entries of the list
df_metadata2['cast_formatted'] = (
    df_metadata2['cast']
    .apply(lambda members: [member['name'] for member in members] if isinstance(members, list) else [])
    .apply(lambda names: [name.replace(" ", "").lower() for name in names])
    .apply(lambda tokens: tokens[:3])
)



####################KEYWORDS############################
lemmatizer = WordNetLemmatizer()
#Keyword names -> lemma of each keyword -> lower case without white spaces
df_metadata2['keywords_formatted'] = (
    df_metadata2['keywords']
    .apply(lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else [])
    .apply(lambda names: [lemmatizer.lemmatize(name) for name in names])
    .apply(lambda lemmas: [lemma.replace(" ", "").lower() for lemma in lemmas])
)

################## SOUP: GENRE + DIRECTOR + CAST + KEYWORDS #############
#Concatenate the four token lists of every movie and join them with single spaces
df_metadata2['soup'] = (
    df_metadata2['genre_formatted']
    + df_metadata2['director']
    + df_metadata2['cast_formatted']
    + df_metadata2['keywords_formatted']
).apply(lambda tokens: ' '.join(tokens))

df_metadata2.head()

Number of movies with revenues greater than 10 million dollars: 4270


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords,genre_formatted,director,cast_formatted,keywords_formatted,soup
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. 
Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2x...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, 'job': 'Director', 'name': 'John Lasseter', 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}, {'credit_id': '52fe4284c3a36847f8024f4f', 'department': 'Writing', 'gender': 2, 'id': 12891, 'job': 'Screenplay', 'name': 'Joss Whedon', 'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'}, {'credit_id': '52fe4284c3a36847f8024f55', 'department': 'Writing', 'gender': 2, 'id': 7, 'job': 'Screenplay', 'name': 'Andrew Stanton', 'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'}, {'credit_id': '52fe4284c3a36847f8024f5b', 'department': 'Writing', 'gender': 2, 'id': 12892, 'job': 'Screenplay', 'name': 'Joel Cohen', 'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'}, {'credit_id': '52fe4284c3a36847f8024f61', 'department': 'Writing', 'gender': 0, 'id': 12893, 'job': 'Screenplay', 'name': 'Alec Sokolow', 'profile_path': '/v79vlRYi94BZUQnkkyznbGUZLjT.jpg'}, {'credit_id': '52fe4284c3a36847f8024f67', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]","[Animation, Comedy, Family]","[johnlasseter, johnlasseter]","[tomhanks, timallen, donrickles]","[jealousy, toy, boy, 
friendship, friend, rivalry, boynextdoor, newtoy, toycomestolife]",Animation Comedy Family johnlasseter johnlasseter tomhanks timallen donrickles jealousy toy boy friendship friend rivalry boynextdoor newtoy toycomestolife
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",,8844,113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a36847f80a7c73', 'gender': 2, 'id': 2157, 'name': 'Robin Williams', 'order': 0, 'profile_path': '/sojtJyIV3lkUeThD7A2oHNm8183.jpg'}, {'cast_id': 8, 'character': 'Samuel Alan Parrish / Van Pelt', 'credit_id': '52fe44bfc3a36847f80a7c99', 'gender': 2, 'id': 8537, 'name': 'Jonathan Hyde', 'order': 1, 'profile_path': '/7il5D76vx6QVRVlpVvBPEC40MBi.jpg'}, {'cast_id': 2, 'character': 'Judy Sheperd', 'credit_id': '52fe44bfc3a36847f80a7c77', 'gender': 1, 'id': 205, 'name': 'Kirsten Dunst', 'order': 2, 'profile_path': '/wBXvh6PJd0IUVNpvatPC1kzuHtm.jpg'}, {'cast_id': 24, 'character': 'Peter Shepherd', 'credit_id': '52fe44c0c3a36847f80a7ce7', 'gender': 0, 'id': 145151, 'name': 'Bradley Pierce', 'order': 3, 'profile_path': '/j6iW0vVA23GQniAPSYI6mi4hiEW.jpg'}, {'cast_id': 10, 'character': 'Sarah Whittle', 'credit_id': '52fe44bfc3a36847f80a7c9d', 'gender': 1, 'id': 5149, 'name': 'Bonnie Hunt', 'order': 4, 'profile_path': '/7spiVQ...","[{'credit_id': 
'52fe44bfc3a36847f80a7cd1', 'department': 'Production', 'gender': 2, 'id': 511, 'job': 'Executive Producer', 'name': 'Larry J. Franco', 'profile_path': None}, {'credit_id': '52fe44bfc3a36847f80a7c89', 'department': 'Writing', 'gender': 2, 'id': 876, 'job': 'Screenplay', 'name': 'Jonathan Hensleigh', 'profile_path': '/l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg'}, {'credit_id': '52fe44bfc3a36847f80a7cdd', 'department': 'Sound', 'gender': 2, 'id': 1729, 'job': 'Original Music Composer', 'name': 'James Horner', 'profile_path': '/oLOtXxXsYk8X4qq0ud4xVypXudi.jpg'}, {'credit_id': '52fe44bfc3a36847f80a7c7d', 'department': 'Directing', 'gender': 2, 'id': 4945, 'job': 'Director', 'name': 'Joe Johnston', 'profile_path': '/fok4jaO62v5IP6hkpaaAcXuw2H.jpg'}, {'credit_id': '52fe44bfc3a36847f80a7cd7', 'department': 'Editing', 'gender': 2, 'id': 4951, 'job': 'Editor', 'name': 'Robert Dalva', 'profile_path': None}, {'credit_id': '573523bec3a368025100062c', 'department': 'Production', 'gender': 0...","[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'name': 'based on children's book'}, {'id': 33467, 'name': 'new home'}, {'id': 158086, 'name': 'recluse'}, {'id': 158091, 'name': 'giant insect'}]","[Adventure, Fantasy, Family]","[joejohnston, joejohnston]","[robinwilliams, jonathanhyde, kirstendunst]","[boardgame, disappearance, basedonchildren'sbook, newhome, recluse, giantinsect]",Adventure Fantasy Family joejohnston joejohnston robinwilliams jonathanhyde kirstendunst boardgame disappearance basedonchildren'sbook newhome recluse giantinsect
2,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",,31357,114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"[{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself... and never let you forget it.,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': 'Savannah 'Vannah' Jackson', 'credit_id': '52fe44779251416c91011aad', 'gender': 1, 'id': 8851, 'name': 'Whitney Houston', 'order': 0, 'profile_path': '/69ouDnXnmklYPr4sMJXWKYz81AL.jpg'}, {'cast_id': 2, 'character': 'Bernadine 'Bernie' Harris', 'credit_id': '52fe44779251416c91011ab1', 'gender': 1, 'id': 9780, 'name': 'Angela Bassett', 'order': 1, 'profile_path': '/tHkgSzhEuJKp5hqp0DZLad8HNZ9.jpg'}, {'cast_id': 3, 'character': 'Gloria 'Glo' Matthews', 'credit_id': '52fe44779251416c91011ab5', 'gender': 1, 'id': 18284, 'name': 'Loretta Devine', 'order': 2, 'profile_path': '/zLQFwQTFtHkb8sbFdkPNamFI7jv.jpg'}, {'cast_id': 4, 'character': 'Robin Stokes', 'credit_id': '52fe44779251416c91011ab9', 'gender': 1, 'id': 51359, 'name': 'Lela Rochon', 'order': 3, 'profile_path': '/9DBu3r5O4fBosSS4FnSzFCVpm0O.jpg'}, {'cast_id': 5, 'character': 'Marvin King', 'credit_id': '52fe44779251416c91011abd', 'gender': 2, 'id': 66804, 'name': 'Gregory Hines', 'order': 4, 'profile_...","[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', 'gender': 2, 'id': 2178, 'job': 'Director', 'name': 'Forest Whitaker', 'profile_path': '/4pMQkelS5lK661m9Kz3oIxLYiyS.jpg'}, {'credit_id': 
'52fe44779251416c91011ae1', 'department': 'Writing', 'gender': 0, 'id': 5144, 'job': 'Screenplay', 'name': 'Ronald Bass', 'profile_path': None}, {'credit_id': '52fe44779251416c91011ae7', 'department': 'Production', 'gender': 0, 'id': 5144, 'job': 'Producer', 'name': 'Ronald Bass', 'profile_path': None}, {'credit_id': '52fe44779251416c91011aff', 'department': 'Production', 'gender': 2, 'id': 21968, 'job': 'Producer', 'name': 'Ezra Swerdlow', 'profile_path': None}, {'credit_id': '52fe44779251416c91011af9', 'department': 'Production', 'gender': 1, 'id': 70592, 'job': 'Producer', 'name': 'Deborah Schindler', 'profile_path': '/2vFzdHxcB8cEtvPlNSs2VGZ7WG3.jpg'}, {'credit_id': '52fe44779251416c91011adb', 'department': 'Writing', 'gender': 0, 'id': 111118, 'job': 'Screenplay', 'name': ...","[{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id': 14768, 'name': 'single mother'}, {'id': 15160, 'name': 'divorce'}, {'id': 33455, 'name': 'chick flick'}]","[Comedy, Drama, Romance]","[forestwhitaker, forestwhitaker]","[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, singlemother, divorce, chickflick]",Comedy Drama Romance forestwhitaker forestwhitaker whitneyhouston angelabassett lorettadevine basedonnovel interracialrelationship singlemother divorce chickflick
3,False,"{'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg'}",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,113041,en,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}, {'name': 'Touchstone Pictures', 'id': 9195}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's In For The Surprise Of His Life!,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251416c75039eb9', 'gender': 2, 'id': 67773, 'name': 'Steve Martin', 'order': 0, 'profile_path': '/rI2EMvkfKKPKa5z0nM2pFVBtUyO.jpg'}, {'cast_id': 2, 'character': 'Nina Banks', 'credit_id': '52fe44959251416c75039ebd', 'gender': 1, 'id': 3092, 'name': 'Diane Keaton', 'order': 1, 'profile_path': '/fzgUMnbOkxC6E3EFcYHWHFaiKyp.jpg'}, {'cast_id': 3, 'character': 'Franck Eggelhoffer', 'credit_id': '52fe44959251416c75039ec1', 'gender': 2, 'id': 519, 'name': 'Martin Short', 'order': 2, 'profile_path': '/oZQorXBjTxrdkTJFpoDwOcQ91ji.jpg'}, {'cast_id': 4, 'character': 'Annie Banks-MacKenzie', 'credit_id': '52fe44959251416c75039ec5', 'gender': 1, 'id': 70696, 'name': 'Kimberly Williams-Paisley', 'order': 3, 'profile_path': '/nVp4F4VFqVvjh6huOULUQoiAguY.jpg'}, {'cast_id': 13, 'character': 'Bryan MacKenzie', 'credit_id': '52fe44959251416c75039ef3', 'gender': 2, 'id': 59222, 'name': 'George Newbern', 'order': 4, 'profile_path': '/4...","[{'credit_id': 
'52fe44959251416c75039ed7', 'department': 'Sound', 'gender': 2, 'id': 37, 'job': 'Original Music Composer', 'name': 'Alan Silvestri', 'profile_path': '/chEsfnDEtRmv1bfOaNAoVEzhCc6.jpg'}, {'credit_id': '52fe44959251416c75039ee9', 'department': 'Camera', 'gender': 2, 'id': 5506, 'job': 'Director of Photography', 'name': 'Elliot Davis', 'profile_path': None}, {'credit_id': '52fe44959251416c75039ecb', 'department': 'Writing', 'gender': 1, 'id': 17698, 'job': 'Screenplay', 'name': 'Nancy Meyers', 'profile_path': '/nMPHU06dnvVxEjjcnPCPUQgQ2Mp.jpg'}, {'credit_id': '52fe44959251416c75039edd', 'department': 'Production', 'gender': 1, 'id': 17698, 'job': 'Producer', 'name': 'Nancy Meyers', 'profile_path': '/nMPHU06dnvVxEjjcnPCPUQgQ2Mp.jpg'}, {'credit_id': '52fe44959251416c75039ed1', 'department': 'Writing', 'gender': 2, 'id': 26160, 'job': 'Screenplay', 'name': 'Albert Hackett', 'profile_path': None}, {'credit_id': '52fe44959251416c75039eef', 'department': 'Directing', 'gender...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'confidence'}, {'id': 4995, 'name': 'aging'}, {'id': 5600, 'name': 'daughter'}, {'id': 10707, 'name': 'mother daughter relationship'}, {'id': 13149, 'name': 'pregnancy'}, {'id': 33358, 'name': 'contraception'}, {'id': 170521, 'name': 'gynecologist'}]",[Comedy],"[charlesshyer, charlesshyer]","[stevemartin, dianekeaton, martinshort]","[baby, midlifecrisis, confidence, aging, daughter, motherdaughterrelationship, pregnancy, contraception, gynecologist]",Comedy charlesshyer charlesshyer stevemartin dianekeaton martinshort baby midlifecrisis confidence aging daughter motherdaughterrelationship pregnancy contraception gynecologist
4,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]",,949,113277,en,Heat,"Obsessive master thief, Neil McCauley leads a top-notch crew on various insane heists throughout Los Angeles while a mentally unstable detective, Vincent Hanna pursues him without rest. Each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence.",17.924927,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,"[{'name': 'Regency Enterprises', 'id': 508}, {'name': 'Forward Pass', 'id': 675}, {'name': 'Warner Bros.', 'id': 6194}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"[{'cast_id': 25, 'character': 'Lt. Vincent Hanna', 'credit_id': '52fe4292c3a36847f80291f5', 'gender': 2, 'id': 1158, 'name': 'Al Pacino', 'order': 0, 'profile_path': '/ks7Ba8x9fJUlP9decBr6Dh5mThX.jpg'}, {'cast_id': 26, 'character': 'Neil McCauley', 'credit_id': '52fe4292c3a36847f80291f9', 'gender': 2, 'id': 380, 'name': 'Robert De Niro', 'order': 1, 'profile_path': '/lvTSwUcvJRLAJ2FB5qFaukel516.jpg'}, {'cast_id': 27, 'character': 'Chris Shiherlis', 'credit_id': '52fe4292c3a36847f80291fd', 'gender': 2, 'id': 5576, 'name': 'Val Kilmer', 'order': 2, 'profile_path': '/AlhPeiH8R4reMNGNQ9ag1FPbuW9.jpg'}, {'cast_id': 28, 'character': 'Nate', 'credit_id': '52fe4292c3a36847f8029201', 'gender': 2, 'id': 10127, 'name': 'Jon Voight', 'order': 3, 'profile_path': '/c7BvyqlvqDkfkFqSBUCiR21fvTh.jpg'}, {'cast_id': 29, 'character': 'Michael Cheritto', 'credit_id': '52fe4292c3a36847f8029205', 'gender': 2, 'id': 3197, 'name': 'Tom Sizemore', 'order': 4, 'profile_path': '/soINOuacuiThRb2LyPD4tTWve7C.jp...","[{'credit_id': '52fe4292c3a36847f802916d', 'department': 'Directing', 'gender': 2, 'id': 638, 'job': 
'Director', 'name': 'Michael Mann', 'profile_path': '/nKmUpRpuQIsYubR7vIxVKhkbaTW.jpg'}, {'credit_id': '52fe4292c3a36847f8029173', 'department': 'Writing', 'gender': 2, 'id': 638, 'job': 'Screenplay', 'name': 'Michael Mann', 'profile_path': '/nKmUpRpuQIsYubR7vIxVKhkbaTW.jpg'}, {'credit_id': '52fe4292c3a36847f8029179', 'department': 'Production', 'gender': 2, 'id': 1254, 'job': 'Producer', 'name': 'Art Linson', 'profile_path': '/dEtVivCXxQBtIzmJcUNupT1AB4H.jpg'}, {'credit_id': '52fe4292c3a36847f802917f', 'department': 'Production', 'gender': 2, 'id': 638, 'job': 'Producer', 'name': 'Michael Mann', 'profile_path': '/nKmUpRpuQIsYubR7vIxVKhkbaTW.jpg'}, {'credit_id': '52fe4292c3a36847f8029185', 'department': 'Sound', 'gender': 2, 'id': 5581, 'job': 'Original Music Composer', 'name': 'Elliot Goldenthal', 'profile_path': '/mr1rr5bQySCwp564E1Ag363SgLH.jpg'}, {'credit_id': '52fe4292c3a36847f...","[{'id': 642, 'name': 'robbery'}, {'id': 703, 'name': 'detective'}, {'id': 974, 'name': 'bank'}, {'id': 1523, 'name': 'obsession'}, {'id': 3713, 'name': 'chase'}, {'id': 7281, 'name': 'shooting'}, {'id': 9727, 'name': 'thief'}, {'id': 9812, 'name': 'honor'}, {'id': 9826, 'name': 'murder'}, {'id': 9937, 'name': 'suspense'}, {'id': 10051, 'name': 'heist'}, {'id': 10085, 'name': 'betrayal'}, {'id': 10594, 'name': 'money'}, {'id': 10726, 'name': 'gang'}, {'id': 15076, 'name': 'cat and mouse'}, {'id': 18023, 'name': 'criminal mastermind'}, {'id': 34117, 'name': 'cult film'}, {'id': 156121, 'name': 'ex-con'}, {'id': 159343, 'name': 'heist movie'}, {'id': 159434, 'name': 'one last job'}, {'id': 167104, 'name': 'loner'}, {'id': 192261, 'name': 'bank job'}, {'id': 207268, 'name': 'neo-noir'}, {'id': 208009, 'name': 'gun fight'}, {'id': 214983, 'name': 'crime epic'}]","[Action, Crime, Drama, Thriller]","[michaelmann, michaelmann]","[alpacino, robertdeniro, valkilmer]","[robbery, detective, bank, obsession, chase, shooting, thief, honor, murder, suspense, heist, betrayal, money, gang, 
catandmouse, criminalmastermind, cultfilm, ex-con, heistmovie, onelastjob, loner, bankjob, neo-noir, gunfight, crimeepic]",Action Crime Drama Thriller michaelmann michaelmann alpacino robertdeniro valkilmer robbery detective bank obsession chase shooting thief honor murder suspense heist betrayal money gang catandmouse criminalmastermind cultfilm ex-con heistmovie onelastjob loner bankjob neo-noir gunfight crimeepic


# Count how often each formatted keyword occurs across all movies and keep
# only the keywords that occur more than once (a keyword seen a single time
# cannot be shared between two movies).
#
# `explode()` flattens the list column directly: one row per (movie, keyword),
# with empty lists turned into NaN. This replaces the previous
# `apply(lambda x: pd.Series(...), axis=1).stack()`, which built one Series
# per row and a dense rows x max-keywords frame just to flatten it again.
s = (
    df_metadata2['keywords_formatted']
    .explode()
    .rename('keyword_formatted')
    .value_counts()  # NaN (movies without keywords) is dropped by default
)

# `s` is indexed by keyword; filter_keywords() relies on that for `in` checks.
s = s[s > 1]

def filter_keywords(x):
    """Return the entries of ``x`` that are labels in the index of ``s``.

    ``s`` is the module-level Series of keyword counts; ``in`` on a Series
    tests its index labels, not its values. Order and duplicates of ``x``
    are preserved.

    Parameters
    ----------
    x : iterable
        Keywords of a single movie.

    Returns
    -------
    list
        The kept keywords, in their original order.
    """
    return [keyword for keyword in x if keyword in s]

# Overwrite each movie's keyword list with only the keywords kept in `s`.
df_metadata2['keywords_formatted'] = df_metadata2['keywords_formatted'].map(filter_keywords)

#from nltk.corpus import stopwords
#stop = stopwords.words('english')
#tokens['title_clean'] = tokens['title'].apply(lambda x: x.lower()).apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))

In [31]:
# Run the notebook's tfidf() and similarities() helpers on the 'soup' column,
# using 'title' as the identifier, then report the size of the result.
# (disabled) tokens_content_based, indices_content_based = processing(df_input = df_metadata2, column_identifier = 'title', column_similarities = 'soup')
tfidf_matrix_content_based = tfidf(
    df_input=df_metadata2,
    column_similarities='soup',
    column_identifier='title',
)
cosine_sim_content_based = similarities(
    df_input=df_metadata2,
    tfidf_matrix=tfidf_matrix_content_based,
    column_identifier='title',
)
# Move the index into a regular column so the frame has a plain integer index.
cosine_sim_content_based = cosine_sim_content_based.reset_index()

print("Size of matrix:", sys.getsizeof(cosine_sim_content_based) / 10**6, "MB")
display(cosine_sim_content_based)

Cosine similarity matrix shape: (4270, 4270)
Size of matrix: 146.174817 MB


title,title.1,Toy Story,Jumanji,Waiting to Exhale,Father of the Bride Part II,Heat,Sudden Death,GoldenEye,The American President,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Assassins,Leaving Las Vegas,Now and Then,Dangerous Minds,Twelve Monkeys,Babe,Dead Man Walking,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,The Usual Suspects,Home for the Holidays,Mr. Holland's Opus,Friday,From Dusk Till Dawn,Fair Game,Bed of Roses,White Squall,Mary Reilly,Vampire in Brooklyn,Broken Arrow,Happy Gilmore,The Bridges of Madison County,Muppet Treasure Island,Braveheart,Taxi Driver,Rumble in the Bronx,Boomerang,Flirting with Disaster,The Birdcage,Bad Boys,...,The Shack,Power Rangers,Alien: Covenant,CHiPS,Going in Style,RRRrrrr!!!,Smurfs: The Lost Village,The Zookeeper's Wife,Gifted,24,Diary of a Wimpy Kid: The Long Haul,Baywatch,Unforgettable,Snatched,The Fate of the Furious,Captain Underpants: The First Epic Movie,Cars 3,In This Corner of the World,How to Be a Latin Lover,Nasha Russia: Yaytsa sudby,Baahubali 2: The Conclusion,The Irony of Fate. The Sequel,Kabhi Alvida Naa Kehna,The Beguiled,Baby Driver,Rough Night,Weekend Pass,Boj S Tenyu 2: Revansh,Lovey-Dovey 2,Love and the City 2,High Security Vacation,Despicable Me 3,War for the Planet of the Apes,Kidnap,Valerian and the City of a Thousand Planets,Mudhalvan,Fanaa,Atomic Blonde,Dunkirk,Bairavaa,Confidential Assignment,Transformers: The Last Knight,"Mommies, Happy New Year!",Good Time,The Dark Tower,My Old Classmate,The Emoji Movie,Wind River,Baasha,Sivaji: The Boss
0,Toy Story,1.000000,0.010875,0.005908,0.004972,0.000000,0.000000,0.000000,0.005690,0.024766,0.000000,0.000000,0.000000,0.000000,0.005282,0.005687,0.004462,0.035680,0.000000,0.015013,0.000000,0.000000,0.013370,0.000000,0.000000,0.005329,0.000000,0.000000,0.027187,0.000000,0.005437,0.010275,0.006011,0.000000,0.000000,0.000000,0.000000,0.000000,0.006603,0.000000,0.005961,0.000000,0.017065,0.000000,0.000000,0.004720,0.004821,0.004768,0.004826,0.003570,...,0.000000,0.000000,0.000000,0.004862,0.005093,0.006404,0.040040,0.000000,0.000000,0.000000,0.020740,0.005248,0.000000,0.005142,0.000000,0.034818,0.038148,0.019413,0.005285,0.0,0.000000,0.006811,0.000000,0.000000,0.000000,0.005217,0.004956,0.000000,0.006018,0.006035,0.006112,0.041439,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004478,0.000000,0.006205,0.000000,0.000000,0.000000,0.035808,0.000000,0.000000,0.005329
1,Jumanji,0.010875,1.000000,0.000000,0.000000,0.000000,0.009606,0.006698,0.000000,0.017142,0.000000,0.009396,0.000000,0.000000,0.009006,0.000000,0.000000,0.007027,0.000000,0.010342,0.000000,0.000000,0.019927,0.000000,0.011826,0.013692,0.000000,0.000000,0.011689,0.000000,0.000000,0.010287,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.006117,0.000000,0.000000,0.020832,0.000000,0.000000,0.000000,0.000000,0.000000,0.043208,0.000000,...,0.012252,0.008161,0.000000,0.000000,0.000000,0.000000,0.023549,0.000000,0.000000,0.000000,0.014287,0.000000,0.000000,0.000000,0.000000,0.011555,0.022436,0.000000,0.000000,0.0,0.009580,0.000000,0.000000,0.052612,0.000000,0.000000,0.000000,0.000000,0.015462,0.000000,0.000000,0.024372,0.000000,0.000000,0.005772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010849,0.000000,0.000000,0.014629,0.000000,0.011884,0.000000,0.000000,0.000000
2,Waiting to Exhale,0.005908,0.000000,1.000000,0.005965,0.003651,0.000000,0.000000,0.024787,0.000000,0.005668,0.000000,0.005901,0.034737,0.006336,0.006823,0.026367,0.000000,0.013058,0.010295,0.004241,0.000000,0.009168,0.003371,0.000000,0.011714,0.013856,0.000000,0.009429,0.004937,0.023688,0.004652,0.007212,0.000000,0.019545,0.018267,0.004021,0.016393,0.118877,0.003582,0.007151,0.012511,0.006386,0.004530,0.004673,0.005663,0.016187,0.016009,0.005790,0.004283,...,0.027218,0.000000,0.000000,0.005833,0.011196,0.007683,0.007219,0.005035,0.004351,0.000000,0.007761,0.006296,0.005895,0.006169,0.000000,0.006277,0.006878,0.005622,0.006340,0.0,0.000000,0.029671,0.019995,0.005063,0.010602,0.011469,0.005946,0.000000,0.007219,0.020264,0.007333,0.007471,0.033474,0.055878,0.000000,0.000000,0.015859,0.000000,0.003993,0.000000,0.009844,0.000000,0.013641,0.005390,0.026815,0.013072,0.006456,0.000000,0.000000,0.011714
3,Father of the Bride Part II,0.004972,0.000000,0.005965,1.000000,0.000000,0.000000,0.000000,0.005745,0.000000,0.000000,0.000000,0.000000,0.000000,0.005333,0.005742,0.004505,0.000000,0.000000,0.086003,0.000000,0.000000,0.004211,0.000000,0.000000,0.005380,0.000000,0.000000,0.000000,0.000000,0.005490,0.000000,0.006070,0.000000,0.000000,0.000000,0.000000,0.000000,0.006667,0.000000,0.006018,0.000000,0.005374,0.000000,0.000000,0.004766,0.004867,0.004814,0.004873,0.003605,...,0.000000,0.000000,0.000000,0.004909,0.005142,0.006466,0.006075,0.000000,0.000000,0.000000,0.006532,0.005299,0.000000,0.053877,0.000000,0.005283,0.005788,0.000000,0.076572,0.0,0.000000,0.006876,0.000000,0.000000,0.000000,0.005268,0.005004,0.000000,0.006076,0.006093,0.006171,0.006288,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004521,0.000000,0.006265,0.000000,0.000000,0.000000,0.005433,0.000000,0.000000,0.005380
4,Heat,0.000000,0.000000,0.003651,0.000000,1.000000,0.010809,0.007537,0.003516,0.000000,0.003508,0.005265,0.045331,0.002729,0.007728,0.013756,0.059898,0.045649,0.002556,0.002894,0.008838,0.004452,0.002578,0.002086,0.004397,0.043415,0.002713,0.033581,0.001846,0.067999,0.003360,0.002879,0.000000,0.011434,0.080882,0.003576,0.006335,0.008211,0.000000,0.058214,0.000000,0.002449,0.005086,0.007138,0.116745,0.046955,0.000000,0.000000,0.000000,0.054773,...,0.002947,0.004573,0.003587,0.011760,0.010600,0.000000,0.000000,0.003116,0.002693,0.011289,0.000000,0.005015,0.009333,0.012437,0.109612,0.004999,0.000000,0.003479,0.000000,0.0,0.005368,0.004209,0.003914,0.003133,0.042238,0.003224,0.000000,0.005798,0.000000,0.000000,0.005840,0.005950,0.003624,0.031150,0.003234,0.005955,0.012744,0.024600,0.010143,0.005145,0.007046,0.012208,0.003835,0.016433,0.005440,0.000000,0.000000,0.028693,0.005282,0.008385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4265,My Old Classmate,0.000000,0.000000,0.013072,0.000000,0.000000,0.000000,0.000000,0.012590,0.000000,0.000000,0.000000,0.000000,0.009771,0.000000,0.000000,0.000000,0.000000,0.009153,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.009712,0.000000,0.006609,0.000000,0.012032,0.000000,0.000000,0.000000,0.006297,0.012804,0.000000,0.011491,0.014611,0.000000,0.000000,0.008770,0.000000,0.000000,0.000000,0.000000,0.010667,0.010549,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.015070,0.014015,0.000000,0.010869,0.000000,0.000000,0.000000,0.000000,0.013353,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.011116,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
4266,The Emoji Movie,0.035808,0.011884,0.006456,0.005433,0.000000,0.000000,0.000000,0.006217,0.027063,0.000000,0.000000,0.000000,0.000000,0.005772,0.006215,0.004876,0.000000,0.000000,0.016406,0.000000,0.000000,0.014611,0.000000,0.000000,0.005823,0.000000,0.000000,0.029709,0.000000,0.005942,0.011228,0.006569,0.000000,0.000000,0.000000,0.000000,0.000000,0.007216,0.000000,0.006514,0.000000,0.018648,0.000000,0.000000,0.005158,0.005268,0.005210,0.005274,0.003901,...,0.000000,0.000000,0.000000,0.005313,0.005566,0.006998,0.043753,0.000000,0.000000,0.000000,0.022664,0.005735,0.000000,0.005619,0.000000,0.038047,0.041687,0.021213,0.005775,0.0,0.000000,0.007442,0.000000,0.000000,0.000000,0.005701,0.005416,0.000000,0.006576,0.006594,0.006679,0.045283,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004893,0.000000,0.006781,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.005823
4267,Wind River,0.000000,0.000000,0.000000,0.000000,0.028693,0.012592,0.008780,0.000000,0.000000,0.000000,0.006133,0.010071,0.000000,0.009003,0.016025,0.056039,0.043483,0.000000,0.000000,0.007238,0.017393,0.000000,0.024080,0.024085,0.005979,0.000000,0.049714,0.028787,0.013974,0.000000,0.000000,0.000000,0.013320,0.045294,0.000000,0.004481,0.005827,0.000000,0.008018,0.000000,0.000000,0.005925,0.005049,0.007976,0.018595,0.000000,0.000000,0.000000,0.014065,...,0.000000,0.005327,0.004178,0.013700,0.008681,0.000000,0.000000,0.000000,0.000000,0.013151,0.000000,0.005842,0.006623,0.014489,0.020424,0.005824,0.000000,0.000000,0.000000,0.0,0.006254,0.000000,0.000000,0.000000,0.013840,0.000000,0.000000,0.006755,0.000000,0.000000,0.006804,0.006932,0.000000,0.004364,0.003768,0.006938,0.011229,0.028657,0.008937,0.005994,0.004984,0.014222,0.000000,0.037498,0.006337,0.000000,0.000000,1.000000,0.006154,0.005931
4268,Baasha,0.000000,0.000000,0.000000,0.000000,0.005282,0.008140,0.005676,0.000000,0.000000,0.000000,0.007961,0.000000,0.000000,0.000000,0.008217,0.039878,0.005954,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.006650,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004885,0.004112,0.000000,0.005817,0.000000,0.000000,0.005183,0.000000,0.000000,0.007691,0.006554,0.000000,0.006820,0.000000,0.000000,0.000000,0.005158,...,0.000000,0.006915,0.000000,0.007025,0.000000,0.000000,0.000000,0.000000,0.000000,0.008501,0.000000,0.007583,0.000000,0.007430,0.007490,0.007560,0.000000,0.000000,0.000000,0.0,0.008118,0.000000,0.000000,0.000000,0.007097,0.000000,0.000000,0.008768,0.000000,0.000000,0.008831,0.008998,0.000000,0.000000,0.004891,0.119316,0.007259,0.006740,0.005777,0.007780,0.006470,0.009193,0.000000,0.000000,0.008226,0.000000,0.000000,0.006154,1.000000,0.098207


In [32]:
# Interactive controls for querying the similarity matrix:
# a trigger button, a free-text title box, a result-count dropdown
# and an output area that the click handler writes into.
layout = widgets.Layout(width='400px', height='25px')  # shared size for every control

butt_cosine_content = widgets.Button(
    description='Display similar titles based on cosine similarity',
    layout=layout,
    button_style='success',
)
items_cosine_content = widgets.Text(
    value='Toy Story',
    description='Title',
    layout=layout,
)
num_similar_movies_cosine_content = widgets.Dropdown(
    options=list(range(1, 21)),  # 1..20 recommendations
    value=10,
    description='Num similar movies',
    disabled=False,
    layout=layout,
)
output_cosine_content = widgets.Output()

def on_butt_clicked(b):
    """Handle a click on the "similar titles" button.

    Reads the comma-separated titles typed into ``items_cosine_content`` and
    resolves each one against ``df_metadata2['title']``. If every title is
    recognised, ``get_most_likely_items_cosine_similarity`` is called with the
    resolved titles; otherwise a "not recognised" message is printed together
    with the closest title returned by ``calculate_jaro_distance``.
    Everything is printed inside ``output_cosine_content``.

    Title resolution tries, in order: the text exactly as typed, its
    ``str.title()`` form (the only form accepted previously) and a
    case-insensitive match. Title-casing alone rejected every catalogue title
    that contains a lowercase word, e.g. "Waiting to Exhale".

    Parameters
    ----------
    b :
        The object handed over by the button's ``on_click`` dispatcher; unused.

    Notes
    -----
    The input is split on commas, so a title that itself contains a comma
    cannot be entered through this box.
    """
    with output_cosine_content:
        clear_output()

        # Build the lookups once per click instead of once per typed item.
        all_titles = df_metadata2['title'].unique()
        exact_titles = set(all_titles)
        titles_by_casefold = {}
        for known in all_titles:
            if isinstance(known, str):  # skip missing titles (NaN)
                titles_by_casefold.setdefault(known.casefold(), known)

        itemset = []
        unrecognised = []
        for raw in items_cosine_content.value.split(","):
            typed = raw.strip()
            resolved = next(
                (candidate for candidate in (typed, typed.title()) if candidate in exact_titles),
                None,
            )
            if resolved is None:
                resolved = titles_by_casefold.get(typed.casefold())
            if resolved is None:
                # Keep the title-cased text for the message and the fuzzy lookup,
                # as the previous implementation did.
                resolved = typed.title()
                unrecognised.append(resolved)
            itemset.append(resolved)
        print(itemset)

        if not unrecognised:
            # Content-based recommendations from the cosine-similarity matrix.
            get_most_likely_items_cosine_similarity(items = itemset, max_number_of_predictions = num_similar_movies_cosine_content.value, df_similarity = cosine_sim_content_based)
        else:
            if len(unrecognised) == 1:
                print(f"The title {unrecognised} is not recognised", "\n")
            else:
                print(f"The titles {unrecognised} are not recognised", "\n")
            for item in unrecognised:
                # Suggest the single closest catalogue title for each miss.
                df_similarities_jaro = calculate_jaro_distance(selected_title = item, all_possible_titles = all_titles, num_similarities = 1)
                print("Maybe you meant:", df_similarities_jaro['title'].values[0])
                
# Register the click handler, then stack the controls vertically.
# The VBox is the cell's last expression, so it is what the cell displays.
butt_cosine_content.on_click(on_butt_clicked)
widgets.VBox(children=[
    butt_cosine_content,
    items_cosine_content,
    num_similar_movies_cosine_content,
    output_cosine_content,
])

VBox(children=(Button(button_style='success', description='Display similar titles based on cosine similarity',…