<center> <h1 style="background-color:DarkSlateBlue; color:white" >Movie Recommendation System</h1> 

![Image](https://www.vshsolutions.com/wp-content/uploads/2020/02/recommender-system-for-movie-recommendation.jpg)

<center>
<br>    
<a id="top"></a>    
<div class="list-group" id="list-tab" role="tablist">
  <h3 class="list-group-item list-group-item-action active" style="background-color:DarkSlateBlue; color:white" data-toggle="list"  role="tab" aria-controls="home">Notebook Content!</h3>  
  <a class="list-group-item list-group-item-action" data-toggle="list" href="#Required libraries" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Required libraries<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">1</span></a>
   <a class="list-group-item list-group-item-action" data-toggle="list" href="#I/O" role="tab" aria-controls="profile" style="color:DarkSlateBlue">I/O<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">2</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Custom functions" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Custom functions<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">3</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Data loading" role="tab" aria-controls="profile" style="color:DarkSlateBlue">Data loading<span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">4</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#Recommender system based on title string similarity " role="tab" aria-controls="profile" style="color:DarkSlateBlue">Recommender system based on title string similarity <span class="badge badge-primary badge-pill" style="background-color:steelblue; color:white">5</span></a>

<a id='Required libraries'></a>
<h1 style="color:DarkSlateBlue" >Required libraries</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [1]:
#################Libraries##############
#Data analysis libraries
import pandas as pd
pd.options.display.max_colwidth = 1000
import numpy as np
import operator
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

#Visualization libraries
import plotly
import plotly.express as px
plotly.offline.init_notebook_mode (connected = True)
import ipywidgets as widgets

#Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Image processing
from PIL import Image
import requests
from io import BytesIO

#String similarity
from pyjarowinkler import distance as jaro_distance
from strsimpy.normalized_levenshtein import NormalizedLevenshtein

############################WARNINGS######################
import warnings
warnings.filterwarnings('ignore')

##################################DISPLAY###################################
# Import from the public IPython.display module; IPython.core.display is an
# internal path whose re-exports of these names are deprecated.
from IPython.display import display, HTML, clear_output

# Inject notebook-wide CSS: remove the top padding, widen the main container
# to 90% of the window and drop the minimum height of the trailing spacer.
display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } ' 
        '.container { width:90% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

<a id='I/O'></a>
<h1 style="color:DarkSlateBlue" >I/O</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [2]:
# Path of the movies metadata CSV, relative to the notebook's working directory.
path_metadata = './dat/movies_metadata.csv'

<a id='Custom functions'></a>
<h1 style="color:DarkSlateBlue" >Custom functions</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [3]:
########################STEP 1###############################################
#Tokens
def processing(df_input,column_identifier,column_similarities):
    """Build the de-duplicated token table and an identifier -> row lookup.

    Parameters
    ----------
    df_input : pd.DataFrame
        Frame containing at least ``column_identifier`` and
        ``column_similarities``.
    column_identifier : str
        Column whose values label each row; used as the key of ``indices``.
    column_similarities : str
        Column holding the text to compare. Rows are de-duplicated on this
        column and its missing values are replaced by the string 'NA'.

    Returns
    -------
    tokens : pd.DataFrame
        The two selected columns, one row per distinct similarity text, with a
        fresh 0..n-1 index.
    indices : pd.Series
        Row label in ``tokens`` keyed by identifier. Every identifier appears
        exactly once (its first row wins), so ``indices[key]`` is a scalar.
    """
    tokens = (
        df_input[[column_identifier, column_similarities]]
        .drop_duplicates(column_similarities)
        .reset_index(drop=True)
    )
    print("Number of different titles:",tokens.shape[0])
    tokens[column_similarities] = tokens[column_similarities].fillna('NA')

    # De-duplicate on the identifier itself. Calling drop_duplicates() on the
    # finished Series would compare its values (the unique row labels) and
    # therefore never remove anything, leaving repeated identifiers that make
    # indices[key] return a Series instead of a single position.
    first_rows = tokens.drop_duplicates(column_identifier)
    indices = pd.Series(first_rows.index, index=first_rows[column_identifier])
    return tokens, indices

#########################STEP 2###############################################
#Let's create the vectorizer and the tfidf matrix
def tfidf(df_input, column_identifier, column_similarities):
    """Fit a TF-IDF vectorizer on one text column and display the result.

    Parameters
    ----------
    df_input : pd.DataFrame
        Frame holding the text column and the identifier column.
    column_identifier : str
        Column used as the row index of the displayed matrix.
    column_similarities : str
        Column with the text to vectorize.

    Returns
    -------
    tfidf_matrix : scipy sparse matrix
        Output of ``TfidfVectorizer.fit_transform`` (one row per input row).
    column_names : list of str
        Vocabulary terms, in the column order of ``tfidf_matrix``.
    """
    vectorizer = TfidfVectorizer(stop_words='english',max_features=None) #It gets the features that will make up the sparse matrix
    tfidf_matrix = vectorizer.fit_transform(df_input[column_similarities])

    # get_feature_names() was removed from scikit-learn; prefer the replacement
    # and fall back only on old releases. Convert to a list so callers keep
    # receiving the same type as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        column_names = vectorizer.get_feature_names_out().tolist()
    else:
        column_names = vectorizer.get_feature_names()

    # NOTE: toarray() materializes the full dense matrix just for display;
    # this is memory-hungry for large vocabularies.
    df_tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=column_names,index = df_input[column_identifier])
    n_vectors, n_components = df_tfidf_matrix.shape
    print("Tfidf matrix shape:",df_tfidf_matrix.shape)
    display("Every title is transformed in a vector of {} components, total number of different words. As there are {} distinct titles we have {} distinct vectors".format(n_components,n_vectors,n_vectors),df_tfidf_matrix)
    return tfidf_matrix, column_names

########################STEP 3################################################
#Let's create the cosine similarity matrix
def similarities(df_input, tfidf_matrix, column_identifier, column_similarities):
    """Compute the pairwise cosine similarity between the rows of ``tfidf_matrix``.

    Parameters
    ----------
    df_input, column_identifier, column_similarities
        Accepted for interface compatibility; not needed to compute or report
        the matrix.
    tfidf_matrix : array-like or sparse matrix
        One row per item, as produced by the TF-IDF step.

    Returns
    -------
    cosine_sim
        Square matrix returned by ``cosine_similarity(tfidf_matrix)``.
    """
    cosine_sim = cosine_similarity(tfidf_matrix)
    # Report the shape straight from the matrix; wrapping it in a labelled
    # DataFrame only to read .shape added work without changing the output.
    print("Cosine similarity matrix shape:",cosine_sim.shape)
    return cosine_sim

In [4]:
def calculate_jaro_distance(*, selected_title, all_possible_titles, num_similarities):
    """Rank candidate titles by Jaro similarity to ``selected_title``.

    Comparison is case-insensitive (both sides are lower-cased).

    Parameters
    ----------
    selected_title : str
        Title typed by the user.
    all_possible_titles : iterable
        Candidate titles; missing values and non-string entries are skipped.
    num_similarities : int
        Maximum number of rows to return.

    Returns
    -------
    pd.DataFrame
        Columns ``title`` and ``similarity``, sorted by decreasing similarity.
        Empty when there is no query or no usable candidate.
    """
    result_columns = ['title','similarity']

    # Nothing to compare against an empty query.
    # NOTE(review): pyjarowinkler 1.x raises on empty strings - confirm for the
    # installed version; the guard is harmless either way.
    if not selected_title:
        return pd.DataFrame(columns=result_columns)

    query = selected_title.lower()  # loop-invariant, so lower-case it once

    # Keep only non-empty strings. The literal "nan" stays excluded, as before
    # (NaN titles used to be detected through their string form).
    title_names = [
        title for title in all_possible_titles
        if isinstance(title, str) and title and title != "nan"
    ]

    titles_dict_jaro = {
        title_name: jaro_distance.get_jaro_distance(query, title_name.lower())
        for title_name in title_names
    }
    top_matches = sorted(titles_dict_jaro.items(), key=operator.itemgetter(1), reverse=True)[:num_similarities]
    return pd.DataFrame(top_matches, columns=result_columns)

In [5]:
def get_most_likely_items_cosine_similarity(*,items,max_number_of_predictions,df_similarity):
    """Plot the items with the highest summed similarity to ``items``.

    Rows of ``df_similarity`` whose 'title' is in ``items`` are kept, the
    columns named after ``items`` are dropped, and the remaining columns are
    summed; the largest ``max_number_of_predictions`` sums are drawn as bars.

    NOTE(review): presumably ``df_similarity`` has a 'title' column plus one
    column per item, and its column axis is named 'title' (the plot uses
    x="title" after reset_index) - confirm against the caller.
    """
    is_selected = df_similarity['title'].isin(items)
    df_transactions_cosine = df_similarity[is_selected].drop(columns=items)
    display(df_transactions_cosine.head())

    # Column-wise totals, turned back into a frame and ranked.
    summed = df_transactions_cosine.drop(columns=['title']).sum(axis = 0)
    df_most_similar_items = summed.reset_index()
    df_most_similar_items = df_most_similar_items.rename(columns={0:'similarity'})
    df_most_similar_items = df_most_similar_items.sort_values(by="similarity",ascending=False)

    top_items = df_most_similar_items.head(max_number_of_predictions)
    # The 'producto' label key is neither the x nor the y column used here.
    fig = px.bar(
        top_items,
        x="title",
        y="similarity",
        title="Recommended movies (using cosine similarities)",
        labels={'producto': "Most likely product to buy"},
        height=500,
    )
    fig.show()

<a id='Data loading'></a>
<h1 style="color:DarkSlateBlue" >Data loading</h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [6]:
# Load the metadata and normalise three columns in one chained expression:
# ids as strings, revenue as float, and imdb ids without the 'tt' marker.
df_metadata = pd.read_csv(path_metadata,low_memory=False).assign(
    id=lambda frame: frame['id'].astype('str'),
    revenue=lambda frame: frame['revenue'].astype('float'),
    imdb_id=lambda frame: frame['imdb_id'].str.replace('tt',''),
)
display(df_metadata.head(1))

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [7]:
#Image that we will use in the notebook
# The timeout keeps the cell from hanging on an unresponsive host, and
# raise_for_status() surfaces an HTTP error before PIL tries to decode the body.
response = requests.get("https://wpamelia.com/wp-content/uploads/2019/06/loading1.jpg", timeout=10)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

<a id='Recommender system based on title string similarity '></a>
<h1 style="color:DarkSlateBlue" >Recommender system based on title string similarity </h1> 

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>

In [8]:
###########Let's call the functions
# The text compared here is simply the title, copied into a 'soup' column.
df_metadata['soup'] = df_metadata['title']
# Step 1: one row per distinct title, plus the title -> row lookup.
tokens, indices = processing(df_input = df_metadata, column_identifier = 'title', column_similarities = 'soup')
# Step 2: TF-IDF matrix of the titles and the vocabulary behind its columns.
tfidf_matrix,column_names = tfidf(df_input = tokens, column_identifier = 'title', column_similarities = 'soup')
# Step 3: pairwise cosine similarity between all rows of `tokens`
# (the recorded output reports a 42278 x 42278 matrix).
cosine_sim = similarities(df_input = tokens, tfidf_matrix = tfidf_matrix, column_identifier = 'title', column_similarities = 'soup')

Number of different titles: 42278
Tfidf matrix shape: (42278, 22834)


'Every title is transformed in a vector of 22834 components, total number of different words. As there are 42278 distinct titles we have 42278 distinct vectors'

Unnamed: 0_level_0,00,000,002,008,009,01,04,05,06,08,09,10,100,1000,1001,101,102,1066,107,1080,109,10th,11,110th,1119,112,1138,117,11th,12,120,125,127,12th,13,1303,13b,13hrs,13th,14,140,1408,1453,1492,14th,15,150,15th,16,1600,...,доктора,доля,домовёнка,дракона,ехали,игра,ильф,кабинок,каменная,карусель,кентервильское,королёв,крепость,крыльях,львиная,медведь,мечом,на,начинается,они,оно,перекресток,петров,поезд,посадку,поутру,поэма,привидение,приключения,просит,проснулись,путь,расписания,себе,семь,серьезно,сила,совершенно,трамвае,убить,холмса,шерлока,щитом,юленька,іван,هیچ,کجا,کس,ファンタスティポ,貓狗時空傳
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Caged Heat 3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Subdue,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Century of Birthing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Satan Triumphant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Cosine similarity matrix shape: (42278, 42278)


In [9]:
#Column names are the columns represented in the tfidf matrix
selected_word = 'tic'
matching = [s for s in column_names if selected_word==s]
print(matching)

#Distinct lower-cased words contained in the titles (a list of words, despite the name)
number_of_different_words = list(set(df_metadata['title'].str.cat(sep=' ').lower().split()))
matching = [s for s in number_of_different_words if selected_word==s]
print(matching)

#What words contained in the titles are not represented in the tfidf matrix
#A set gives constant-time membership; scanning the column list for every word is quadratic
represented_words = set(column_names)
words_not_represented = [word for word in number_of_different_words if word not in represented_words]
matching = [s for s in words_not_represented if selected_word==s]
print(matching)

['tic']
['tic']
[]


#### Using Jaro distance

In [10]:
# Controls for the Jaro demo: trigger button, title box, result-count dropdown, output area
layout = widgets.Layout(width='400px', height='25px')  # one size shared by every control
butt = widgets.Button(
    description='Display similar titles based on jaro string similarity',
    layout=layout,
    button_style='success',
)
text = widgets.Text(value='Cars', description='Title', layout=layout)
num_similar_titles = widgets.Dropdown(
    options=list(range(1, 21)),
    value=10,
    description='Num similar movies',
    disabled=False,
    layout=layout,
)
output = widgets.Output()


def on_butt_clicked(_):
    """Redraw the Jaro ranking for the title currently typed in ``text``."""
    with output:
        clear_output()
        ranking = calculate_jaro_distance(
            selected_title=text.value,
            all_possible_titles=df_metadata['title'].unique(),
            num_similarities=num_similar_titles.value,
        )
        chart = px.histogram(
            ranking,
            x="title",
            y="similarity",
            color="title",
            color_discrete_sequence=px.colors.qualitative.Dark24,
        )
        chart.update_layout(width=1000, height=600).show()


butt.on_click(on_butt_clicked)
widgets.VBox([butt, text, num_similar_titles, output])

VBox(children=(Button(button_style='success', description='Display similar titles based on jaro string similar…

#### Using normalized Levenshtein similarity

In [11]:
# Button, text box, output
layout = widgets.Layout(width='400px', height='25px') #set width and height
butt_2 = widgets.Button(description='Display similar titles based on levenshtein string similarity',layout = layout,button_style='success')
text_2 = widgets.Text(value='Titanic',description='Title',layout=layout)
num_similar_titles_2 = widgets.Dropdown(options=list(range(1,21)),value=10,description='Num similar movies',disabled=False,layout=layout)
output_2 = widgets.Output()
def on_butt_clicked(_):
    """Plot the titles most similar to the typed one (normalized Levenshtein)."""
    with output_2:
        clear_output()
        # Lower-case the query and build the scorer once; neither changes
        # from one candidate title to the next.
        query = text_2.value.lower()
        scorer = NormalizedLevenshtein()

        #Normalized Levenshtein similarity (case-insensitive), NaN titles skipped
        title_names = [title for title in df_metadata['title'].unique() if str(title)!="nan"]
        titles_dict_leven = {title_name: scorer.similarity(query, title_name.lower()) for title_name in title_names}
        top_matches = sorted(titles_dict_leven.items(), key=operator.itemgetter(1),reverse=True)[:num_similar_titles_2.value]
        df_similarities_leven = pd.DataFrame(top_matches,columns = ['title','similarity'])

        fig_leven = px.histogram(df_similarities_leven,x="title",y="similarity",color="title",
                           color_discrete_sequence = px.colors.qualitative.Dark24)
        fig_leven.update_layout(width=1000,height=600).show()

butt_2.on_click(on_butt_clicked)
widgets.VBox([butt_2,text_2,num_similar_titles_2,output_2])

VBox(children=(Button(button_style='success', description='Display similar titles based on levenshtein string …

#### Using cosine similarities of the title

In [31]:
# Button, text box, output
layout = widgets.Layout(width='400px', height='25px') #set width and height
butt_cosine = widgets.Button(description='Display similar titles based on cosine similarity',layout = layout,button_style='success')
text_cosine = widgets.Text(value='Titanic',description='Title',layout=layout)
num_similar_movies_cosine = widgets.Dropdown(options=list(range(1,21)),value=10,description='Num similar movies',disabled=False,layout=layout)
similarity_threshold = widgets.Dropdown(options=list(np.around(np.arange(0.1,1.0,0.1),decimals=1)),value=0.1,description='Min similarity',disabled=False,layout=layout)
output_cosine = widgets.Output()
def on_butt_clicked(_):
    """Plot the titles whose TF-IDF vectors are closest to the typed title.

    The typed text is first matched exactly after title-casing and then,
    failing that, case-insensitively against the titles in ``tokens``.
    """
    with output_cosine:
        clear_output()
        typed_title = text_cosine.value

        # Rows of `tokens` line up with rows of `cosine_sim`, so the match is
        # done directly on tokens['title'].
        all_titles = tokens['title']
        matches = all_titles.index[all_titles == typed_title.title()]
        if len(matches) == 0:
            # str.title() rewrites words such as "of" or "II" ("Of", "Ii"), so
            # many real titles can never equal the title-cased input; fall
            # back to a case-insensitive comparison.
            matches = all_titles.index[all_titles.str.lower() == typed_title.strip().lower()]

        if len(matches) > 0:
            idx = matches[0]
            selected_title = all_titles.iloc[idx]
            print(selected_title)

            # Highest similarities first; the stable sort keeps row order on ties.
            sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[0:num_similar_movies_cosine.value]

            #Let's get the movie indices and once we have the indices we can get the movie titles
            movie_indices = [i[0] for i in sim_scores]
            movie_titles = all_titles.iloc[movie_indices].tolist()

            #Let's get the similarities
            movie_similarities = [i[1].round(3) for i in sim_scores]

            #Let's make a dataframe with the lists of movie titles and movie similarities
            df_movie_similarities = pd.DataFrame(list(zip(movie_titles, movie_similarities)), columns =['Title', 'Similarity'])
            df_movie_similarities = df_movie_similarities[df_movie_similarities['Similarity']>similarity_threshold.value]
            print(f"Number of titles with similarity higher than {similarity_threshold.value}:",df_movie_similarities.shape[0])
            if df_movie_similarities.shape[0]!=0:
                fig = px.bar(df_movie_similarities,x="Title",y="Similarity",
                             color="Title",
                             color_discrete_sequence = px.colors.qualitative.Dark24,
                            title = "Most similar movies based just on title")
                fig.update_layout(width=1000,height=600).show()
        else:
            print(typed_title.title())
            print("There are no similar movies")

butt_cosine.on_click(on_butt_clicked)
widgets.VBox([butt_cosine,text_cosine,num_similar_movies_cosine, similarity_threshold, output_cosine])

VBox(children=(Button(button_style='success', description='Display similar titles based on cosine similarity',…