# Movie recommendation system
https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

## Load Dataset

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kaggle
import os
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Read in data and inspect

In [1]:
# install dataset and unzip (make sure to have kaggle.json in ~/.kaggle)
!kaggle datasets download -d rounakbanik/the-movies-dataset
!unzip the-movies-dataset.zip

the-movies-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  the-movies-dataset.zip
replace credits.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [2]:
# load dataset
movies = pd.read_csv('movies_metadata.csv')

# print all columns
pd.set_option('display.max_columns', None)
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# remove poster_path column since images are not used
movies.drop('poster_path', axis=1, inplace=True)

In [4]:
# load keywords dataset and merge with movies dataset on id
keywords = pd.read_csv('keywords.csv')

# update movies id column to be of type int64
# Find and remove non-numeric rows in the 'id' column of movies
non_numeric_ids = movies[pd.to_numeric(movies['id'], errors='coerce').isna()]['id']
movies = movies[~movies['id'].isin(non_numeric_ids)]

# Convert the 'id' columns to int64
movies['id'] = movies['id'].astype('int64')
keywords['id'] = keywords['id'].astype('int64')

# remove 'id' from each keyword pair 
keywords['keywords'] = keywords['keywords'].apply(lambda x: [i['name'] for i in eval(x)])
keywords.head()


Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


In [5]:
# merge keywords with movies
movies = movies.merge(keywords, on='id')
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy, friendship, friends, riva..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[fishing, best friend, duringcreditsstinger, o..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[based on novel, interracial relationship, sin..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[baby, midlife crisis, confidence, aging, daug..."


In [6]:
def extract_name_values(x):
    try:
        if isinstance(x, str):
            parsed_data = eval(x)
            if isinstance(parsed_data, list):
                return [i['name'] for i in parsed_data if isinstance(i, dict) and 'name' in i]
            else:
                return parsed_data['name']
    except:
        pass
    return np.nan

# Read dictionary value for belongs_to_collection and only keep the name
movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(extract_name_values)

# Remove 'id' from genres
movies['genres'] = movies['genres'].apply(extract_name_values)

# Remove homepage column since it is not used
movies.drop('homepage', axis=1, inplace=True)

# Remove imdb_id column since it is not used
movies.drop('imdb_id', axis=1, inplace=True)

# Remove 'id' from production_companies
movies['production_companies'] = movies['production_companies'].apply(extract_name_values)

# Remove 'iso_3166_1' from production_countries
movies['production_countries'] = movies['production_countries'].apply(extract_name_values)

# Remove 'iso_639_1' from spoken_languages
movies['spoken_languages'] = movies['spoken_languages'].apply(extract_name_values)

movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy, friendship, friends, riva..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'..."
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[fishing, best friend, duringcreditsstinger, o..."
3,False,,16000000,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[based on novel, interracial relationship, sin..."
4,False,Father of the Bride Collection,0,[Comedy],11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[baby, midlife crisis, confidence, aging, daug..."


In [7]:
credits = pd.read_csv('credits.csv')
credits.head()
# display cast for first movie
credits['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [8]:
# filter cast so that it only contains the name of the actor/actress
credits['cast'] = credits['cast'].apply(lambda x: [i['name'] for i in eval(x)])
credits.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [9]:
# filter crew so that it only contains the name of the crew member
credits['crew'] = credits['crew'].apply(lambda x: [i['name'] for i in eval(x)])
credits.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",11862


In [9]:
# merge credits with movies
movies = movies.merge(credits, on='id')
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,Father of the Bride Collection,0,[Comedy],11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [10]:
# filter release_date so that it only contains the year
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies['release_date'].head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: release_date, dtype: object

## Use rating system to get recommendations based on imdb rating calculation
Score = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$
where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report

In [11]:
m = movies['vote_count'].quantile(0.9)
C = movies['vote_average'].mean()

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [12]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],[United States of America],1995,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,Father of the Bride Collection,0,[Comedy],11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [13]:
# set voting avg and vote count to int
qm = movies[(movies['vote_count'] >= 100) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
qm['vote_average'] = qm['vote_average'].astype('int64') 
qm['vote_count'] = qm['vote_count'].astype('int64') 

In [14]:
# display top 10 movies based on weighted rating
qm['weighted_rating'] = qm.apply(weighted_rating, axis=1)
top_movies = qm.sort_values('weighted_rating', ascending=False)
qm.drop('weighted_rating', axis=1, inplace=True) # remove weighted_rating column
top_movies[['title', 'vote_count', 'vote_average', 'weighted_rating', 'popularity']].head(10)

Unnamed: 0,title,vote_count,vote_average,weighted_rating,popularity
10397,Dilwale Dulhania Le Jayenge,661,9,8.353035,34.457024
15651,Inception,14075,8,7.97382,29.108149
12589,The Dark Knight,12269,8,7.970014,123.167259
23076,Interstellar,11187,8,7.967154,32.213481
2870,Fight Club,9678,8,7.962114,63.869599
4904,The Lord of the Rings: The Fellowship of the Ring,8892,8,7.958823,32.070725
292,Pulp Fiction,8670,8,7.957787,140.950236
314,The Shawshank Redemption,8358,8,7.95624,51.645403
7069,The Lord of the Rings: The Return of the King,8226,8,7.955551,29.324358
351,Forrest Gump,8147,8,7.955128,48.307194


Clearly a trivial way to simply recommend movies that seem to be universally liked by IMDB users.

## Create recommendation based on movie embeddings and cosin similarity

In the following example we use the overview of the movie as the key criteria for our cosin similarity scores. First vectorize the overview into a count vectorizer, then create a cosin similarity matrix and try to create a mapping between each title and it's corresponding pairs. Then sort by whatever is closest.

In [15]:
qm.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995,373554033.0,81.0,[English],Released,,Toy Story,False,7,5415,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6,2413,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
4,False,Father of the Bride Collection,0,[Comedy],11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5,173,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
5,False,,60000000,"[Action, Crime, Drama, Thriller]",949,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,"[Regency Enterprises, Forward Pass, Warner Bros.]",[United States of America],1995,187436818.0,170.0,"[English, Español]",Released,A Los Angeles Crime Saga,Heat,False,7,1886,"[robbery, detective, bank, obsession, chase, s...","[Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
6,False,,58000000,"[Comedy, Romance]",11860,en,Sabrina,An ugly duckling having undergone a remarkable...,6.677277,"[Paramount Pictures, Scott Rudin Productions, ...","[Germany, United States of America]",1995,0.0,127.0,"[Français, English]",Released,You are cordially invited to the most surprisi...,Sabrina,False,6,141,"[paris, brother brother relationship, chauffeu...","[Harrison Ford, Julia Ormond, Greg Kinnear, An...","[{'credit_id': '52fe44959251416c75039da9', 'de..."


In [16]:
# convert all cast, crew, genres, keywords, production_companies, production_countries, spoken_languages, belongs_to_collection to lowercase
qm['cast'] = qm['cast'].apply(lambda x: [str.lower(i) for i in x])
qm['crew'] = qm['crew'].apply(lambda x: [str.lower(i) for i in x])
qm['genres'] = qm['genres'].apply(lambda x: [str.lower(i) for i in x])
qm['keywords'] = qm['keywords'].apply(lambda x: [str.lower(i) for i in x])
qm['production_companies'] = qm['production_companies'].apply(lambda x: [str.lower(i) for i in x])
qm['production_countries'] = qm['production_countries'].apply(lambda x: [str.lower(i) for i in x])
qm['spoken_languages'] = qm['spoken_languages'].apply(lambda x: [str.lower(i) for i in x])
# convert belongs_to_collection to lowercase and string
qm['belongs_to_collection'] = qm['belongs_to_collection'].astype('str')
qm['belongs_to_collection'] = qm['belongs_to_collection'].apply(lambda x: str.lower(x))

In [17]:
# convert all overview, tagline, title, original_title to lowercase and string and remove all non-ascii characters
qm['overview'] = qm['overview'].astype('str')
qm['overview'] = qm['overview'].apply(lambda x: str.lower(x))
qm['tagline'] = qm['tagline'].astype('str')
qm['tagline'] = qm['tagline'].apply(lambda x: str.lower(x))
qm['title'] = qm['title'].astype('str')
qm['title'] = qm['title'].apply(lambda x: str.lower(x))
qm['original_title'] = qm['original_title'].astype('str')
qm['original_title'] = qm['original_title'].apply(lambda x: str.lower(x))
qm.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,toy story collection,30000000,"[animation, comedy, family]",862,en,toy story,"led by woody, andy's toys live happily in his ...",21.946943,[pixar animation studios],[united states of america],1995,373554033.0,81.0,[english],Released,,toy story,False,7,5415,"[jealousy, toy, boy, friendship, friends, riva...","[tom hanks, tim allen, don rickles, jim varney...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , ..."
1,False,,65000000,"[adventure, fantasy, family]",8844,en,jumanji,when siblings judy and peter discover an encha...,17.015539,"[tristar pictures, teitler film, interscope co...",[united states of america],1995,262797249.0,104.0,"[english, français]",Released,roll the dice and unleash the excitement!,jumanji,False,6,2413,"[board game, disappearance, based on children'...","[robin williams, jonathan hyde, kirsten dunst,...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , ..."
4,False,father of the bride collection,0,[comedy],11862,en,father of the bride part ii,just when george banks has recovered from his ...,8.387519,"[sandollar productions, touchstone pictures]",[united states of america],1995,76578911.0,106.0,[english],Released,just when his world is back to normal... he's ...,father of the bride part ii,False,5,173,"[baby, midlife crisis, confidence, aging, daug...","[steve martin, diane keaton, martin short, kim...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , ..."
5,False,,60000000,"[action, crime, drama, thriller]",949,en,heat,"obsessive master thief, neil mccauley leads a ...",17.924927,"[regency enterprises, forward pass, warner bros.]",[united states of america],1995,187436818.0,170.0,"[english, español]",Released,a los angeles crime saga,heat,False,7,1886,"[robbery, detective, bank, obsession, chase, s...","[al pacino, robert de niro, val kilmer, jon vo...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , ..."
6,False,,58000000,"[comedy, romance]",11860,en,sabrina,an ugly duckling having undergone a remarkable...,6.677277,"[paramount pictures, scott rudin productions, ...","[germany, united states of america]",1995,0.0,127.0,"[français, english]",Released,you are cordially invited to the most surprisi...,sabrina,False,6,141,"[paris, brother brother relationship, chauffeu...","[harrison ford, julia ormond, greg kinnear, an...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , ..."


In [18]:
qm.drop('video', axis=1, inplace=True)
# remove word 'collection' from belongs_to_collection
qm['belongs_to_collection'] = qm['belongs_to_collection'].apply(lambda x: x.replace('collection', ''))

In [19]:
qm['cast'] = qm['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
qm['crew'] = qm['crew'].apply(lambda x: [i.replace(' ', '') for i in x])
qm['keywords'] = qm['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])
qm.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords,cast,crew
0,False,toy story,30000000,"[animation, comedy, family]",862,en,toy story,"led by woody, andy's toys live happily in his ...",21.946943,[pixar animation studios],[united states of america],1995,373554033.0,81.0,[english],Released,,toy story,7,5415,"[jealousy, toy, boy, friendship, friends, riva...","[tomhanks, timallen, donrickles, jimvarney, wa...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , '..."
1,False,,65000000,"[adventure, fantasy, family]",8844,en,jumanji,when siblings judy and peter discover an encha...,17.015539,"[tristar pictures, teitler film, interscope co...",[united states of america],1995,262797249.0,104.0,"[english, français]",Released,roll the dice and unleash the excitement!,jumanji,6,2413,"[boardgame, disappearance, basedonchildren'sbo...","[robinwilliams, jonathanhyde, kirstendunst, br...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , '..."
4,False,father of the bride,0,[comedy],11862,en,father of the bride part ii,just when george banks has recovered from his ...,8.387519,"[sandollar productions, touchstone pictures]",[united states of america],1995,76578911.0,106.0,[english],Released,just when his world is back to normal... he's ...,father of the bride part ii,5,173,"[baby, midlifecrisis, confidence, aging, daugh...","[stevemartin, dianekeaton, martinshort, kimber...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , '..."
5,False,,60000000,"[action, crime, drama, thriller]",949,en,heat,"obsessive master thief, neil mccauley leads a ...",17.924927,"[regency enterprises, forward pass, warner bros.]",[united states of america],1995,187436818.0,170.0,"[english, español]",Released,a los angeles crime saga,heat,7,1886,"[robbery, detective, bank, obsession, chase, s...","[alpacino, robertdeniro, valkilmer, jonvoight,...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , '..."
6,False,,58000000,"[comedy, romance]",11860,en,sabrina,an ugly duckling having undergone a remarkable...,6.677277,"[paramount pictures, scott rudin productions, ...","[germany, united states of america]",1995,0.0,127.0,"[français, english]",Released,you are cordially invited to the most surprisi...,sabrina,6,141,"[paris, brotherbrotherrelationship, chauffeur,...","[harrisonford, juliaormond, gregkinnear, angie...","[[, {, ', c, r, e, d, i, t, _, i, d, ', :, , '..."


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# use count vectorizer to create a matrix of n-gram vectors
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(qm['overview'])

from sklearn.metrics.pairwise import cosine_similarity

# compute the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# reset index of our main DataFrame and construct reverse mapping as before
new_qm = qm.reset_index()
indices = pd.Series(new_qm.index, index=new_qm['title'])

# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # transform title to lowercase 
    title = title.lower()

    if title not in indices:
        return "Movie not found in the dataset."

    # get the index of the movie that matches the title
    idx = indices[title]

    # get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # print(movie_indices)

    # return the top 10 most similar movies
    return new_qm['title'].iloc[movie_indices] #

get_recommendations('The Dark Knight Rises')

[63, 241, 3803, 3271, 562, 4421, 4123, 5682, 5683, 2566]


63                               batman forever
241                                      batman
3803                 batman: under the red hood
3271                            the dark knight
562                              batman returns
4421    batman: the dark knight returns, part 1
4123                           batman: year one
5682                          batman: bad blood
5683                          batman: bad blood
2566         batman beyond: return of the joker
Name: title, dtype: object