# Information retrieval for movies recommendation

Database which the project it's based on:   
[HBO Max](https://www.kaggle.com/datasets/dgoenrique/hbo-max-movies-and-tv-shows)  

<div></div> 

In [2]:
import numpy as np 
import pandas as pd
from toolbox import preprocessing
import ast

<div></div> 

## Leitura dos Arquivos

As bases vieram em formato CSV, portanto, só foi utilizado o pandas para leitura e feito um concat

<div></div> 

In [3]:
# load
try: 
    pop_movies = pd.read_csv('../data/modified/popular_movies.csv.zip')['tmdbId']
except: 
# Load the "rating" and "link" with the id from multiple sources
    ratings = pd.read_csv('../data/origin/ratings.zip')
    links = pd.read_csv('../data/origin/links.zip')

    # Inner join between both files
    ratings = ratings.merge(links, how='inner', on='movieId')

    # Select only the movies with at least 750 reviews, to classify as popular enough for recommendation
    pop_movies = ratings['movieId'].value_counts().to_frame().query('count > 250').index
    pop_movies = links.query('movieId in @pop_movies')['tmdbId'].dropna()

    pop_movies.to_csv('../data/modified/popular_movies.csv.zip', index=False, compression='zip')

    del ratings, links

In [4]:
# Load the 'credits' dataset from a zipped CSV file
dt_c = pd.read_csv('../data/origin/credits.zip')

# Load the 'movies_metadata' dataset from a zipped CSV file
dt_m = pd.read_csv('../data/origin/movies_metadata.zip')

# Convert the 'id' column to numeric data type, ignoring any errors
dt_m['id'] = pd.to_numeric(dt_m['id'], errors='coerce')

# Convert the 'popularity' column to numeric data type, ignoring any errors
dt_m['popularity'] = pd.to_numeric(dt_m['popularity'], errors='coerce')

# Merge the 'movies_metadata' DataFrame with the 'credits' DataFrame based on the 'id' column
dt_m = dt_m.merge(dt_c.set_index('id'), how='left', left_on=['id'], right_index=True)

# Drop rows with missing values in the 'id' column
dt_m.dropna(subset=['id', 'overview'], inplace=True)

# Select the movies with the minimun engagement
dt_m.query('id in @pop_movies', inplace=True)

# Reset index 
dt_m.reset_index(drop=True, inplace=True)

# Delete the 'credits' DataFrame to free up memory
del dt_c

dt_m.drop_duplicates(subset=['imdb_id'], inplace=True, ignore_index=True)

  dt_m = pd.read_csv('../data/origin/movies_metadata.zip')


In [5]:
# Define the variables
v = 'vote_count'  # Vote count column
m = 'vote_count.quantile(0.85)'  # Quantile of vote count
R = 'vote_average'  # Vote average column
C = 'vote_average.mean()'  # Mean of vote average

# Evaluate the score using the defined variables and assign it to a new column 'score'
dt_m.eval(f'score = ({v}/({v}+{m}) * {R}) + ({m}/({m}+{v}) * {C})', inplace=True)

del v, m, R, C

<div></div> 

## Pré-Processamento de Texto

<div></div> 


### Remoção de palavras e transformação de minúsculos

In [6]:
# Removing special characters and numbers from the 'overview' column
dt_m['p_overview'] = dt_m['overview'].replace(r'([^\w\s]|\d+)', ' ', regex=True)
# Applying lowercase and strip simultaneously to the 'p_overview' column
dt_m['p_overview'] = dt_m['p_overview'].apply(lambda x: x.lower().strip() if isinstance(x, str) else x)
# Extracting lowercase genre names from the 'genres' column and replacing spaces with underscores
dt_m['p_genres'] = dt_m['genres'].map(lambda genres: [genre.get('name').lower().replace(' ', '_') for genre in ast.literal_eval(genres)])

# Removing stopwords from the 'p_overview' column
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.remove_stopwords)
# Lemmatizing the text in the 'p_overview' column
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.lemmatize_text)
# Tokenizing the words in the 'p_overview' column
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.word_tokenize)

# Filtering out rows where 'p_overview' has a length greater than 0
dt_m.query('p_overview.str.len() > 0', inplace=True)
# Checking the shape of the DataFrame after filtering rows where 'p_genres' has a length greater than 0
dt_m.query('p_genres.str.len() > 0', inplace=True)

# Resetting the index of the DataFrame to ensure it starts from 0
dt_m.reset_index(drop=True, inplace=True)


### TF-IDF from Corpus

In [7]:
from gensim import corpora
from gensim import models
import numpy as np

data = dt_m['p_overview'].to_list()

# Create a dictionary based on the 'p_overview' data
dictionary = corpora.Dictionary(data)

# Convert the data into Bag of Words (BoW) representation
bow_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in data]

# Word weight in Bag of Words corpus
word_weight = []

# Iterate over each document in the BoW corpus
for doc in bow_corpus:
    # Extract the word ID and frequency for each word in the document
    for id, freq in doc:
        word_weight.append([dictionary[id], freq])

# Create a TF-IDF model based on the BoW corpus
tfIdf = models.TfidfModel(bow_corpus, smartirs='nfc')

# TF-IDF Word Weight
weight_tfidf = []
# Iterate over each document in the TF-IDF representation
for doc in tfIdf[bow_corpus]:
    # Extract the word ID and TF-IDF weight for each word in the document
    for id, freq in doc:
        weight_tfidf.append([dictionary[id], np.around(freq, decimals=3)])

# Identify words to remove based on their TF-IDF weight
remove = pd.Series([x[0] for x in weight_tfidf if x[1] <= 0.04]).unique()
remove = set(remove)

# Filter out the words to be removed from the 'p_overview' column of dt_m
dt_m['p_overview'] = dt_m['p_overview'].map(lambda words: [x for x in words if x not in remove])


# Using a Model for Information Retrieval

## Similarity by description

In [8]:
from gensim.models import KeyedVectors

# Load pre-trained word vectors using Word2Vec format
model = KeyedVectors.load_word2vec_format('../models/numberbatch-en.txt')

# Create a set of vocabulary words from the loaded model
index_set = set(model.index_to_key)

# Filter out words in 'p_overview' that are not present in the vocabulary set
dt_m['p_overview'] = dt_m['p_overview'].map(lambda words: [w for w in words if w in index_set])


In [9]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

query = dt_m['p_overview'][0]

query_vector = sum(model[word] for word in query) / len(query)

dt_m['embed_docs'] = dt_m['p_overview'].map(lambda doc: sum(model[word] for word in doc) / len(doc))

similarities = dt_m['embed_docs'].map(lambda doc: cosine_similarity(query_vector.reshape(1, -1) , doc.reshape(1, -1))).sort_values(ascending=False)[:100]

sim = {i: v[0][0] for i, v in zip(similarities.index, similarities.values)}


In [10]:
dt_m.iloc[list(sim.keys()),:].sort_values(by=['score', 'popularity'], ascending=False)[:10]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,title,video,vote_average,vote_count,cast,crew,score,p_overview,p_genres,embed_docs
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.500085,"[led, woody, andy, toys, happily, room, until,...","[animation, comedy, family]","[-0.066558816, -0.06601177, 0.03052059, 0.0370..."
6128,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,Toy Story 3,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.393455,"[woody, buzz, rest, andy, toys, haven, played,...","[animation, family, comedy]","[-0.05681363, -0.065113634, 0.039022733, 0.052..."
2325,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story-2,863.0,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,Toy Story 2,False,7.3,3914.0,"[{'cast_id': 18, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8025073', 'de...",7.122641,"[andy, heads, cowboy, camp, leaving, toys, dev...","[animation, comedy, family]","[-0.053141665, -0.04976667, 0.03131111, 0.0481..."
5931,False,,7500000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://500days.com,19913.0,tt1022603,en,(500) Days of Summer,"Tom (Joseph Gordon-Levitt), greeting-card writ...",...,(500) Days of Summer,False,7.2,2993.0,"[{'cast_id': 4, 'character': 'Tom Hansen', 'cr...","[{'credit_id': '52fe47f99251416c750abaa5', 'de...",7.007609,"[tom, joseph, gordon, levitt, greeting, card, ...","[comedy, drama, romance]","[-0.059767745, -0.083354846, 0.019109681, -0.0..."
2226,False,,70000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 16, 'na...",,856.0,tt0096438,en,Who Framed Roger Rabbit,'Toon star Roger is worried that his wife Jess...,...,Who Framed Roger Rabbit,False,7.2,1466.0,"[{'cast_id': 17, 'character': 'Eddie Valiant',...","[{'credit_id': '52fe4282c3a36847f80249a9', 'de...",6.889856,"[toon, star, roger, worried, jessica, playing,...","[fantasy, animation, comedy, crime, family]","[-0.04540769, -0.07091923, 0.020792305, 0.0202..."
845,False,,1500000,"[{'id': 18, 'name': 'Drama'}]",,221.0,tt0048545,en,Rebel Without a Cause,"After moving to a new town, troublemaking teen...",...,Rebel Without a Cause,False,7.6,351.0,"[{'cast_id': 6, 'character': 'Jim Stark', 'cre...","[{'credit_id': '52fe4228c3a36847f8008749', 'de...",6.740996,"[moving, town, troublemaking, teen, jim, stark...",[drama],"[-0.05226364, -0.07014319, 0.0114068175, 0.007..."
1459,False,"{'id': 10453, 'name': 'Poltergeist Collection'...",10700000,"[{'id': 27, 'name': 'Horror'}]",,609.0,tt0084516,en,Poltergeist,"Steve Freeling lives with his wife, Diane, and...",...,Poltergeist,False,7.1,811.0,"[{'cast_id': 21, 'character': 'Steve Freeling'...","[{'credit_id': '52fe425dc3a36847f8018967', 'de...",6.735367,"[steve, diane, children, dana, robbie, carol, ...",[horror],"[-0.037136108, -0.06258333, 0.015158336, 0.033..."
692,False,,325000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,3078.0,tt0025316,en,It Happened One Night,Ellie Andrews has just tied the knot with soci...,...,It Happened One Night,False,7.7,283.0,"[{'cast_id': 10, 'character': 'Peter Warne', '...","[{'credit_id': '52fe4383c3a36847f8059c33', 'de...",6.718043,"[ellie, andrews, tied, knot, society, aviator,...","[comedy, romance]","[-0.047841664, -0.07986041, 0.03252917, 0.0334..."
3496,False,,22000000,"[{'id': 18, 'name': 'Drama'}]",,10950.0,tt0277027,en,I Am Sam,Sam has the mental capacity of a 7-year-old. H...,...,I Am Sam,False,7.2,542.0,"[{'cast_id': 1, 'character': 'Sam Dawson', 'cr...","[{'credit_id': '52fe43d89251416c7502045d', 'de...",6.707429,"[sam, mental, capacity, homeless, abandons, le...",[drama],"[-0.06321563, -0.06864375, -0.0142250005, 0.02..."
2555,False,,30000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,243.0,tt0146882,en,High Fidelity,When record store owner Rob Gordon gets dumped...,...,High Fidelity,False,7.0,636.0,"[{'cast_id': 7, 'character': 'Rob Gordon', 'cr...","[{'credit_id': '52fe422cc3a36847f8009919', 'de...",6.658839,"[record, store, owner, rob, gordon, dumped, gi...","[comedy, drama, romance, music]","[-0.060336668, -0.08761667, -0.0049099987, 0.0..."


## Similarity by genres

In [11]:
dt_m['embed_genres'] = dt_m['p_genres'].map(lambda doc: sum(model[word] for word in doc) / len(doc))

## Similarity by Cast

In [38]:
a = ast.literal_eval(dt_m['crew'][0])

In [41]:
[x.get('name').lower().replace(' ', '_') for x in a if x.get('job').lower()=='director']

['john_lasseter']

In [None]:
def get_director(crew): 
    

In [44]:
dt_m['director'] = dt_m['crew'].map(lambda crew: [x.get('name').lower().replace(' ', '_') for x in ast.literal_eval(crew) if x.get('job').lower()=='director'])

In [79]:
dt_m['director'][7106]

['tom_hanks',
 'david_frankel',
 'phil_alden_robinson',
 'mikael_salomon',
 'richard_loncraine',
 'david_nutter',
 'david_leland',
 'tony_to']

In [87]:
dt_m['director'][0][0]

'john_lasseter'

In [99]:
dt_m['director'].map(lambda dir: ast.List(dir))

TypeError: 'List' object is not subscriptable

In [25]:
[x.get('name').lower().replace(' ', '_') for x in a if x.get('job').lower()=='director']

['John Lasseter']

In [69]:
def find_similar(title, embed_model, database:pd.DataFrame = dt_m): 
    
    movie_data = database.query(f"title=='{title}'")
    
    # only search for movies in the same genre
    mask = database['p_genres'].map(lambda genres: any(set(genres).intersection(movie_data['p_genres'][0])))

    database.query('@mask', inplace=True)

    embed_query = sum(model[word] for word in movie_data['p_overview'])[0] / len(movie_data['p_overview'][0])

    ov_similarity = database['embed_docs'].map(lambda doc: cosine_similarity(embed_query.reshape(1, -1) , doc.reshape(1, -1))).sort_values(ascending=False)[:500]
    
    ov_similarity = {i: j[0][0] for i, j in zip(ov_similarity.index, ov_similarity.values)}

    return database.query('@ov_similarity.keys()').sort_values(by='score')


find_similar('Toy Story', embed_model=model)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,cast,crew,score,p_overview,p_genres,embed_docs,embed_genres
5415,False,,20000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9760.0,tt0799949,en,Epic Movie,"When Edward, Peter, Lucy and Susan each follow...",...,False,3.2,334.0,"[{'cast_id': 1, 'character': 'Edward', 'credit...","[{'credit_id': '56348a269251412857016e9e', 'de...",5.651993,"[edward, peter, lucy, susan, follow, path, fin...","[action, adventure, comedy]","[-0.02639697, -0.05262121, 0.050836366, 0.0097...","[-0.064166665, -0.0864, -0.0491, -0.0034999996..."
5386,False,,100000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",http://www.eragonmovie.com/,2486.0,tt0449010,en,Eragon,"In his homeland of Alagaesia, a farm boy happe...",...,False,4.9,990.0,"[{'cast_id': 1, 'character': 'Eragon', 'credit...","[{'credit_id': '52fe435ac3a36847f804de45', 'de...",5.688890,"[homeland, farm, happens, upon, dragon, egg, d...","[fantasy, action, adventure, family]","[-0.050307143, -0.06353571, -0.0042571416, 0.0...","[-0.048224997, -0.060825, -0.018399999, -0.005..."
6931,False,,88000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",http://www.pixels-movie.com/,257344.0,tt2120120,en,Pixels,Video game experts are recruited by the milita...,...,False,5.6,2564.0,"[{'cast_id': 17, 'character': 'Sam Brenner', '...","[{'credit_id': '530f7120c3a36874ca001178', 'de...",5.842829,"[video, game, experts, recruited, military, er...","[action, comedy, science_fiction]","[-0.040050004, -0.0658, -0.042683333, 0.056441...","[-0.07173333, -0.040599998, -0.0806, -0.018666..."
4070,False,"{'id': 86029, 'name': ""Charlie's Angels Collec...",120000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9471.0,tt0305357,en,Charlie's Angels: Full Throttle,The Angels are charged with finding a pair of ...,...,False,5.2,930.0,"[{'cast_id': 1, 'character': 'Natalie Cook', '...","[{'credit_id': '52fe44fbc3a36847f80b5bd7', 'de...",5.855481,"[angels, charged, finding, pair, missing, ring...","[action, adventure, comedy]","[-0.07086666, -0.045490474, -0.050476193, 0.04...","[-0.064166665, -0.0864, -0.0491, -0.0034999996..."
6632,False,,130000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",http://www.ripd.com/,49524.0,tt0790736,en,R.I.P.D.,A recently slain cop joins a team of undead po...,...,False,5.4,1280.0,"[{'cast_id': 3, 'character': 'Roy Pulsipher', ...","[{'credit_id': '570b93599251412c740021f0', 'de...",5.866993,"[recently, slain, cop, joins, team, undead, po...","[fantasy, action, comedy, crime]","[-0.028838893, -0.07171667, -0.022555554, 0.03...","[-0.0736, -0.079375, -0.05555, -0.030749999, -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,False,,1800000,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,935.0,tt0057012,en,Dr. Strangelove or: How I Learned to Stop Worr...,Insane General Jack D. Ripper initiates a nucl...,...,False,8.0,1472.0,"[{'cast_id': 13, 'character': 'Group Capt. Lio...","[{'credit_id': '52fe4290c3a36847f802876d', 'de...",7.361470,"[insane, general, jack, d, ripper, initiates, ...","[drama, comedy, war]","[-0.046969235, -0.056956418, -0.030351281, 0.0...","[-0.035833333, -0.09153333, -0.042766664, -0.0..."
6128,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.393455,"[woody, buzz, rest, andy, toys, haven, played,...","[animation, family, comedy]","[-0.05681363, -0.065113634, 0.039022733, 0.052...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.500085,"[led, woody, andy, toys, happily, room, until,...","[animation, comedy, family]","[-0.066558816, -0.06601177, 0.03052059, 0.0370...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
5707,False,,180000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/disneypictures/wall-e/,10681.0,tt0910970,en,WALL·E,WALL·E is the last robot left on an Earth that...,...,False,7.8,6439.0,"[{'cast_id': 12, 'character': 'WALL·E / M-O (v...","[{'credit_id': '52fe43a29251416c75018101', 'de...",7.613720,"[wall, e, last, robot, left, earth, overrun, g...","[animation, family]","[-0.06863334, -0.03866945, 0.01928611, 0.05068...","[-0.014, 0.01655, -0.017599998, -0.01735, -0.1..."


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,cast,crew,score,p_overview,p_genres,embed_docs,embed_genres
6128,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.393455,"[woody, buzz, rest, andy, toys, haven, played,...","[animation, family, comedy]","[-0.05681363, -0.065113634, 0.039022733, 0.052...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
2325,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story-2,863.0,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,False,7.3,3914.0,"[{'cast_id': 18, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8025073', 'de...",7.122641,"[andy, heads, cowboy, camp, leaving, toys, dev...","[animation, comedy, family]","[-0.053141665, -0.04976667, 0.03131111, 0.0481...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
1346,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,15513.0,tt0120723,en,Kissing a Fool,"Max (David Schwimmer), an alpha-male commitmen...",...,False,5.9,16.0,"[{'cast_id': 1, 'character': 'Max Abbitt', 'cr...","[{'credit_id': '52fe46639251416c75076af5', 'de...",6.439564,"[max, david, schwimmer, alpha, male, commitmen...","[comedy, romance]","[-0.05744562, -0.06525087, -0.01512807, 0.0165...","[0.0026000002, -0.069800004, -0.0206, -0.1107,..."
237,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,40490.0,tt0113755,en,Man of the House,"Ben Archer is not happy. His mother, Sandy, ha...",...,False,5.1,40.0,"[{'cast_id': 2, 'character': 'Sandy Archer', '...","[{'credit_id': '52fe4594c3a36847f80cf2d7', 'de...",6.397512,"[ben, archer, happy, sandy, met, looks, things...","[action, comedy, family]","[-0.066242866, -0.08415, 0.024992859, 0.000971...","[-0.045366663, -0.052233335, -0.039633334, -0...."
2455,False,,25000000,"[{'id': 35, 'name': 'Comedy'}]",,1831.0,tt0156841,en,The Next Best Thing,A comedy-drama about best friends - one a stra...,...,False,4.3,29.0,"[{'cast_id': 3, 'character': 'Abbie Reynolds',...","[{'credit_id': '52fe4318c3a36847f8039dff', 'de...",6.389074,"[comedy, drama, straight, abbie, gay, robert, ...",[comedy],"[-0.04997826, -0.08250869, 0.01794348, 0.01859...","[-0.0239, -0.0659, -0.0171, -0.0843, -0.0926, ..."
2080,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...",,10552.0,tt0088889,en,Cat's Eye,Three short stories by shock-meister Stephen K...,...,False,5.9,140.0,"[{'cast_id': 1, 'character': 'Amanda', 'credit...","[{'credit_id': '52fe43869251416c75013d3d', 'de...",6.382326,"[short, stories, shock, meister, stephen, king...","[comedy, thriller, horror]","[-0.052583344, -0.06680237, 0.027269045, 0.010...","[-0.044033334, -0.0675, 0.004166667, -0.105433..."
5087,False,,26000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.universalstudiosentertainment.com/t...,6957.0,tt0405422,en,The 40 Year Old Virgin,Andy Stitzer has a pleasant life with a nice a...,...,False,6.2,2020.0,"[{'cast_id': 1, 'character': 'Andy Stitzer', '...","[{'credit_id': '52fe446ac3a36847f8094c49', 'de...",6.283676,"[andy, pleasant, nice, apartment, job, stampin...","[comedy, romance]","[-0.057471428, -0.0701619, 0.0024571428, 0.028...","[0.0026000002, -0.069800004, -0.0206, -0.1107,..."
3602,False,,43000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",http://www.sonypictures.com/movies/thesweetest...,11812.0,tt0253867,en,The Sweetest Thing,Christina's love life is stuck in neutral. Aft...,...,False,5.3,286.0,"[{'cast_id': 10, 'character': 'Christina Walte...","[{'credit_id': '52fe448a9251416c75038579', 'de...",6.198267,"[christina, stuck, neutral, avoiding, hazards,...","[romance, comedy]","[-0.05996875, -0.07875626, 0.0074375, 0.040274...","[0.0026000002, -0.069800004, -0.0206, -0.1107,..."
5148,False,"{'id': 298820, 'name': 'American Pie (Spin-off...",10000000,"[{'id': 35, 'name': 'Comedy'}]",http://www.americanreunionmovie.com/,8274.0,tt0436058,en,American Pie Presents: Band Camp,The original American Pie characters have move...,...,False,5.3,553.0,"[{'cast_id': 7, 'character': 'Mr. Levenstein',...","[{'credit_id': '52fe449ac3a36847f809f9f7', 'de...",6.046653,"[original, american, pie, characters, moved, e...",[comedy],"[-0.055151217, -0.075946346, 0.029085366, 0.02...","[-0.0239, -0.0659, -0.0171, -0.0843, -0.0926, ..."


In [42]:
dt_m.query('@s')

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,cast,crew,score,p_overview,p_genres,embed_docs,embed_genres
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.500085,"[led, woody, andy, toys, happily, room, until,...","[animation, comedy, family]","[-0.066558816, -0.06601177, 0.03052059, 0.0370...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
6128,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.393455,"[woody, buzz, rest, andy, toys, haven, played,...","[animation, family, comedy]","[-0.05681363, -0.065113634, 0.039022733, 0.052...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
2325,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story-2,863.0,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,False,7.3,3914.0,"[{'cast_id': 18, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8025073', 'de...",7.122641,"[andy, heads, cowboy, camp, leaving, toys, dev...","[animation, comedy, family]","[-0.053141665, -0.04976667, 0.03131111, 0.0481...","[-0.0173, -0.010933332, -0.017433332, -0.03966..."
5148,False,"{'id': 298820, 'name': 'American Pie (Spin-off...",10000000,"[{'id': 35, 'name': 'Comedy'}]",http://www.americanreunionmovie.com/,8274.0,tt0436058,en,American Pie Presents: Band Camp,The original American Pie characters have move...,...,False,5.3,553.0,"[{'cast_id': 7, 'character': 'Mr. Levenstein',...","[{'credit_id': '52fe449ac3a36847f809f9f7', 'de...",6.046653,"[original, american, pie, characters, moved, e...",[comedy],"[-0.055151217, -0.075946346, 0.029085366, 0.02...","[-0.0239, -0.0659, -0.0171, -0.0843, -0.0926, ..."
5087,False,,26000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.universalstudiosentertainment.com/t...,6957.0,tt0405422,en,The 40 Year Old Virgin,Andy Stitzer has a pleasant life with a nice a...,...,False,6.2,2020.0,"[{'cast_id': 1, 'character': 'Andy Stitzer', '...","[{'credit_id': '52fe446ac3a36847f8094c49', 'de...",6.283676,"[andy, pleasant, nice, apartment, job, stampin...","[comedy, romance]","[-0.057471428, -0.0701619, 0.0024571428, 0.028...","[0.0026000002, -0.069800004, -0.0206, -0.1107,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2601,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,16347.0,tt0051383,en,Auntie Mame,Ten-year-old orphan Patrick Dennis has come to...,...,False,6.4,28.0,"[{'cast_id': 20, 'character': 'Mame Dennis', '...","[{'credit_id': '52fe46c89251416c750836ab', 'de...",6.446685,"[ten, orphan, patrick, dennis, come, nearest, ...","[comedy, drama]","[-0.038778257, -0.06975654, 0.01971884, 0.0029...","[-0.04195, -0.078999996, -0.03995, -0.0787, -0..."
4764,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,18701.0,tt0340012,en,Being Julia,"Julia Lambert is a true diva: beautiful, talen...",...,False,6.4,23.0,"[{'cast_id': 1, 'character': 'Jimmie Langton',...","[{'credit_id': '52fe47929251416c7509e4a3', 'de...",6.446906,"[julia, lambert, true, diva, beautiful, talent...","[comedy, drama, romance]","[-0.07180445, -0.07688001, 0.019824447, -0.004...","[-0.018266665, -0.07723334, -0.034666665, -0.0..."
2373,False,,7000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://mydogskip.warnerbros.com/,17908.0,tt0156812,en,My Dog Skip,A shy boy is unable to make friends in Yazoo C...,...,False,6.5,71.0,"[{'cast_id': 1, 'character': 'Willie Morris', ...","[{'credit_id': '52fe47519251416c750951a5', 'de...",6.451315,"[shy, unable, make, yazoo, city, mississippi, ...","[comedy, drama, family]","[-0.02927188, -0.06497812, 0.023803126, 0.0190...","[-0.017033333, -0.04803333, -0.015099999, -0.0..."
6248,False,"{'id': 9888, 'name': 'Home Alone Collection', ...",0,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",,12536.0,tt0329200,en,Home Alone 4,Kevin McCallister's parents have split up. Now...,...,False,3.9,239.0,"[{'cast_id': 1, 'character': 'Marv Merchants',...","[{'credit_id': '52fe44f69251416c75046f77', 'de...",5.967664,"[kevin, mccallister, parents, split, living, m...","[crime, comedy, family]","[-0.013316001, -0.065524, 0.026904006, 0.00622...","[-0.019833336, -0.045, -0.024400001, -0.057066..."


In [None]:
similarities = cosine_similarity(query_vector.reshape(1, -1), document_vectors[10].reshape(1, -1))

In [None]:

# Calculate cosine similarity between query vector and document vectors
similarities = cosine_similarity(query_vector.reshape(1, -1), document_vectors)

# Get the indices of the most similar documents
most_similar_indices = similarities.argsort()[0][::-1]

# Retrieve the most similar documents
most_similar_documents = [corpus[i] for i in most_similar_indices]


NameError: name 'document_vectors' is not defined