# Information retrieval for movies recommendation

Database which the project it's based on:   
[HBO Max](https://www.kaggle.com/datasets/dgoenrique/hbo-max-movies-and-tv-shows)  

<div></div> 

In [1]:
import numpy as np 
import pandas as pd
from toolbox import preprocessing

<div></div> 

## Leitura dos Arquivos

As bases vieram em formato CSV, portanto, só foi utilizado o pandas para leitura e feito um concat

<div></div> 

In [2]:
# load
try: 
    pop_movies = pd.read_csv('../data/modified/popular_movies.csv.zip')['tmdbId']
except: 
# Load the "rating" and "link" with the id from multiple sources
    ratings = pd.read_csv('../data/origin/ratings.zip')
    links = pd.read_csv('../data/origin/links.zip')

    # Inner join between both files
    ratings = ratings.merge(links, how='inner', on='movieId')

    # Select only the movies with at least 750 reviews, to classify as popular enough for recommendation
    pop_movies = ratings['movieId'].value_counts().to_frame().query('count > 250').index
    pop_movies = links.query('movieId in @pop_movies')['tmdbId'].dropna()

    pop_movies.to_csv('../data/modified/popular_movies.csv.zip', index=False, compression='zip')

    del ratings, links

In [3]:
# Load the 'credits' dataset from a zipped CSV file
dt_c = pd.read_csv('../data/origin/credits.zip')

# Load the 'movies_metadata' dataset from a zipped CSV file
dt_m = pd.read_csv('../data/origin/movies_metadata.zip')

# Convert the 'id' column to numeric data type, ignoring any errors
dt_m['id'] = pd.to_numeric(dt_m['id'], errors='coerce')

# Convert the 'popularity' column to numeric data type, ignoring any errors
dt_m['popularity'] = pd.to_numeric(dt_m['popularity'], errors='coerce')

# Merge the 'movies_metadata' DataFrame with the 'credits' DataFrame based on the 'id' column
dt_m = dt_m.merge(dt_c.set_index('id'), how='left', left_on=['id'], right_index=True)

# Drop rows with missing values in the 'id' column
dt_m.dropna(subset=['id', 'overview'], inplace=True)

# Select the movies with the minimun engagement
dt_m.query('id in @pop_movies', inplace=True)

# Reset index 
dt_m.reset_index(drop=True, inplace=True)

# Delete the 'credits' DataFrame to free up memory
del dt_c

dt_m.drop_duplicates(subset=['imdb_id'], inplace=True, ignore_index=True)

  dt_m = pd.read_csv('../data/origin/movies_metadata.zip')


In [4]:
# Define the variables
v = 'vote_count'  # Vote count column
m = 'vote_count.quantile(0.85)'  # Quantile of vote count
R = 'vote_average'  # Vote average column
C = 'vote_average.mean()'  # Mean of vote average

# Evaluate the score using the defined variables and assign it to a new column 'score'
dt_m.eval(f'score = ({v}/({v}+{m}) * {R}) + ({m}/({m}+{v}) * {C})', inplace=True)

del v, m, R, C

<div></div> 

## Pré-Processamento de Texto

<div></div> 


### Remoção de palavras e transformação de minúsculos

In [5]:
dt_m['p_overview'] = dt_m['overview'].replace(r'([^\w\s]|\d+)', ' ', regex=True)

# Aplicando as funções str.lower() e str.strip() simultaneamente
dt_m['p_overview'] = dt_m['p_overview'].apply(lambda x: x.lower().strip() if isinstance(x, str) else x)


In [6]:
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.remove_stopwords)
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.lemmatize_text)
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.word_tokenize)

In [7]:
dt_m.query('p_overview.str.len() > 0', inplace=True)
dt_m.reset_index(drop=True, inplace=True)

### TF-IDF from Corpus

In [8]:
from gensim import corpora
from gensim import models
import numpy as np

data = dt_m['p_overview'].to_list()

# Create a dictionary based on the 'p_overview' data
dictionary = corpora.Dictionary(data)

# Convert the data into Bag of Words (BoW) representation
bow_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in data]

# Word weight in Bag of Words corpus
word_weight = []

# Iterate over each document in the BoW corpus
for doc in bow_corpus:
    # Extract the word ID and frequency for each word in the document
    for id, freq in doc:
        word_weight.append([dictionary[id], freq])

# Create a TF-IDF model based on the BoW corpus
tfIdf = models.TfidfModel(bow_corpus, smartirs='nfc')

# TF-IDF Word Weight
weight_tfidf = []
# Iterate over each document in the TF-IDF representation
for doc in tfIdf[bow_corpus]:
    # Extract the word ID and TF-IDF weight for each word in the document
    for id, freq in doc:
        weight_tfidf.append([dictionary[id], np.around(freq, decimals=3)])

# Identify words to remove based on their TF-IDF weight
remove = pd.Series([x[0] for x in weight_tfidf if x[1] <= 0.04]).unique()
remove = set(remove)

# Filter out the words to be removed from the 'p_overview' column of dt_m
dt_m['p_overview'] = dt_m['p_overview'].map(lambda words: [x for x in words if x not in remove])


# Using a Model for Information Retrieval

In [12]:
from gensim.models import KeyedVectors

# Load pre-trained word vectors using Word2Vec format
model = KeyedVectors.load_word2vec_format('../models/numberbatch-en.txt')

# Create a set of vocabulary words from the loaded model
index_set = set(model.index_to_key)

# Filter out words in 'p_overview' that are not present in the vocabulary set
dt_m['p_overview'] = dt_m['p_overview'].map(lambda words: [w for w in words if w in index_set])


In [91]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

query = dt_m['p_overview'][0]

query_vector = sum(model[word] for word in query) / len(query)

dt_m['docs_embed'] = dt_m['p_overview'].map(lambda doc: sum(model[word] for word in doc) / len(doc))

similarities = dt_m['docs_embed'].map(lambda doc: cosine_similarity(query_vector.reshape(1, -1) , doc.reshape(1, -1))).sort_values(ascending=False)[:100]

sim = {i: v[0][0] for i, v in zip(similarities.index, similarities.values)}


In [96]:
dt_m.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,title,video,vote_average,vote_count,cast,crew,score,p_overview,docs,docs_embed
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.500085,"[led, woody, andy, toys, happily, room, until,...","[-0.066558816, -0.06601177, 0.03052059, 0.0370...","[-0.066558816, -0.06601177, 0.03052059, 0.0370..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",6.764869,"[siblings, judy, peter, enchanted, board, game...","[-0.050329026, -0.057687096, 0.032664515, 0.01...","[-0.050329026, -0.057687096, 0.032664515, 0.01..."


In [107]:
dt_m.iloc[list(sim.keys()),:].sort_values(by=['score', 'popularity'], ascending=False)[:10]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,title,video,vote_average,vote_count,cast,crew,score,p_overview,docs,docs_embed
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.500085,"[led, woody, andy, toys, happily, room, until,...","[-0.066558816, -0.06601177, 0.03052059, 0.0370...","[-0.066558816, -0.06601177, 0.03052059, 0.0370..."
6134,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,Toy Story 3,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.393455,"[woody, buzz, rest, andy, toys, haven, played,...","[-0.05681363, -0.065113634, 0.039022733, 0.052...","[-0.05681363, -0.065113634, 0.039022733, 0.052..."
2329,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story-2,863.0,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,Toy Story 2,False,7.3,3914.0,"[{'cast_id': 18, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8025073', 'de...",7.122641,"[andy, heads, cowboy, camp, leaving, toys, dev...","[-0.053141665, -0.04976667, 0.03131111, 0.0481...","[-0.053141665, -0.04976667, 0.03131111, 0.0481..."
5937,False,,7500000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://500days.com,19913.0,tt1022603,en,(500) Days of Summer,"Tom (Joseph Gordon-Levitt), greeting-card writ...",...,(500) Days of Summer,False,7.2,2993.0,"[{'cast_id': 4, 'character': 'Tom Hansen', 'cr...","[{'credit_id': '52fe47f99251416c750abaa5', 'de...",7.007609,"[tom, joseph, gordon, levitt, greeting, card, ...","[-0.059767745, -0.083354846, 0.019109681, -0.0...","[-0.059767745, -0.083354846, 0.019109681, -0.0..."
2230,False,,70000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 16, 'na...",,856.0,tt0096438,en,Who Framed Roger Rabbit,'Toon star Roger is worried that his wife Jess...,...,Who Framed Roger Rabbit,False,7.2,1466.0,"[{'cast_id': 17, 'character': 'Eddie Valiant',...","[{'credit_id': '52fe4282c3a36847f80249a9', 'de...",6.889856,"[toon, star, roger, worried, jessica, playing,...","[-0.04540769, -0.07091923, 0.020792305, 0.0202...","[-0.04540769, -0.07091923, 0.020792305, 0.0202..."
847,False,,1500000,"[{'id': 18, 'name': 'Drama'}]",,221.0,tt0048545,en,Rebel Without a Cause,"After moving to a new town, troublemaking teen...",...,Rebel Without a Cause,False,7.6,351.0,"[{'cast_id': 6, 'character': 'Jim Stark', 'cre...","[{'credit_id': '52fe4228c3a36847f8008749', 'de...",6.740996,"[moving, town, troublemaking, teen, jim, stark...","[-0.05226364, -0.07014319, 0.0114068175, 0.007...","[-0.05226364, -0.07014319, 0.0114068175, 0.007..."
1461,False,"{'id': 10453, 'name': 'Poltergeist Collection'...",10700000,"[{'id': 27, 'name': 'Horror'}]",,609.0,tt0084516,en,Poltergeist,"Steve Freeling lives with his wife, Diane, and...",...,Poltergeist,False,7.1,811.0,"[{'cast_id': 21, 'character': 'Steve Freeling'...","[{'credit_id': '52fe425dc3a36847f8018967', 'de...",6.735367,"[steve, diane, children, dana, robbie, carol, ...","[-0.037136108, -0.06258333, 0.015158336, 0.033...","[-0.037136108, -0.06258333, 0.015158336, 0.033..."
693,False,,325000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,3078.0,tt0025316,en,It Happened One Night,Ellie Andrews has just tied the knot with soci...,...,It Happened One Night,False,7.7,283.0,"[{'cast_id': 10, 'character': 'Peter Warne', '...","[{'credit_id': '52fe4383c3a36847f8059c33', 'de...",6.718043,"[ellie, andrews, tied, knot, society, aviator,...","[-0.047841664, -0.07986041, 0.03252917, 0.0334...","[-0.047841664, -0.07986041, 0.03252917, 0.0334..."
3500,False,,22000000,"[{'id': 18, 'name': 'Drama'}]",,10950.0,tt0277027,en,I Am Sam,Sam has the mental capacity of a 7-year-old. H...,...,I Am Sam,False,7.2,542.0,"[{'cast_id': 1, 'character': 'Sam Dawson', 'cr...","[{'credit_id': '52fe43d89251416c7502045d', 'de...",6.707429,"[sam, mental, capacity, homeless, abandons, le...","[-0.06321563, -0.06864375, -0.0142250005, 0.02...","[-0.06321563, -0.06864375, -0.0142250005, 0.02..."
2559,False,,30000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,243.0,tt0146882,en,High Fidelity,When record store owner Rob Gordon gets dumped...,...,High Fidelity,False,7.0,636.0,"[{'cast_id': 7, 'character': 'Rob Gordon', 'cr...","[{'credit_id': '52fe422cc3a36847f8009919', 'de...",6.658839,"[record, store, owner, rob, gordon, dumped, gi...","[-0.060336668, -0.08761667, -0.0049099987, 0.0...","[-0.060336668, -0.08761667, -0.0049099987, 0.0..."


In [None]:
def find_similar(query, g_model): 

    query_vector = sum(g_model[word] for word in query) / len(query)

    

In [77]:
dt_m['docs'].map(lambda doc: cosine_similarity(query_vector.reshape(1, -1) , doc.reshape(1, -1))).sort_values(ascending=False)[:100]

0              [[1.0]]
6134    [[0.70630527]]
2329    [[0.69957006]]
5154     [[0.6512627]]
5093     [[0.6403525]]
             ...      
519      [[0.5755236]]
2836    [[0.57542545]]
2076     [[0.5753243]]
6461      [[0.574911]]
7017     [[0.5746087]]
Name: docs, Length: 100, dtype: object

In [67]:
similarities = cosine_similarity(query_vector.reshape(1, -1), document_vectors[10].reshape(1, -1))

In [None]:

# Calculate cosine similarity between query vector and document vectors
similarities = cosine_similarity(query_vector.reshape(1, -1), document_vectors)

# Get the indices of the most similar documents
most_similar_indices = similarities.argsort()[0][::-1]

# Retrieve the most similar documents
most_similar_documents = [corpus[i] for i in most_similar_indices]


NameError: name 'document_vectors' is not defined