# Information retrieval for movies recommendation

Database which the project it's based on:   
[HBO Max](https://www.kaggle.com/datasets/dgoenrique/hbo-max-movies-and-tv-shows)  

<div></div> 

In [1]:
import numpy as np 
import pandas as pd
from toolbox import preprocessing

In [2]:
# Load the "rating" and "link" with the id from multiple sources
ratings = pd.read_csv('../data/tmdb/ratings.zip')
links = pd.read_csv('../data/tmdb/links.zip')

# Inner join between both files
ratings = ratings.merge(links, how='inner', on='movieId')

# Select only the movies with at least 750 reviews, to classify as popular enough for recommendation
pop_movies = ratings['movieId'].value_counts().to_frame().query('count > 750').index
pop_movies = links.query('movieId in @pop_movies')['tmdbId'].dropna()

del ratings, links

<div></div> 

## Leitura dos Arquivos

As bases vieram em formato CSV, portanto, só foi utilizado o pandas para leitura e feito um concat

<div></div> 

In [3]:
# Load the 'credits' dataset from a zipped CSV file
dt_c = pd.read_csv('../data/tmdb/credits.zip')

# Load the 'movies_metadata' dataset from a zipped CSV file
dt_m = pd.read_csv('../data/tmdb/movies_metadata.zip')

# Convert the 'id' column to numeric data type, ignoring any errors
dt_m['id'] = pd.to_numeric(dt_m['id'], errors='coerce')

# Convert the 'popularity' column to numeric data type, ignoring any errors
dt_m['popularity'] = pd.to_numeric(dt_m['popularity'], errors='coerce')

# Merge the 'movies_metadata' DataFrame with the 'credits' DataFrame based on the 'id' column
dt_m = dt_m.merge(dt_c.set_index('id'), how='left', left_on=['id'], right_index=True)

# Drop rows with missing values in the 'id' column
dt_m.dropna(subset=['id', 'overview'], inplace=True)

# Select the movies with the minimun engagement
dt_m.query('id in @pop_movies', inplace=True)

# Reset index 
dt_m.reset_index(drop=True, inplace=True)

# Delete the 'credits' DataFrame to free up memory
del dt_c

  dt_m = pd.read_csv('../data/tmdb/movies_metadata.zip')


In [4]:
# Define the variables
v = 'vote_count'  # Vote count column
m = 'vote_count.quantile(0.85)'  # Quantile of vote count
R = 'vote_average'  # Vote average column
C = 'vote_average.mean()'  # Mean of vote average

# Evaluate the score using the defined variables and assign it to a new column 'score'
dt_m.eval(f'score = ({v}/({v}+{m}) * {R}) + ({m}/({m}+{v}) * {C})', inplace=True)

In [5]:
# Top ranking by normalized score
dt_m[['title', 'vote_average', 'score']].sort_values(by='score', ascending=False).head(10)

Unnamed: 0,title,vote_average,score
231,The Shawshank Redemption,8.5,8.195274
3624,The Dark Knight,8.3,8.103938
535,The Godfather,8.5,8.101349
1753,Fight Club,8.3,8.058688
216,Pulp Fiction,8.3,8.034883
3868,Inception,8.1,7.946423
262,Forrest Gump,8.2,7.936551
4169,Interstellar,8.1,7.911596
731,The Empire Strikes Back,8.2,7.861528
2976,The Lord of the Rings: The Return of the King,8.1,7.854521


In [6]:
# Top ranking by popularity
dt_m[['title', 'popularity', 'score']].sort_values(by='popularity', ascending=False).head(10)

Unnamed: 0,title,popularity,score
4283,Minions,547.488298,6.437074
4218,Big Hero 6,213.849907,7.553117
4249,Deadpool,187.860492,7.298742
4250,Guardians of the Galaxy Vol. 2,185.330992,7.346307
3812,Avatar,185.070892,7.126373
4213,John Wick,183.870374,6.901203
4198,Gone Girl,154.801009,7.623994
4223,The Hunger Games: Mockingjay - Part 1,147.098006,6.589483
4251,Captain America: Civil War,145.882135,7.005543
216,Pulp Fiction,140.950236,8.034883


<div></div> 

## Pré-Processamento de Texto

Para minimizar possíveis gargalos de processamento e identificação dos termos relevantes, é realizada a remoção de ruídos utilizando regex. Em seguida, é aplicada a tokenização, que consiste na transformação do texto em uma lista de palavras, a fim de possibilitar a aplicação das técnicas de TF-IDF em um modelo vetorial.

Além disso, foram feitas alguns processsos adicionais para possibilitar o processamento sem erros

<div></div> 


### Tratamento dos gêneros

### Remoção de palavras e transformação de minúsculos

In [7]:
dt_m['p_overview'] = dt_m['overview'].replace(r'[^\w\s]', '', regex=True)

# Aplicando as funções str.lower() e str.strip() simultaneamente
dt_m['p_overview'] = dt_m['p_overview'].apply(lambda x: x.lower().strip() if isinstance(x, str) else x)


In [8]:
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.remove_stopwords)
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.lemmatize_text)
dt_m['p_overview'] = dt_m['p_overview'].map(preprocessing.word_tokenize)

### Identificação das query / docs

Foi feita uma separação do index das query, para pode fazer uma localização do na base origina após o TF-IDF, dado que o TF-IDF reseta os index dos termos por documento

In [38]:
from gensim.models import Word2Vec

model = Word2Vec(min_count=0, 
                 vector_size=100,
                 workers=12,
                 window=5,
                 sg=1,
                 seed=42, 
                 epochs=150
                 )

In [39]:
model.build_vocab(dt_m['p_overview'].to_list())
model.train(dt_m['p_overview'], total_examples=model.corpus_count, epochs=model.epochs)

(25712508, 33569250)

In [40]:
words = model.wv

In [12]:
dt_m.query('title.str.contains("Toy Story")')

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,crew,score,p_overview
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",7.444365,"[led, by, woody, andys, toys, live, happily, i..."
1843,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story-2,863.0,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,Released,The toys are back!,Toy Story 2,False,7.3,3914.0,"[{'cast_id': 18, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8025073', 'de...",7.087499,"[andy, heads, off, to, cowboy, camp, leaving, ..."
3859,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://disney.go.com/toystory/,10193.0,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,Released,No toy gets left behind.,Toy Story 3,False,7.6,4710.0,"[{'cast_id': 6, 'character': 'Woody (voice)', ...","[{'credit_id': '5770143fc3a3683733000f3a', 'de...",7.340308,"[woody, buzz, and, the, rest, of, andys, toys,..."


In [35]:
for x in dt_m.query('overview.str.contains("Woody")')['overview']: 
    print(x)

Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Comic artist and writer Woody performs a simple courier operation for his friend Harry who works for the CIA. But when he successfully fends off hostile agents, he earns the respect of the beautiful Natalia, who requests his assistance for her defection. Woody uses this request as leverage to use the CIA's resources to bring his comic book creation, Condorman, to life to battle the evil Krokov.
Robert Redford stars as billionaire John Gage, who offers a down-on-his-luck yuppie husband (Woody Harrelson) $1 million for the opportunity to spend the night with the man's wife (Demi Moore).
Based on the graphic novel by James Jones, The Thin Red Line tells the story of a group of men, an Army 

In [37]:
words.most_similar(positive=['toys'])

[('daycare', 0.6555206775665283),
 ('devices', 0.6473174095153809),
 ('professionalism', 0.6464580297470093),
 ('frost', 0.6407884955406189),
 ('eternity', 0.6246908903121948),
 ('haine', 0.6213828921318054),
 ('favorites', 0.62006014585495),
 ('mount', 0.6198017001152039),
 ('topdog', 0.6173060536384583),
 ('darker', 0.617026150226593)]

In [74]:
dt_m['p_overview'].map(lambda x: words.doesnt_match(x))

0           learns
1          sibling
2        meanwhile
3       determined
4             like
           ...    
4334        around
4335            90
4336       working
4337       attempt
4338        future
Name: p_overview, Length: 4339, dtype: object

In [66]:
dt_m['p_overview'].map(lambda desc: model.wv[desc].mean(axis=0))

KeyError: "Key 'lightyear' not present"

In [57]:
model.wv.most_similar('woody', topn=10)

[('harrelson', 0.7445396184921265),
 ('clooney', 0.6231038570404053),
 ('nolte', 0.6213560700416565),
 ('buzz', 0.6194430589675903),
 ('behalf', 0.5979752540588379),
 ('olivier', 0.5850128531455994),
 ('andys', 0.5828583836555481),
 ('regina', 0.5733492374420166),
 ('outspoken', 0.5582083463668823),
 ('performs', 0.5475900173187256)]

In [43]:
model.wv.most_similar('', topn=10)

[('debo', 0.6056615114212036),
 ('asquad', 0.5960063934326172),
 ('substitute', 0.5738646984100342),
 ('classroom', 0.5593579411506653),
 ('calm', 0.5526667833328247),
 ('pretending', 0.5499388575553894),
 ('rematch', 0.5471845269203186),
 ('mishap', 0.5470882058143616),
 ('bronx', 0.5457514524459839),
 ('wrath', 0.5406851172447205)]

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [None]:
model = Word2Vec.load("word2vec.model")
model.train([["hello", "world"]], total_examples=1, epochs=1)

(0, 2)

In [None]:
vector = model.wv['computer']  # get numpy vector of a word
sims = model.wv.most_similar('computer', topn=10)  # get other similar words

In [None]:
sims

[('system', 0.21617139875888824),
 ('survey', 0.04468922317028046),
 ('interface', 0.015203381888568401),
 ('time', 0.0019510635174810886),
 ('trees', -0.03284316882491112),
 ('human', -0.07424270361661911),
 ('response', -0.09317591041326523),
 ('graph', -0.09575342386960983),
 ('eps', -0.10513808578252792),
 ('user', -0.16911619901657104)]

In [None]:
vector

array([-0.00515774, -0.00667028, -0.0077791 ,  0.00831315, -0.00198292,
       -0.00685696, -0.0041556 ,  0.00514562, -0.00286997, -0.00375075,
        0.0016219 , -0.0027771 , -0.00158482,  0.0010748 , -0.00297881,
        0.00852176,  0.00391207, -0.00996176,  0.00626142, -0.00675622,
        0.00076966,  0.00440552, -0.00510486, -0.00211128,  0.00809783,
       -0.00424503, -0.00763848,  0.00926061, -0.00215612, -0.00472081,
        0.00857329,  0.00428459,  0.0043261 ,  0.00928722, -0.00845554,
        0.00525685,  0.00203994,  0.0041895 ,  0.00169839,  0.00446543,
        0.0044876 ,  0.0061063 , -0.00320303, -0.00457706, -0.00042664,
        0.00253447, -0.00326412,  0.00605948,  0.00415534,  0.00776685,
        0.00257002,  0.00811905, -0.00138761,  0.00808028,  0.0037181 ,
       -0.00804967, -0.00393476, -0.0024726 ,  0.00489447, -0.00087241,
       -0.00283173,  0.00783599,  0.00932561, -0.0016154 , -0.00516075,
       -0.00470313, -0.00484746, -0.00960562,  0.00137242, -0.00