# Information retrieval for movie recommendation

Dataset on which the project is based:  
[HBO Max](https://www.kaggle.com/datasets/dgoenrique/hbo-max-movies-and-tv-shows)  

<div></div> 

In [1]:
import numpy as np 
import pandas as pd
from toolbox import preprocessing
import ast

In [2]:
# Load the tmdbId list of "popular" movies, using the cached file when it exists
try:
    pop_movies = pd.read_csv('../data/modified/popular_movies.csv.zip')['tmdbId']
except (FileNotFoundError, KeyError):
    # Cache is missing (or lacks the expected column): rebuild it from the raw files.
    # "ratings" and "links" are joined on movieId; "links" supplies the tmdbId.
    ratings = pd.read_csv('../data/origin/ratings.zip')
    links = pd.read_csv('../data/origin/links.zip')

    # Inner join keeps only the ratings whose movie has an entry in "links"
    ratings = ratings.merge(links, how='inner', on='movieId')

    # Keep only the movies with more than 50 ratings, i.e. popular enough for
    # recommendation. The counts Series is filtered directly so the code does
    # not depend on the column name produced by value_counts().to_frame(),
    # which differs between pandas versions.
    rating_counts = ratings['movieId'].value_counts()
    popular_ids = rating_counts[rating_counts > 50].index
    pop_movies = links.loc[links['movieId'].isin(popular_ids), 'tmdbId'].dropna()

    # Cache the result so the next run can skip the rebuild
    pop_movies.to_csv('../data/modified/popular_movies.csv.zip', index=False, compression='zip')

    del ratings, links

<div></div> 

## Leitura dos Arquivos

As bases vieram em formato CSV (compactadas em ZIP); portanto, foi utilizado apenas o pandas para a leitura, seguido de um merge entre as tabelas.

<div></div> 

In [3]:
# Read the raw 'credits' and 'movies_metadata' tables from their zipped CSV files
dt_c = pd.read_csv('../data/origin/credits.zip')
dt_m = pd.read_csv('../data/origin/movies_metadata.zip')

# Coerce 'id' and 'popularity' to numeric; values that cannot be parsed become NaN
for numeric_col in ('id', 'popularity'):
    dt_m[numeric_col] = pd.to_numeric(dt_m[numeric_col], errors='coerce')

# Left-join the credits columns onto the metadata rows by 'id'
dt_m = dt_m.merge(dt_c.set_index('id'), how='left', left_on=['id'], right_index=True)

# Keep rows that have both an 'id' and an 'overview', restrict them to the
# movies with the minimum engagement, and renumber the index from zero
dt_m = (
    dt_m
    .dropna(subset=['id', 'overview'])
    .query('id in @pop_movies')
    .reset_index(drop=True)
)

# The credits table is no longer needed after the merge; free its memory
del dt_c

  dt_m = pd.read_csv('../data/origin/movies_metadata.zip')


In [5]:
# Discard columns that are not used further; names that are absent are skipped
dt_m = dt_m.drop(columns=['homepage', 'original_title', 'poster_path'], errors='ignore')

In [13]:
def _leading_cast_tokens(raw_cast):
    """Return name tokens for the first five entries of a serialized cast list.

    Each name is lower-cased and its spaces are replaced by underscores.
    """
    members = ast.literal_eval(raw_cast)
    return [member.get('name').lower().replace(' ', '_') for member in members[:5]]


dt_m['p_cast'] = dt_m['cast'].map(_leading_cast_tokens)

In [26]:
def get_director(string):
    """Return the normalized name of the first director found in a crew list.

    Args:
        string: Python-literal text of a list of crew dicts; entries are
            looked up by their 'job' and 'name' keys.

    Returns:
        The director's name lower-cased with spaces replaced by underscores,
        or None when the input is not a string or no entry has both a
        'director' job and a non-empty name.
    """
    # Non-string input (e.g. NaN for a row without credits) cannot be parsed
    if not isinstance(string, str):
        return None

    for member in ast.literal_eval(string):
        # `or ''` guards against entries whose 'job' is absent or None
        if (member.get('job') or '').lower() == 'director':
            name = member.get('name')
            if name:
                return name.lower().replace(' ', '_')

    return None

# Derive each movie's director token from its serialized crew list
dt_m['director'] = dt_m['crew'].apply(get_director)

In [27]:
def _genre_tokens(raw_genres):
    """Return the genre names of a serialized genre list as normalized tokens.

    Each name is lower-cased and its spaces are replaced by underscores.
    """
    entries = ast.literal_eval(raw_genres)
    return [entry.get('name').lower().replace(' ', '_') for entry in entries]


dt_m['p_genres'] = dt_m['genres'].map(_genre_tokens)

In [45]:
# Wrap each director token in a list so it can be concatenated with the other
# token lists. A missing director yields an empty list instead of [None], so
# no None token ends up in the documents used to train the embedding model.
dt_m['p_director'] = dt_m['director'].map(
    lambda director: [director] if isinstance(director, str) else []
)

In [54]:
# Build each movie's token document: cast tokens, then genre tokens, then the director token
dt_m['docs'] = dt_m['p_cast'] + dt_m['p_genres'] + dt_m['p_director']

### Identificação das query / docs

Foi feita uma separação dos índices das queries, para poder fazer a localização na base original após o TF-IDF, dado que o TF-IDF reseta os índices dos termos por documento.

In [55]:
import multiprocessing as mp
from gensim.models import Word2Vec

In [57]:
# Per-movie token lists, used below as the Word2Vec training corpus
docs = dt_m.loc[:, 'docs']

In [190]:
# Skip-gram (sg=1) Word2Vec model trained on the per-movie token documents.
# NOTE: per the gensim documentation, `seed` alone does not make a run fully
# reproducible when more than one worker thread is used.
model = Word2Vec(min_count=1,
                 window=3,
                 vector_size=30,
                 alpha=0.03,
                 negative=5,
                 # Leave one core free, but never request fewer than one
                 # worker (cpu_count() - 1 is 0 on a single-core machine)
                 workers=max(1, mp.cpu_count() - 1),
                 seed=42,
                 sg=1)

# The vocabulary must be built before training can start
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=50, report_delay=1)


(2374647, 3032250)

In [191]:
# Despite its name, `remover` holds the tokens to KEEP: the model's vocabulary
remover = set(model.wv.index_to_key)

# Single pass: retain only tokens known to the model that are not None
dt_m['docs'] = dt_m['docs'].map(
    lambda tokens: [token for token in tokens if token in remover and token is not None]
)

In [192]:
def _mean_embedding(tokens):
    """Return the average word vector of *tokens*, or a zero vector if there are none."""
    if not tokens:
        # An empty document would otherwise raise ZeroDivisionError below
        return np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)
    return sum(model.wv[token] for token in tokens) / len(tokens)


# One fixed-size embedding per movie: the mean of its token vectors
dt_m['embed_docs'] = dt_m['docs'].map(_mean_embedding)

In [193]:
from sklearn.metrics.pairwise import cosine_similarity

In [194]:
# Embedding of the movie at index label 0, shaped as a single-row matrix
query_vector = dt_m['embed_docs'][0].reshape(1, -1)

# Cosine similarity between that movie and every movie, highest first
similarity = (
    dt_m['embed_docs']
    .map(lambda vector: cosine_similarity(query_vector, vector.reshape(1, -1)))
    .sort_values(ascending=False)
)
similarity

0        [[0.9999998]]
2333    [[0.98740584]]
6148    [[0.96926284]]
5533     [[0.9664703]]
1270     [[0.9592312]]
             ...      
4899    [[0.15361409]]
4513    [[0.15361409]]
719     [[0.15335658]]
5316    [[0.13914518]]
3609    [[0.13711804]]
Name: embed_docs, Length: 7142, dtype: object

In [189]:
def _average_vector(tokens):
    """Return the element-wise mean of the model's word vectors for *tokens*."""
    total = sum(model.wv[token] for token in tokens)
    return total / len(tokens)


# Preview the per-movie mean embeddings without storing them
dt_m['docs'].map(_average_vector)

0       [0.03979972, -0.35979572, -0.24217868, 0.03807...
1       [0.037879337, -0.3519118, -0.23652212, 0.03519...
2       [0.027425002, -0.32364863, -0.20734666, 0.0291...
3       [0.043172404, -0.33960414, -0.22528216, 0.0378...
4       [0.05631381, -0.30987737, -0.2057415, 0.054088...
                              ...                        
7137    [0.05709718, -0.40053213, -0.26335394, 0.04953...
7138    [0.056247674, -0.40096593, -0.25617376, 0.0453...
7139    [0.056247674, -0.40096593, -0.25617376, 0.0453...
7140    [0.038822003, -0.32833073, -0.20914164, 0.0373...
7141    [0.038822003, -0.32833073, -0.20914164, 0.0373...
Name: docs, Length: 7142, dtype: object

In [201]:
# Persist the trained Word2Vec model to disk
model_path = '../models/metadata_model.txt'
model.save(model_path)