# Modelizar y crear función de recomendación

### Importar librerias

In [1]:
import pandas as pd
import numpy as np


### Cargar Dataset

In [2]:
#Importo dataset
df = pd.read_parquet('datasets_ml.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32184 entries, 0 to 32183
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            32184 non-null  int64  
 1   title         32184 non-null  object 
 2   release_year  32184 non-null  int32  
 3   genres        30543 non-null  object 
 4   tagline       17914 non-null  object 
 5   overview      32184 non-null  object 
 6   vote_average  32184 non-null  float64
 7   actores       30295 non-null  object 
 8   director      31500 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(6)
memory usage: 2.1+ MB


#### Eliminar valores nulos

In [3]:
df = df[df['genres'].notna()]
df = df[df['actores'].notna()]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29216 entries, 0 to 32183
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            29216 non-null  int64  
 1   title         29216 non-null  object 
 2   release_year  29216 non-null  int32  
 3   genres        29216 non-null  object 
 4   tagline       17378 non-null  object 
 5   overview      29216 non-null  object 
 6   vote_average  29216 non-null  float64
 7   actores       29216 non-null  object 
 8   director      28946 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(6)
memory usage: 2.1+ MB


In [5]:
df.columns

Index(['id', 'title', 'release_year', 'genres', 'tagline', 'overview',
       'vote_average', 'actores', 'director'],
      dtype='object')

#### Seleccionar peliculas por año

Como en el EDA, hay un incremento en las películas registradas en el dataset a partir de 1990. Intenté reducir en ese tamaño el dataset pero generó una matriz muy grande. Asi que reduje el tamaño tomando peliculas más modernas.
2015 es el año en que nació mi tercer hijo, asi que considero que podría ser un recomendador para él.

In [6]:
mask = df['release_year'] > 2014
df =  df[mask]

#### Unir características útiles

In [7]:
#Concateno informacion a usar
df['features'] = df['genres'].str.cat([df['overview'], df['director']], sep=' ')

In [8]:
df['features'].info()

<class 'pandas.core.series.Series'>
Index: 2563 entries, 29398 to 32183
Series name: features
Non-Null Count  Dtype 
--------------  ----- 
2536 non-null   object
dtypes: object(1)
memory usage: 40.0+ KB


In [9]:
df

Unnamed: 0,id,title,release_year,genres,tagline,overview,vote_average,actores,director,features
29398,341051,deep dark,2015,horror,,fantasi fail sculptor discov strang talk hole ...,5.0,sean mcgrath denis poirier ann sorc tabor helt...,michael medaglia,horror fantasi fail sculptor discov strang tal...
29399,353406,manoman,2015,drama anim,,gleefulli anarch tour forc puppetri primal scr...,7.0,gordon peardon,simon cartwright,drama anim gleefulli anarch tour forc puppetri...
29400,380013,nobody walks in l.a.,2015,romanc comedi drama,,nobodi walk stori two old friend face decis wa...,9.5,adam shapiro kim shaw peter breitmay,jess shapiro,romanc comedi drama nobodi walk stori two old ...
29402,322266,buddy hutchins,2015,thriller,bad day bad day,busi shambl mount crimin record cheat wife tod...,3.3,jami kennedi salli kirkland sara malakul lane ...,jare cohn,thriller busi shambl mount crimin record cheat...
29403,382651,helix,2015,action mysteri scienc fiction,everyth chang,low level cop aiden magnusson solv infam crime...,4.9,robert duncan marc petey david stuart rhonda d...,eric petey,action mysteri scienc fiction low level cop ai...
...,...,...,...,...,...,...,...,...,...,...
32179,412059,mobile homes,2018,drama,,forgotten town along american border young mot...,0.0,imogen poot callum turner callum keith renni f...,vladimir fontenay,drama forgotten town along american border you...
32180,332283,mary shelley,2018,drama romanc,,love affair poet perci shelley year old mari w...,0.0,ell fan dougla booth bel powley maisi william ...,haifaa mansour,drama romanc love affair poet perci shelley ye...
32181,38700,bad boys for life,2018,thriller action crime,,continu adventur miami detect mike lowrey marc...,0.0,smith martin lawrenc derrick gilbert,joe carnahan,thriller action crime continu adventur miami d...
32182,299782,the other side of the wind,2018,comedi drama,,orson well unfinish masterpiec restor assembl ...,0.0,john huston robert random peter bogdanovich su...,orson well,comedi drama orson well unfinish masterpiec re...


#### Seleccionamos los campos a utilizar para entrenar

In [10]:
df = df[['id','title','features']]

ELiminar datos nulos

In [11]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2536 entries, 29398 to 32183
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2536 non-null   int64 
 1   title     2536 non-null   object
 2   features  2536 non-null   object
dtypes: int64(1), object(2)
memory usage: 79.2+ KB


### Entrenamiento del modelo

Utilizo un modelo TF-IDF para vectorizar los textos de las características seleccionadas. Creando una matriz de vectores para utilizar en el cálculo de similitud del coseno

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Crea un vectorizador TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Ajusta el vectorizador a la columna 'overview' y transforma los datos
tfidf_matrix = vectorizer.fit_transform(df['features'])

# Calcula la similitud coseno entre los vectores
similarity_matrix = cosine_similarity(tfidf_matrix)



In [14]:
# Define una función para obtener las películas similares a una película dada
def get_similar_movies(movie, N=5):

    #Poner en minúsculas
    movie = movie.lower()
    # Buscar la película en la base de datos
    movie_id = df.loc[df['title'] == movie, 'id'].values[0]
    # Obtiene el índice de la película en la matriz de similitud
    idx = np.where(df['id'] == movie_id)[0][0]
    
    # Obtiene las puntuaciones de similitud para la película
    scores = list(enumerate(similarity_matrix[idx]))
    
    # Ordena las puntuaciones en orden descendente
    scores.sort(key=lambda x: x[1], reverse=True)
    
    # Obtiene las películas similares
    similar_movies = [df.iloc[i,1] for i, _ in scores[1:N+1]]
    
    return similar_movies


In [15]:
# Ejemplo de uso:
# movie = 'avatar' 
# int(df['id'][df['title'] == movie])
# df.loc[df['title'] == movie, 'id']
# df.loc[df['title'] == movie, 'id'].values[0]

In [16]:
# Ejemplo de uso:
movie = 'avatar 2'  # Reemplaza con el ID de la película para la que deseas obtener recomendaciones
similar_movies = get_similar_movies(movie)

print(similar_movies)

['doctor strange', 'thor: ragnarok', 'maximum ride', 'justice league dark', 'team thor']


In [17]:
df.to_parquet('ml.parquet',engine='pyarrow')

In [18]:
import pickle

with open('data.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)