## **1 - Librerias y frameworks**

In [1]:
#Selección de features, resize y preprocesamiento
import zipfile
import gzip
import io
import pandas as pd
import ast
import json
import math
import re
from unidecode import unidecode
pd.set_option('display.max_columns', 200)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

#Para el modelo
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib

In [2]:
#Descargar stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abrahan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **2 - Cargar el dataframe (Probablemente en el etl deba simplificar este dataframe)**

In [3]:
def zip_to_dataframe(zip_file_path, csv_file_name):
    """
    Lee un archivo CSV dentro de un archivo ZIP y lo devuelve como un DataFrame de Pandas.
    Args:
        zip_file_path (str): Ruta del archivo ZIP.
        csv_file_name (str): Nombre del archivo CSV dentro del ZIP.
    Returns:
        pd.DataFrame: DataFrame que contiene los datos del archivo CSV.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        with zip_file.open(csv_file_name) as csv_file:
            with gzip.open(csv_file, 'rb') as gz_file:
                content = gz_file.read()

    dataframe = pd.read_csv(io.BytesIO(content), encoding='ISO-8859-1')
    return dataframe

In [4]:
df = zip_to_dataframe('df.zip','df.csv')

## **3 - Transformaciones para el modelo**

### Belong to collection

In [5]:
#transformar a diccionarios:
df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {'data': None})

La mayor cantidad de belong to collection esta vacia, probaré dejandola a ver como quedan los vectores, solo voy a usar 'name':'

In [6]:
df['belongs_to_collection'][0]

{'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}

Defino una nueva columna útil para el modelo

In [7]:
df['collection'] = df['belongs_to_collection'].apply(lambda x: x['name'] if isinstance(x, dict) and 'name' in x else '')

### Genres

In [8]:
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {'data': None})

In [9]:
df['genres'][0]

{'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'}]}

Defino una nueva columna útil para el modelo

In [10]:
df['joined_genres'] = df['genres'].apply(lambda x: ' '.join([genre['name'] for genre in x['genres']]) if isinstance(x, dict) and 'genres' in x else '')

### Production Companies

In [11]:
df['production_companies'] = df['production_companies'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {'data': None})

In [12]:
df['production_companies'][0]

{'production_companies': [{'name': 'Pixar Animation Studios', 'id': 3}]}

Defino una nueva columna útil para el modelo

In [13]:
df['joined_pc'] = df['production_companies'].apply(lambda x: ' '.join([production_companies['name'] for production_companies in x['production_companies']]) if isinstance(x, dict) and 'production_companies' in x else '')

### Cast


Sólo tomaré los nombres de los actores

In [14]:
df['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

Defino una nueva columna útil para el modelo

In [15]:
df['cast_names'] = df['cast'].apply(lambda x: ' '.join([actor['name'] for actor in ast.literal_eval(x)]) if isinstance(x, str) else '')

In [16]:
df['cast_names'][0]

'Tom Hanks Tim Allen Don Rickles Jim Varney Wallace Shawn John Ratzenberger Annie Potts John Morris Erik von Detten Laurie Metcalf R. Lee Ermey Sarah Freeman Penn Jillette'

### Crew


Solo tomaré el director

In [17]:
def convert_to_json(texto):
    """
    Convierte una cadena de texto en formato JSON a una estructura de datos en Python.
    Args:
        texto (str): El texto en formato JSON a convertir.
    Returns:
        dict or list: La estructura de datos en Python resultante después de la conversión.
    Raises:
        ValueError: Si el contenido no es una cadena de texto válida.
        ValueError: Si el contenido no es una estructura de datos válida o no se puede evaluar correctamente.
    """
    if pd.isna(texto) or (isinstance(texto, float) and math.isnan(texto)):
        return ''  # Reemplazar NaN por cadena vacía
    
    if isinstance(texto, str):
        try:
            estructura_datos = ast.literal_eval(texto)
            if isinstance(estructura_datos, (dict, list)):
                texto_json = json.dumps(estructura_datos)
                objeto_json = json.loads(texto_json)
                return objeto_json
            else:
                raise ValueError("El contenido no es una estructura de datos válida")
        except (SyntaxError, ValueError) as e:
            raise ValueError("El contenido no se puede evaluar correctamente") from e
    else:
        raise ValueError("El contenido no es una cadena de texto")

In [18]:
df['crew'] = df['crew'].apply(convert_to_json)

In [19]:
df['crew'][0]

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

Defino una nueva columna útil para el modelo

In [20]:
df['director'] = ''

# Recorrer cada fila del DataFrame
for index, row in df.iterrows():
    crew = row['crew']
    director_name = ''
    if crew != '':
        for entry in crew:
            if entry['job'] == 'Director':
                director_name = entry['name']
                break
    df.at[index, 'director'] = director_name

In [21]:
df.head(1)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return,cast,crew,collection,joined_genres,joined_pc,cast_names,director
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"{'genres': [{'id': 16, 'name': 'Animation'}, {...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,{'production_companies': [{'name': 'Pixar Anim...,"{'production_countries': [{'iso_3166_1': 'US',...",1995-10-30,373554050.0,81.0,"{'spoken_languages': [{'iso_639_1': 'en', 'nam...",Released,,Toy Story,7.7,5415.0,1995,12.451801,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",Toy Story Collection,Animation Comedy Family,Pixar Animation Studios,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,John Lasseter


## **4 - Redimensionar el df a las features que serán usadas**

In [22]:
df_resize = df[['id','title','overview','release_year','collection','joined_genres','cast_names','director','joined_pc']]

In [23]:
df_resize.head()

Unnamed: 0,id,title,overview,release_year,collection,joined_genres,cast_names,director,joined_pc
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995,Toy Story Collection,Animation Comedy Family,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,John Lasseter,Pixar Animation Studios
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,1995,,Adventure Family Fantasy,Robin Williams Jonathan Hyde Kirsten Dunst Bra...,Joe Johnston,Interscope Communications Teitler Film TriStar...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995,Grumpy Old Men Collection,Comedy Romance,Walter Matthau Jack Lemmon Ann-Margret Sophia ...,Howard Deutch,Lancaster Gate Warner Bros.
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995,,Comedy Drama Romance,Whitney Houston Angela Bassett Loretta Devine ...,Forest Whitaker,Twentieth Century Fox Film Corporation
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995,Father of the Bride Collection,Comedy,Steve Martin Diane Keaton Martin Short Kimberl...,Charles Shyer,Sandollar Productions Touchstone Pictures


Chequer valores vacios ('')

In [24]:
df_resize[df_resize==''].count()

id                   0
title                0
overview             0
release_year         0
collection       40861
joined_genres     2384
cast_names        2349
director           836
joined_pc        11789
dtype: int64

### Guardamos el dataframe

In [25]:
df_resize = df_resize.reset_index() #Es posible que necesite el index luego.

In [26]:
df_resize.head()

Unnamed: 0,index,id,title,overview,release_year,collection,joined_genres,cast_names,director,joined_pc
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995,Toy Story Collection,Animation Comedy Family,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,John Lasseter,Pixar Animation Studios
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...,1995,,Adventure Family Fantasy,Robin Williams Jonathan Hyde Kirsten Dunst Bra...,Joe Johnston,Interscope Communications Teitler Film TriStar...
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995,Grumpy Old Men Collection,Comedy Romance,Walter Matthau Jack Lemmon Ann-Margret Sophia ...,Howard Deutch,Lancaster Gate Warner Bros.
3,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995,,Comedy Drama Romance,Whitney Houston Angela Bassett Loretta Devine ...,Forest Whitaker,Twentieth Century Fox Film Corporation
4,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995,Father of the Bride Collection,Comedy,Steve Martin Diane Keaton Martin Short Kimberl...,Charles Shyer,Sandollar Productions Touchstone Pictures


# **Abajo hay una nota para borrar OJO**

In [27]:
#Guardo en CSV porque lo voy a usar para el entrenamiento
#NOTA PARA BORRAR este era el sample_df_joinedpc.csv (Solo borrar la nota, es para guiarme cuando este armando la función)
df_resize.to_csv('df_resize.csv', index=False)

## **5 - Preprocesamiento**

#### 5.1 - Unir todas las variables para preprocesar el texto y luego crear los vectores.

In [28]:
combined_features = pd.DataFrame()
combined_features['text'] = df_resize['title'].astype(str) + ' ' + df_resize['overview'].astype(str) + ' ' + df_resize['release_year'].astype(str) + ' ' + df_resize['collection'].astype(str) + ' ' + df_resize['joined_genres'].astype(str) + ' ' + df_resize['cast_names'].astype(str) + ' ' + df_resize['director'].astype(str)+ ' ' + df_resize['joined_pc'].astype(str)
combined_features.head(3)

Unnamed: 0,text
0,"Toy Story Led by Woody, Andy's toys live happi..."
1,Jumanji When siblings Judy and Peter discover ...
2,Grumpier Old Men A family wedding reignites th...


#### 5.2 - Limpieza del texto.

In [29]:
def clean_text(text):
    """
    Limpia un texto dado de caracteres no deseados, como saltos de línea, símbolos, menciones a usuarios (@), hashtags (#) y enlaces web.
    Args:
        texto (str): El texto a limpiar.
    Returns:
        str: El texto limpio, sin caracteres no deseados.
    """
    text_without_line_breaks = text.replace("\n", " ")
    lowercase_text = text_without_line_breaks.lower()
    text_without_accents = unidecode(lowercase_text)
    pattern = r'@[\w]+|#\w+|[!,".]|(\b[^\w\s]\b)|\bhttps?\S+\b'
    cleaned_text = re.sub(pattern, "", text_without_accents)
    return cleaned_text

In [30]:
combined_features['cleaned_text'] = combined_features['text'].astype(str).apply(clean_text)
combined_features.head(3)

Unnamed: 0,text,cleaned_text
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...


#### 5.3 - Tokenizar 

In [31]:
regexp = RegexpTokenizer('\w+')
combined_features['token'] = combined_features['cleaned_text'].apply(regexp.tokenize)
combined_features.head(3)

Unnamed: 0,text,cleaned_text,token
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...,"[toy, story, led, by, woody, andys, toys, live..."
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...,"[jumanji, when, siblings, judy, and, peter, di..."
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...,"[grumpier, old, men, a, family, wedding, reign..."


#### 5.4 - Eliminar stopwords:

In [32]:
### Quitar stopwords
combined_features['token_no_stopwords'] = combined_features['token'].apply(lambda x: [item for item in x if item not in stopwords])
combined_features.head(3)

Unnamed: 0,text,cleaned_text,token,token_no_stopwords
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...,"[toy, story, led, by, woody, andys, toys, live...","[toy, story, led, woody, andys, toys, live, ha..."
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...,"[jumanji, when, siblings, judy, and, peter, di...","[jumanji, siblings, judy, peter, discover, enc..."
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...,"[grumpier, old, men, a, family, wedding, reign...","[grumpier, old, men, family, wedding, reignite..."


#### 5.5 - Stemming:

In [33]:
stemmer = PorterStemmer()
combined_features['stemming'] = combined_features['token_no_stopwords'].apply(lambda x: [stemmer.stem(item) for item in x ])
combined_features.head(3)

Unnamed: 0,text,cleaned_text,token,token_no_stopwords,stemming
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...,"[toy, story, led, by, woody, andys, toys, live...","[toy, story, led, woody, andys, toys, live, ha...","[toy, stori, led, woodi, andi, toy, live, happ..."
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...,"[jumanji, when, siblings, judy, and, peter, di...","[jumanji, siblings, judy, peter, discover, enc...","[jumanji, sibl, judi, peter, discov, enchant, ..."
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...,"[grumpier, old, men, a, family, wedding, reign...","[grumpier, old, men, family, wedding, reignite...","[grumpier, old, men, famili, wed, reignit, anc..."


#### 5.6 - Eliminar palabras duplicadas:

In [34]:
combined_features['stemming_unique'] = combined_features['stemming'].apply(lambda x: list(set(x)))
combined_features.head(3)

Unnamed: 0,text,cleaned_text,token,token_no_stopwords,stemming,stemming_unique
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...,"[toy, story, led, by, woody, andys, toys, live...","[toy, story, led, woody, andys, toys, live, ha...","[toy, stori, led, woodi, andi, toy, live, happ...","[duo, hank, birthday, r, plot, 1995, tim, put,..."
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...,"[jumanji, when, siblings, judy, and, peter, di...","[jumanji, siblings, judy, peter, discover, enc...","[jumanji, sibl, judi, peter, discov, enchant, ...","[creatur, 1995, lloyd, gari, leonard, judi, jo..."
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...,"[grumpier, old, men, a, family, wedding, reign...","[grumpier, old, men, family, wedding, reignite...","[grumpier, old, men, famili, wed, reignit, anc...","[grumpi, burgess, gate, ancient, 1995, meredit..."


#### convertir las listas a string

In [35]:
combined_features['token'] = combined_features['token'].apply(lambda words: ' '.join(set(words)))
combined_features['token_no_stopwords'] = combined_features['token_no_stopwords'].apply(lambda words: ' '.join(set(words)))
combined_features['stemming'] = combined_features['stemming'].apply(lambda words: ' '.join(set(words)))
combined_features['stemming_unique'] = combined_features['stemming_unique'].apply(lambda words: ' '.join(set(words)))
combined_features.head(3)

Unnamed: 0,text,cleaned_text,token,token_no_stopwords,stemming,stemming_unique
0,"Toy Story Led by Woody, Andy's toys live happi...",toy story led by woody andys toys live happily...,1995 tim put andys plots his learns room of by...,toys duo lasseter laurie jillette birthday r w...,duo hank birthday r plot 1995 tim put circumst...,duo hank birthday r plot 1995 tim put circumst...
1,Jumanji When siblings Judy and Peter discover ...,jumanji when siblings judy and peter discover ...,discover opens themselves 1995 lloyd bonnie li...,discover opens 1995 lloyd bonnie living risky ...,creatur 1995 lloyd gari leonard judi johnston ...,creatur 1995 lloyd leonard gari judi johnston ...
2,Grumpier Old Men A family wedding reignites th...,grumpier old men a family wedding reignites th...,buddies alarming the burgess than gate opens a...,buddies alarming burgess gate opens ancient 19...,grumpi burgess gate ancient 1995 meredith bro ...,grumpi burgess gate ancient 1995 meredith bro ...


### Guardamos el dataframe

In [36]:
#Guardo en CSV porque lo voy a usar para más adelante

#Activar cuando se quiera probar con otras columnas
#combined_features.to_csv('df_processed.csv', index=False)

## **6 - Pruebas para el modelo**

In [37]:
#Data a utilizar:
data = combined_features
pelis = df_resize
columns = ['cleaned_text', 'token', 'token_no_stopwords', 'stemming', 'stemming_unique']

In [38]:
vectorizer = TfidfVectorizer()

indice_pelicula = 24207 # Annabelle

In [39]:
for c in columns:
    print('*******************************************************')
    df = pd.DataFrame()
    df[c] = data[c]
    df[c] = df[c].astype(str).apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").replace(",", " "))

    # Preprocesamiento y vectorización de texto
    X = vectorizer.fit_transform(df[c])

    # Obtener las características de la película de interés
    pelicula_interes = df.iloc[indice_pelicula][c]
    pelicula_interes_vec = vectorizer.transform([pelicula_interes])

    # Calcular distancias utilizando vecinos más cercanos
    knn = NearestNeighbors(n_neighbors=6, algorithm='auto')
    knn.fit(X)
    distances, indices = knn.kneighbors(pelicula_interes_vec)
    indices_similares = indices[0][1:]  # Excluir la película de interés
    peliculas_similares = df.iloc[indices_similares]
    indices = peliculas_similares.index

    print(f'Peliculas recomendadas, usando columna: {c}')
    print(pelis.loc[indices, 'title'])
    print('-----------------------------------------------------------')
    print('*******************************************************')

*******************************************************
Peliculas recomendadas, usando columna: cleaned_text
9757         The Cat and the Canary
11481              Loving Annabelle
21234                 The Conjuring
23937                Alligator Eyes
41773    Annabelle Serpentine Dance
Name: title, dtype: object
-----------------------------------------------------------
*******************************************************
*******************************************************
Peliculas recomendadas, usando columna: token
21234             The Conjuring
39012           The Conjuring 2
15839    The Poughkeepsie Tapes
35993                   Martyrs
27125                    Ritual
Name: title, dtype: object
-----------------------------------------------------------
*******************************************************
*******************************************************
Peliculas recomendadas, usando columna: token_no_stopwords
21234             The Conjuring
39012           

#### 2677 #para probar 'Problem Child' - Mi pobre diablillo.

Peliculas recomendadas, usando columna: cleaned_text

- Problem Child 2
- Problem Child 3
- Life With Mikey
- Kiss of Death
- Four More Years

Peliculas recomendadas, usando columna: token

- Problem Child 2
- Problem Child 3
- Take Me Out to the Ball Game
- A Good Night to Die
- Look Who's Talking Too

Peliculas recomendadas, usando columna: token_no_stopwords

- Problem Child 2
- Problem Child 3
- Take Me Out to the Ball Game
- Look Who's Talking Too
- The Mask

Peliculas recomendadas, usando columna: stemming

- Problem Child 2
- Problem Child 3
- Ed
- Jim Breuer: And Laughter for All
- Take Me Out to the Ball Game

Peliculas recomendadas, usando columna: stemming_unique

- Problem Child 2
- Problem Child 3
- Ed
- Jim Breuer: And Laughter for All
- Take Me Out to the Ball Game


##### 14528 #para probar 'Avatar'

Peliculas recomendadas, usando columna: cleaned_text

- Avatar 2
- Avatar: Creating the World of Pandora
- Meeting David Wilson
- Rise of the Planet of the Apes
- Idiocracy

Peliculas recomendadas, usando columna: token

- Avatar 2
- Avatar: Creating the World of Pandora
- Rise of the Planet of the Apes
- X-Men Origins: Wolverine
- Under the Mountain

Peliculas recomendadas, usando columna: token_no_stopwords

- Avatar 2
- Avatar: Creating the World of Pandora
- Rise of the Planet of the Apes
- X-Men Origins: Wolverine
- Under the Mountain

Peliculas recomendadas, usando columna: stemming

- Avatar 2
- Avatar: Creating the World of Pandora
- Rise of the Planet of the Apes
- Under the Mountain
- X-Men Origins: Wolverine

Peliculas recomendadas, usando columna: stemming_unique

- Avatar 2
- Avatar: Creating the World of Pandora
- Rise of the Planet of the Apes
- Under the Mountain
- X-Men Origins: Wolverine


### Luego de chequear con varias películas de diferentes generos se determinó que la mejor opción era usar la columna stemming_unique

## **7 - Entrenamiento**

In [40]:
#Data a utilizar:
data_modelo = pd.DataFrame()
data_modelo['stemming_unique'] = combined_features['stemming_unique']
pelis = df_resize

In [41]:
data_modelo.head(3)

Unnamed: 0,stemming_unique
0,duo hank birthday r plot 1995 tim put circumst...
1,creatur 1995 lloyd leonard gari judi johnston ...
2,grumpi burgess gate ancient 1995 meredith bro ...


# **Abajo hay una nota para borrar OJO**

In [42]:
#Guardo en CSV porque lo voy a usar en la API
#NOTA PARA BORRAR este era el este es nuevo para usar  (Solo borrar la nota, es para guiarme cuando este armando la función)
data_modelo.to_csv('df_model.csv', index=False)

In [43]:
vectorizer = TfidfVectorizer()
trained_model_filename = 'trained_model.joblib' #Nombre del modelo a deployar
vectorizer_filename = 'vectorizer.joblib' #Vector del modelo
column = 'stemming_unique'

In [44]:
def train_model(column):
    """
    Entrena un modelo de vecinos más cercanos utilizando un conjunto de datos en forma de columna.
    Args:
        column (str): El nombre de la columna que contiene los datos de entrenamiento.
        Ejemplo: 'cleaned_text', 'token', 'token_no_stopwords', 'stemming', 'stemming_unique'
    """
    # Preprocesamiento y vectorización de texto
    df = pd.DataFrame()
    df[column] = data_modelo[column]
    df[column] = df[column].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").replace(",", " "))
    X = vectorizer.fit_transform(df[column])

    # Entrenar el modelo de vecinos más cercanos
    knn = NearestNeighbors(n_neighbors=6, algorithm='auto')
    knn.fit(X)

    # Guardar el modelo entrenado en un archivo
    joblib.dump(knn, trained_model_filename)
    joblib.dump(vectorizer, vectorizer_filename)
    print("Modelo entrenado y guardado.")

#### Entrenar

In [45]:
train_model(column)

Modelo entrenado y guardado.


Función para pedir la recomendación de peliculas:

In [46]:
def recommend_movies(movie_id, column):
    """
    Recomienda películas similares a una película de interés dada su index.
    Args:
        movie_id (int): El ID de la película de interés.
        column (str): El nombre de la columna que contiene los datos de las películas.
    Returns:
        recommended_movies (Series): Una serie que contiene los títulos de las películas recomendadas.
    """
    # Cargar el modelo entrenado desde el archivo
    knn = joblib.load(trained_model_filename)
    
    # Cargar el vectorizador
    vectorizer = joblib.load(vectorizer_filename)

    # Obtener las características de la película de interés
    df = pd.read_csv('df_model.csv', index_col=False)
    df[column] = df[column].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").replace(",", " "))
    pelicula_interes = df.iloc[movie_id][column]
    pelicula_interes_vec = vectorizer.transform([pelicula_interes])

    # Calcular distancias utilizando el modelo cargado
    distances, indices = knn.kneighbors(pelicula_interes_vec)
    indices_similares = indices[0][1:]  # Excluir la película de interés
    peliculas_similares = df.iloc[indices_similares]
    indices = peliculas_similares.index

    # Obtener los títulos de las películas recomendadas
    pelis = pd.DataFrame()
    pelis = pd.read_csv('df_resize.csv',index_col=False)
    recommended_movies = pelis.loc[indices, 'title']

    return recommended_movies

In [47]:
# Ejemplo de uso
movie_id = 21234
recommended_movies = recommend_movies(movie_id, column)
print(recommended_movies)

39012         The Conjuring 2
24207               Annabelle
31688             Dark Places
26501    Insidious: Chapter 3
16858               Insidious
Name: title, dtype: object


#### Establecer función para introducir nombre de la película en lugar del id:

In [48]:
def buscar_pelicula(df, nombre_pelicula):
    df['title_lower'] = df['title'].str.lower()
    nombre_pelicula = nombre_pelicula.lower()

    # Buscar la película en el DataFrame
    indice = df[df['title_lower'] == nombre_pelicula].index.tolist()

    if indice:
        return indice[0]  # Devolver el primer índice encontrado
    else:
        return "No se encontró ninguna coincidencia."

In [49]:
dataframe = pelis #deinido en celda 158
pelicula_buscada = input("Ingrese el nombre de la película: ")
indice_pelicula = buscar_pelicula(dataframe, pelicula_buscada)
print("El índice de la película es:", indice_pelicula)

El índice de la película es: 14528


In [50]:
def recommend_movies(title):
    """
    Recomienda películas similares en base a un título dado.
    Args:
        title (str): Título de la película.
    Returns:
        dict: Diccionario con las películas recomendadas.
    """
    #Data
    pelis = pd.DataFrame
    pelis = pd.read_csv('df_resize.csv', index_col=False) #aca colocar ../model/df_resize.csv
    df = pd.read_csv('df_model.csv', index_col=False) #aca colocar ../model/df_model.csv
    trained_model_filename = 'trained_model.joblib' #aca colocar ../model/trained_model.joblib
    vectorizer_filename = 'vectorizer.joblib' #aca colocar ../model/vectorizer.joblib
    column = column = 'stemming_unique'

    #Obtener index:
    pelis['title_lower'] = pelis['title'].str.lower()
    title = title.lower()
    title_index = pelis[pelis['title_lower'] == title].index.tolist()
    
    if not title_index:
        raise ValueError("La película no se encuentra en la base de datos.")
    else:
        movie_id = title_index[0]  # Devolver el primer índice encontrado

    # Cargar el modelo entrenado desde el archivo
    knn = joblib.load(trained_model_filename)
        # Cargar el vectorizador
    vectorizer = joblib.load(vectorizer_filename)

    # Obtener las características de la película de interés
    df[column] = df[column].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").replace(",", " "))
    pelicula_interes = df.iloc[movie_id][column]
    pelicula_interes_vec = vectorizer.transform([pelicula_interes])

    # Calcular distancias utilizando el modelo cargado
    distances, indices = knn.kneighbors(pelicula_interes_vec)
    indices_similares = indices[0][1:]  # Excluir la película de interés
    peliculas_similares = df.iloc[indices_similares]
    indices = peliculas_similares.index

    # Obtener los títulos de las películas recomendadas
    recommended_movies = pelis.loc[indices, 'title']
    movie_dict = {}
    for i, movie in enumerate(recommended_movies, 1):
        movie_dict[str(i)] = movie

    return {'peliculas recomendadas': movie_dict}

In [51]:
#prueba
recommend_movies('avatar')

{'peliculas recomendadas': {'1': 'Avatar 2',
  '2': 'Avatar: Creating the World of Pandora',
  '3': 'Rise of the Planet of the Apes',
  '4': 'Under the Mountain',
  '5': 'X-Men Origins: Wolverine'}}

#### Unir ambas funciones para usarlas en la API: