### Analisis exploratorio de datos

In [51]:
import pandas as pd 
import numpy as np 
import seaborn as sb


In [52]:
data = pd.read_parquet("df_final.parquet")

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45379 entries, 0 to 45378
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   budget                 45379 non-null  object 
 1   id                     45379 non-null  object 
 2   original_language      45368 non-null  object 
 3   overview               44438 non-null  object 
 4   popularity             45377 non-null  object 
 5   release_date           45376 non-null  object 
 6   revenue                45379 non-null  float64
 7   runtime                45130 non-null  float64
 8   status                 45296 non-null  object 
 9   tagline                20398 non-null  object 
 10  title                  45376 non-null  object 
 11  vote_average           45376 non-null  float64
 12  vote_count             45376 non-null  float64
 13  belongs_to_collection  45379 non-null  object 
 14  genres                 45379 non-null  object 
 15  pr

In [53]:
# Eliminamos las columnas que no nos sirven
data = data.drop(['budget', 'release_date', 'status', 'vote_count',
                  'spoken_languages', 'tagline'], axis=1)

In [54]:
data = data.dropna(subset=['overview'])

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44435 entries, 0 to 45375
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    44435 non-null  object 
 1   original_language     44425 non-null  object 
 2   overview              44435 non-null  object 
 3   popularity            44435 non-null  float64
 4   revenue               44435 non-null  float64
 5   runtime               44435 non-null  float64
 6   title                 44435 non-null  object 
 7   vote_average          44435 non-null  float64
 8   genres                44435 non-null  object 
 9   production_companies  44435 non-null  object 
 10  production_countries  44435 non-null  object 
 11  release_year          44435 non-null  float64
 12  return                44435 non-null  float64
dtypes: float64(6), object(7)
memory usage: 4.7+ MB


Usaremos las siguientes columnas como filtros para eliminar peliculas no interesantes:
* popularity 
* revenue
* vote_average
* release_year
* original_language
* runtime
* production_countries

In [56]:
data['popularity'] = pd.to_numeric(data['popularity'], errors='coerce')

In [57]:
# Eliminamos los valores por debajo de la mediana ya que el histograma tiene sesgo positivo
threshold = data['popularity'].median() 
data = data[data['popularity'] >= threshold]

In [35]:
language_counts = data['original_language'].value_counts()
language_counts.head(10)

original_language
en    16725
fr     1296
ja      692
it      534
de      405
es      371
ko      276
ru      269
hi      242
zh      217
Name: count, dtype: int64

In [None]:
# Definimos la lista de idiomas que queremos conservar
languages_to_keep = ['en', 'es']

# Filtramos el dataframe para tener solo las peliculas en ingles y espaniol
data = data[data['original_language'].isin(languages_to_keep)]


In [60]:
# Quitamos las peliculas mas viejas
data = data[data['release_year'] >= 1970]

In [62]:
# Quitamos las peliculas con menor puntuacion
threshold = data['vote_average'].median() 
data = data[data['popularity'] >= threshold]

In [63]:
# Quitamos las peliculas con pocas palabras en el resumen
data = data[data['overview'].apply(lambda x: len(str(x).split()) >= 60)]

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2064 entries, 1 to 45098
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    2064 non-null   object 
 1   original_language     2064 non-null   object 
 2   overview              2064 non-null   object 
 3   popularity            2064 non-null   float64
 4   revenue               2064 non-null   float64
 5   runtime               2064 non-null   float64
 6   title                 2064 non-null   object 
 7   vote_average          2064 non-null   float64
 8   genres                2064 non-null   object 
 9   production_companies  2064 non-null   object 
 10  production_countries  2064 non-null   object 
 11  release_year          2064 non-null   float64
 12  return                2064 non-null   float64
dtypes: float64(6), object(7)
memory usage: 225.8+ KB


In [65]:
df = data

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
import pandas as pd

# Supongamos que ya tienes tu DataFrame `df`

# Paso 1: Asegúrate de que `overview` esté en formato de texto
df['overview'] = df['overview'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x).fillna('')

# Paso 2: Convierte las listas en texto para `genres`, `production_companies` y `production_countries`
df['genres'] = df['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
df['production_companies'] = df['production_companies'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
df['production_countries'] = df['production_countries'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Paso 3: Verifica si las celdas están en formato de texto
df['genres'] = df['genres'].astype(str)
df['production_companies'] = df['production_companies'].astype(str)
df['production_countries'] = df['production_countries'].astype(str)



# Vectorización de `overview` usando TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Vectorización de `genres` usando MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genres_matrix = mlb.fit_transform(df['genres'].str.split(', '))

# Normalización de `vote_average` y `release_year`
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[['vote_average', 'release_year']])

# Concatenar todas las características en una matriz final
from scipy.sparse import hstack
feature_matrix = hstack([tfidf_matrix, genres_matrix, scaled_features])

# Ahora puedes calcular la similitud del coseno con la matriz final
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(feature_matrix)


In [67]:
import pyarrow

In [75]:
df.to_parquet("peliculas_para_el_modelo2.parquet", index=False)

In [68]:
def get_recommendations(title, df, cosine_sim, top_n=5):
    # Obtener el índice de la película que coincide con el título
    try:
        idx = df[df['title'] == title].index[0]
    except IndexError:
        return f"No se encontró la película '{title}' en la base de datos."

    # Obtener las puntuaciones de similitud de todas las películas con la película dada
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Ordenar las películas en base a la similitud (de mayor a menor) y seleccionar las top_n
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Obtener los índices de las películas más similares (excluyendo la misma película)
    sim_indices = [i[0] for i in sim_scores[1:top_n+1]]

    # Retornar los títulos de las películas recomendadas
    recommended_titles = df.iloc[sim_indices]['title'].tolist()
    return recommended_titles


In [74]:
title = "The Dark Knight"
recommendations = get_recommendations(title, df, cosine_sim)
print("Películas recomendadas:", recommendations)

IndexError: index 12476 is out of bounds for axis 0 with size 2064

In [72]:
df['title']

1                      Jumanji
14            Cutthroat Island
20                  Get Shorty
22                   Assassins
30             Dangerous Minds
                 ...          
44930           The Dark Tower
44947                 Security
44980            Dirty Dancing
45056                Bedeviled
45098    S.W.A.T.: Under Siege
Name: title, Length: 2064, dtype: object