# ML

## Librerías

In [225]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [226]:
mov_df = pd.read_csv("2_datasets_etl/movies_dataset_cleaned.csv")

In [227]:
# Vectorizar genero 
gen_df = pd.read_csv("2_datasets_etl/genres_data.csv")
# Agregar campo marca
gen_df['genre_mark'] = 1
#Pivotear
gen_df = gen_df.drop_duplicates().pivot(index='id_movie', columns='name_genres', values='genre_mark').fillna(0).reset_index()

# Eliminar filas duplicadas
gen_df = gen_df.drop_duplicates()
gen_df.to_csv("5_datasets_ml/genres_data_ml.csv", index=False)

gen_df


name_genres,id_movie,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,11,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42986,465044,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42987,467731,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42988,468343,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
42989,468707,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [228]:
columns_gen = gen_df.columns[gen_df.columns != 'id_movie']
columns_list = ['vote_average', 'title'] 
# columns_list = columns_list + columns_gen.tolist()

columns_list

['vote_average', 'title']

In [241]:
# Join pelicula y genero 
merged_df = pd.merge(mov_df, gen_df,  left_on='id', right_on='id_movie', how='inner')
merged_df = merged_df[columns_list]

# Seleccionar los primeros 10000 registros
half_rows = int(len(merged_df) * 0.2)

merged_df = merged_df.head(half_rows)

merged_df.to_csv("5_datasets_ml/movies_dataset_cleaned_ml.csv", index=False)

merged_df

Unnamed: 0,vote_average,title
0,7.7,Toy Story
1,6.9,Jumanji
2,6.5,Grumpier Old Men
3,6.1,Waiting to Exhale
4,5.7,Father of the Bride Part II
...,...,...
8590,6.1,The Adventure of Sherlock Holmes' Smarter Brother
8591,5.0,The Adventures of the Wilderness Family
8592,7.3,"Cousin, Cousine"
8593,5.5,Dolemite


In [242]:

# Crear una matriz TF-IDF para las descripciones de las películas
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['title'])

# Calcular la similitud del coseno entre las descripciones
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)



In [None]:
# Función para obtener recomendaciones de películas
def get_movie_recommendations(title, cosine_sim=cosine_sim):
    if merged_df[merged_df['title'].str.lower() == title.lower()].shape[0]  == 0:
        return [] 
    idx = merged_df[merged_df['title'].str.lower() == title.lower()].index[0]  # Obtener el índice de la película
    sim_scores = list(enumerate(cosine_sim[idx]))  # Obtener los puntajes de similitud
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Ordenar por similitud
    sim_scores = sim_scores[1:6]  # Obtener las 05 películas más similares (excluyendo la misma película)
    movie_indices = [i[0] for i in sim_scores]  # Obtener los índices de las películas similares
    return merged_df['title'].iloc[movie_indices]  # Devolver los títulos de las películas similares


In [246]:
# Función para obtener recomendaciones basadas en el título, vote_average y géneros
def get_movie_recommendations(title, cosine_sim=cosine_sim):
    # Obtener el índice de la película a partir de su título
    if merged_df[merged_df['title'].str.lower() == title.lower()].shape[0]  == 0:
        return [] 
    # Obtener el índice de la película
    idx = merged_df[merged_df['title'].str.lower() == title.lower()].index[0]  
    
    # Calcular la similitud del título de la película con todas las películas
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Ordenar las películas según la similitud y obtener las 5 más similares
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    
    # Obtener los índices de las películas recomendadas
    movie_indices = [i[0] for i in sim_scores]
    
    # Filtrar las películas recomendadas por géneros y voto_average
    recommended_movies = merged_df.iloc[movie_indices]
    
    # Devolver los títulos de las películas recomendadas
    return recommended_movies['title'].tolist()




In [None]:
""" # Crear una instancia de TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Aplicar TF-IDF al título de las películas
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['title'])

# Calcular la similitud del coseno entre los títulos
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Obtener los índices y títulos de las películas
indices = pd.Series(merged_df.index, index=merged_df['title'])

# Función para obtener recomendaciones basadas en el título y vote_average
def get_recommendations(title, vote_threshold=7.0, num_recommendations=10):
    # Obtener el índice de la película a partir de su título
    idx = indices[title]
    
    # Calcular la similitud del título de la película con todas las películas
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Filtrar las películas por umbral de vote_average
    sim_scores = [score for score in sim_scores if merged_df['vote_average'][score[0]] >= vote_threshold]
    
    # Ordenar las películas según la similitud y el vote_average
    sim_scores = sorted(sim_scores, key=lambda x: (x[1], merged_df['vote_average'][x[0]]), reverse=True)
    
    # Obtener los índices de las películas recomendadas
    recommended_indices = [score[0] for score in sim_scores[:num_recommendations]]
    
    # Obtener las películas recomendadas
    recommended_movies = merged_df.iloc[recommended_indices]
    
    return recommended_movies

# Ejemplo de uso
movie_title = 'The Dark Knight'
recommendations = get_recommendations(movie_title, vote_threshold=7.5, num_recommendations=5)
print(recommendations[['title', 'vote_average']]) """


In [None]:
""" # Función para obtener recomendaciones basadas en el título, vote_average y géneros
def get_recommendations_with_genres(title, vote_threshold=7.0, num_recommendations=10):
    # Obtener el índice de la película a partir de su título
    idx = indices[title]
    
    # Calcular la similitud del título de la película con todas las películas
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Filtrar las películas por umbral de vote_average
    sim_scores = [score for score in sim_scores if merged_df['vote_average'][score[0]] >= vote_threshold]
    
    # Obtener los índices de las películas recomendadas
    recommended_indices = [score[0] for score in sim_scores]
    
    # Filtrar las películas recomendadas por géneros
    recommended_movies = merged_df.iloc[recommended_indices]
    recommended_movies = recommended_movies[recommended_movies[columns_gen].sum(axis=1) > 0]
    
    # Ordenar las películas según la similitud y el vote_average
    recommended_movies = recommended_movies.sort_values(by=['vote_average'], ascending=False)
    
    # Obtener las primeras películas recomendadas
    recommended_movies = recommended_movies.head(num_recommendations)
    
    return recommended_movies

# Ejemplo de uso
movie_title = 'The Dark Knight'
recommendations_with_genres = get_recommendations_with_genres(movie_title, vote_threshold=7.5, num_recommendations=5)
print(recommendations_with_genres[['title', 'vote_average', 'genres']]) """


In [247]:
# Ejemplo de recomendación
movie_title = 'Toy Story'  # Cambiar por el título de una película
recommendations = get_movie_recommendations(movie_title)
print(f"Recomendaciones de películas similares a '{movie_title}':")
print(recommendations)


Recomendaciones de películas similares a 'Toy Story':
['Toy Story 2', 'The Toy', 'Toy Soldiers', 'L.A. Story', 'The Story of Us']


In [248]:
# Ejemplo de recomendación
movie_title = 'y'  # Cambiar por el título de una película
recommendations = get_movie_recommendations(movie_title)
print(f"Recomendaciones de películas similares a '{movie_title}':")
print(recommendations)

Recomendaciones de películas similares a 'y':
[]


In [249]:
# Ejemplo de recomendación
movie_title = 'Jurassic Park'  # Cambiar por el título de una película
recommendations = get_movie_recommendations(movie_title)
print(f"Recomendaciones de películas similares a '{movie_title}':")
print(recommendations)

Recomendaciones de películas similares a 'Jurassic Park':
['Jurassic Park III', 'The Lost World: Jurassic Park', 'Dog Park', 'Sunset Park', 'Barefoot in the Park']


In [250]:
# Ejemplo de recomendación
movie_title = 'The Avengers'  # Cambiar por el título de una película
recommendations = get_movie_recommendations(movie_title)
print(f"Recomendaciones de películas similares a '{movie_title}':")
print(recommendations)

Recomendaciones de películas similares a 'The Avengers':
['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale', 'Father of the Bride Part II']
