In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Cargar el dataset (asegúrate de tener el archivo "bgg-26m-reviews.csv" en el directorio de trabajo)
# Solo se leen las columnas relevantes: 'user', 'rating', 'id', 'comment'
df = pd.read_csv("../data/bgg-26m-reviews.csv", usecols=['user', 'rating', 'ID', 'comment'])
df = pd.read_csv("data/bgg-26m-reviews.csv", usecols=['user', 'rating', 'ID', 'comment'], chunksize=100000)

# Preprocesamiento
# 1. Eliminar duplicados
df.drop_duplicates(inplace=True)

# 2. Eliminar filas con valores nulos en las columnas críticas
df.dropna(inplace=True)

# 3. Convertir la columna rating de float a entero
# Se redondea primero y luego se convierte a entero
df['rating'] = df['rating'].round().astype(int)


In [6]:
df.head(5)

Unnamed: 0,user,rating,comment,ID
2,dougthonus,10,"Currently, this sits on my list as my favorite...",13
3,cypar7,10,"I know it says how many plays, but many, many ...",13
7,hreimer,10,i will never tire of this game.. Awesome,13
11,daredevil,10,This is probably the best game I ever played. ...,13
16,hurkle,10,Fantastic game. Got me hooked on games all ove...,13


In [7]:
# Inicializar el analizador de sentimiento de NLTK
sid = SentimentIntensityAnalyzer()

# Función para normalizar el compound score (rango [-1,1]) a un rango de 1 a 10
def normalize_sentiment(compound_score):
    # Mapear: -1 --> 1 y 1 --> 10
    return ((compound_score + 1) / 2) * 9 + 1

# Aplicar análisis de sentimiento a cada comentario y normalizarlo
df['sentiment'] = df['comment'].apply(lambda x: sid.polarity_scores(x)['compound'])
df['sentiment_normalized'] = df['sentiment'].apply(normalize_sentiment)
df.head(5)


Unnamed: 0,user,rating,comment,ID,sentiment,sentiment_normalized
2,dougthonus,10,"Currently, this sits on my list as my favorite...",13,0.4588,7.5646
3,cypar7,10,"I know it says how many plays, but many, many ...",13,0.9001,9.55045
7,hreimer,10,i will never tire of this game.. Awesome,13,0.6249,8.31205
11,daredevil,10,This is probably the best game I ever played. ...,13,0.765,8.9425
16,hurkle,10,Fantastic game. Got me hooked on games all ove...,13,0.5574,8.0083


In [None]:
# Calcular el score ponderado: 70% del rating y 30% del sentimiento normalizado
df['weighted_score'] = df['rating'] * 0.7 + df['sentiment_normalized'] * 0.3

ratings_matrix = df.pivot_table(index='user', columns='ID', values='weighted_score', fill_value=0)


  num_cells = num_rows * num_columns


IndexError: index 1686545168 is out of bounds for axis 0 with size 1686536272

In [None]:
# Función de recomendación personalizada basada en filtrado colaborativo
def recommend_games(target_user, ratings_matrix, top_n=5):
    if target_user not in ratings_matrix.index:
        print("Usuario no encontrado en el dataset.")
        return []
    
    # Calcular la similitud entre usuarios usando la similitud coseno
    user_sim_matrix = cosine_similarity(ratings_matrix)
    sim_df = pd.DataFrame(user_sim_matrix, index=ratings_matrix.index, columns=ratings_matrix.index)
    
    # Obtener la similitud del usuario objetivo con los demás (excluyéndolo a él mismo)
    target_sim = sim_df[target_user].drop(target_user)
    
    # Seleccionar los usuarios más similares (por ejemplo, los top 10)
    similar_users = target_sim.sort_values(ascending=False).head(10)
    
    # Predecir el score para cada juego que el usuario aún no ha valorado
    user_ratings = ratings_matrix.loc[target_user]
    unrated_games = user_ratings[user_ratings == 0].index
    
    predictions = {}
    for game in unrated_games:
        num = 0
        den = 0
        for other_user, sim_score in similar_users.items():
            rating = ratings_matrix.loc[other_user, game]
            if rating > 0:
                num += sim_score * rating
                den += sim_score
        if den > 0:
            predictions[game] = num / den
        else:
            predictions[game] = 0
    # Ordenar las predicciones y devolver las top_n recomendaciones
    recommended_games = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return recommended_games

    

ordenado
Juego recomendado: 347616.0


In [None]:
# Ejemplo de uso:
# Cambia 'usuario_ejemplo' por un identificador de usuario existente en tu dataset
target_user = 'usuario_ejemplo'
recommendations = recommend_games(target_user, ratings_matrix)



In [None]:
print("Recomendaciones para el usuario {}:".format(target_user))
for game, score in recommendations:
    print(f"Juego ID: {game}, Score Predicho: {score:.2f}")