In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df_steam_games = pd.read_parquet('../Data/df_steam_games.parquet')
df_user_items = pd.read_parquet('../Data/df_user_items.parquet')

In [4]:
df_steam_games = df_steam_games.rename(columns={'id': 'item_id'})

In [37]:
df_steam_games['item_id'] = df_steam_games['item_id'].dropna().astype(str)

In [5]:
# Crear un DF con las columnas necesarias
df_games_specs = df_steam_games.loc[:, ["specs", "item_id", "title"]]

# Analisis de los datos nulos
df_games_specs.isnull().mean()

specs      0.021270
item_id    0.000065
title      0.062866
dtype: float64

In [21]:
df_games_specs.head()

Unnamed: 0,specs,item_id,title
0,Single-player,761140,Lost Summoner Kitty
1,Single-player Multi-player Online Multi-Player...,643980,Ironbound
2,Single-player Multi-player Online Multi-Player...,670290,Real Pool 3D - Poolians
3,Single-player,767400,弹炸人2222
5,Single-player Steam Achievements,772540,Battle Royale Trainer


In [7]:
# Eliminar los datos nulos
df_games_specs = df_games_specs.dropna()

# Limpiar la columna "specs"
df_games_specs['specs'] = df_games_specs['specs'].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", ''))

In [8]:
# Crear una instancia de CountVectorizer
cv = CountVectorizer()

# Ajustar y transformar el texto en una matriz dispersa
vector = cv.fit_transform(df_games_specs['specs']).toarray()

In [9]:
# Calcular la similitud del coseno entre los valores de la columna "specs"
similarity = cosine_similarity(vector)

In [44]:
def recomendacion_juego(id_de_producto):
    id_de_producto = str(id_de_producto)
    # Buscar el indice ingresado en el DF "df_games_specs"
    filter = df_games_specs['item_id'] == id_de_producto

    item_id = df_games_specs[filter].index[0]
    
    # Asegurarse de que el índice esté dentro del rango
    if item_id >= similarity.shape[0]:
        return []  # Si está fuera de rango, retornar una lista vacía
    # Similitud entre el juego de entrada y todos los demás juegos en el conjunto de datos
    similar_scores = similarity[item_id]
    
    # Ordenar de manera descendente y seleccionar los cinco juegos más similares
    similar_items = sorted(list(enumerate(similar_scores)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Devolver la lista de títulos recomendados
    recommended_titles = [df_games_specs.iloc[e[0]]['title'] for e in similar_items]
    
    return recommended_titles

In [45]:

# Aplicar la función y crear una nueva columna
df_games_specs['recommends'] = df_games_specs['item_id'].apply(recomendacion_juego)

In [48]:
# Eliminar columnas innecesarias
df_games_specs.drop(columns=['title', 'specs'], inplace=True)

df_games_specs.head()

Unnamed: 0,item_id,recommends
0,761140,"[弹炸人2222, Beach Rules, Planetarium 2 - Zen Ody..."
1,643980,"[Duelyst, Warhammer 40,000: Regicide, KROSMAGA..."
2,670290,"[Heroes of Havoc: Idle Adventures, Tactical Mo..."
3,767400,"[弹炸人2222, Beach Rules, Planetarium 2 - Zen Ody..."
5,772540,"[SNOW - All Access Pro Pass, SNOW - All Access..."


In [50]:
df_games_specs.to_parquet('df_games_specs.parquet', index=False)