In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# We import the datasests 
dfPlay = pd.read_csv('datasets/play_time.csv')
dfGameGenre= pd.read_csv('datasets/game_genres.csv')
dfnameGame = pd.read_csv('datasets/name_games.csv')
dfGenre = pd.read_csv('datasets/genres.csv')
dfReviews = pd.read_csv('datasets/reviews.csv')

# Recommendation System
### Cosine Similarity: item - item

An item is taken, based on how similar it is to the rest, similar items are recommended.</br>
Here, the input is a game and the output is a list of recommended games.

In [3]:
# For this game recommendation system, we will consider the genres and playtime of users.
# Any game without a record of played hours will not be considered/recommended.

# We select the dataframes we will work with
print(dfGameGenre.head(1)) # Main dataframe for game-genre relationship
print(dfnameGame.head(1))
print(dfGenre.head(1))
print(dfPlay.head(1))

   id_gameGenre  id_game  id_genre
0             1   761140         1
   id_game            name_game  anio
0   761140  LOST SUMMONER KITTY  2018
   id_genre   genre
0         1  Action
   id_game  playtime_forever            id_user
0       10                 6  76561197970982479


In [4]:
# Realizamos las uniones correspondientes para obtener el dataframe final
gameGenre = pd.merge(dfGameGenre, dfnameGame, on='id_game', how='left')
gameGenre = pd.merge(gameGenre, dfGenre, on='id_genre', how='left')


cols = ['id_game','name_game','genre']
gameGenre = gameGenre[cols]

In [5]:
gameGenre.head()

Unnamed: 0,id_game,name_game,genre
0,761140,LOST SUMMONER KITTY,Action
1,767400,DAN ZHA REN 2222,Action
2,773570,LOG CHALLENGE,Action
3,772540,BATTLE ROYALE TRAINER,Action
4,768570,UNCANNY ISLANDS,Action


In [6]:
gameGenre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85487 entries, 0 to 85486
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id_game    85487 non-null  int64 
 1   name_game  85487 non-null  object
 2   genre      85487 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [7]:
gameGenre.sample(5)

Unnamed: 0,id_game,name_game,genre
67831,457380,STORM VR,Adventure
65950,741310,BIRCH (CHARACTER FOR OCCULT PRERAISE),Adventure
56384,426170,DEFRAGMENTED,RPG
63224,316160,BROKEN SWORD 4 - THE ANGEL OF DEATH,Adventure
51396,381162,DGU - FINALS WEEK,Strategy


In [8]:
# In a previous analysis, we decided not to remove values from the playtime_forever column.
# We normalize the name_game column.
gameGenre['name_game'] = gameGenre['name_game'].apply(lambda x: re.sub(r'[^\w\s]', '', x.replace('-', ' ')).strip().upper())

In [9]:
# Reverting a process, just like in its early stages.
# Group the 'gameGenre' dataframe by the 'id_game' and 'name_game' columns.
# Apply a lambda function to obtain unique values in the 'genre' column for each group.
gameGenre = gameGenre.groupby(['id_game','name_game'])['genre'].apply(lambda x: x.unique()).reset_index()
gameGenre.head()

Unnamed: 0,id_game,name_game,genre
0,10,COUNTER STRIKE,[Action]
1,20,TEAM FORTRESS CLASSIC,[Action]
2,30,DAY OF DEFEAT,[Action]
3,40,DEATHMATCH CLASSIC,[Action]
4,50,HALF LIFE OPPOSING FORCE,[Action]


In [10]:
# Next, we merge the resulting dataframe with dfPlay.
gameGenre = pd.merge(gameGenre, dfPlay, on='id_game', how='left' )

# We specify the columns we will use for the recommendation system.
cols = ['id_game','name_game', 'genre', 'playtime_forever']
gameGenre = gameGenre[cols]

gameGenre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4317005 entries, 0 to 4317004
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id_game           int64  
 1   name_game         object 
 2   genre             object 
 3   playtime_forever  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 131.7+ MB


In [11]:
gameGenre.head()

Unnamed: 0,id_game,name_game,genre,playtime_forever
0,10,COUNTER STRIKE,[Action],6.0
1,10,COUNTER STRIKE,[Action],0.0
2,10,COUNTER STRIKE,[Action],0.0
3,10,COUNTER STRIKE,[Action],93.0
4,10,COUNTER STRIKE,[Action],108.0


In [12]:
# We remove records that do not contribute value to the recommendation system.

# Drop records with null values
gameGenre = gameGenre.dropna()

# Filter the dataframe with playtime different from 0
gameGenre = gameGenre[gameGenre['playtime_forever'] != 0]

gameGenre.reset_index(drop=True, inplace=True)

In [13]:
gameGenre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2868055 entries, 0 to 2868054
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id_game           int64  
 1   name_game         object 
 2   genre             object 
 3   playtime_forever  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 87.5+ MB


In [14]:
gameGenre

Unnamed: 0,id_game,name_game,genre,playtime_forever
0,10,COUNTER STRIKE,[Action],6.0
1,10,COUNTER STRIKE,[Action],93.0
2,10,COUNTER STRIKE,[Action],108.0
3,10,COUNTER STRIKE,[Action],328.0
4,10,COUNTER STRIKE,[Action],6275.0
...,...,...,...,...
2868050,527510,THE LEGIONS OF ROME,"[Casual, Indie, Simulation, Strategy]",32.0
2868051,527810,DYNAMITE ALEX,"[Action, Indie]",2.0
2868052,527890,SHOP N SPREE SHOPPING PARADISE,"[Casual, Simulation]",1.0
2868053,527900,GRIM TALES THE BRIDE COLLECTORS EDITION,"[Casual, Adventure]",44.0


In [15]:
# Convert the 'genre' column into a comma-separated string
gameGenre['genre_str'] = gameGenre['genre'].apply(lambda x: ', '.join(x))

cols = ['id_game','name_game', 'genre_str', 'playtime_forever']
gameGenre = gameGenre[cols]
gameGenre.head()

Unnamed: 0,id_game,name_game,genre_str,playtime_forever
0,10,COUNTER STRIKE,Action,6.0
1,10,COUNTER STRIKE,Action,93.0
2,10,COUNTER STRIKE,Action,108.0
3,10,COUNTER STRIKE,Action,328.0
4,10,COUNTER STRIKE,Action,6275.0


In [16]:
# Again...
# Group the 'gameGenre' dataframe by the 'id_game', 'name_game', and 'genre_str' columns.
# For each unique combination, calculate the sum of values in the 'playtime_forever' column.
gameGenre = gameGenre.groupby(['id_game','name_game', 'genre_str'])['playtime_forever'].sum().reset_index()
gameGenre.head()

Unnamed: 0,id_game,name_game,genre_str,playtime_forever
0,10,COUNTER STRIKE,Action,17386015.0
1,20,TEAM FORTRESS CLASSIC,Action,961702.0
2,30,DAY OF DEFEAT,Action,758991.0
3,40,DEATHMATCH CLASSIC,Action,154486.0
4,50,HALF LIFE OPPOSING FORCE,Action,734562.0


In [17]:
# Aquí les presento el dataframe final.
gameGenre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_game           8523 non-null   int64  
 1   name_game         8523 non-null   object 
 2   genre_str         8523 non-null   object 
 3   playtime_forever  8523 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 266.5+ KB


In [18]:
# We reduce the dataframe to optimize the deployment on Render.
gameGenre = gameGenre.sort_values(by='playtime_forever', ascending=False)
gameGenre = gameGenre.head(1200)
gameGenre.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200 entries, 22 to 1658
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_game           1200 non-null   int64  
 1   name_game         1200 non-null   object 
 2   genre_str         1200 non-null   object 
 3   playtime_forever  1200 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 46.9+ KB


# Sistema de Recomendación

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Normalizar la columna 'playtime_forever'
scaler = MinMaxScaler()
gameGenre['playtime_normalized'] = scaler.fit_transform(gameGenre['playtime_forever'].values.reshape(-1, 1))

In [20]:
gameGenre.head()

Unnamed: 0,id_game,name_game,genre_str,playtime_forever,playtime_normalized
22,730,COUNTER STRIKE GLOBAL OFFENSIVE,Action,785184267.0,1.0
137,4000,GARRYS MOD,"Indie, Simulation",448366616.0,0.570984
1212,105600,TERRARIA,"Action, Indie, RPG, Adventure",154974541.0,0.197282
1769,230410,WARFRAME,"Action, Free to Play",124027703.0,0.157864
10,240,COUNTER STRIKE SOURCE,Action,112612047.0,0.143323


In [21]:
#gameGenre.to_csv('api_steam/data_render/gameRecom.csv', index=False)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object to convert text into a matrix of term-document
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
matriz_generos = vectorizer.fit_transform(gameGenre['genre_str'])



In [23]:
# Calculate cosine similarity between games based on genre
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(matriz_generos, matriz_generos)

In [24]:
# Assign a weight for genre similarity and playtime
genre_weight = 0.7
playtime_weight = 0.3

# Calculate the weighted similarity score
weighted_similarity = (genre_weight * cosine_sim) + (playtime_weight * gameGenre['playtime_normalized'].values.reshape(-1, 1))


In [25]:
import pickle

# Save weighted_similarity to a file using pickle
'''with open('api_steam/data_render/weighted_similarity.pkl', 'wb') as file:
    pickle.dump(weighted_similarity, file)'''

import gzip

# Save weighted_similarity to a compressed file using gzip
'''with gzip.open('api_steam/data_render/weighted_similarity.pkl.gz', 'wb') as file:
   pickle.dump(weighted_similarity, file)'''


In [26]:
# Assuming you have calculated weighted_similarity_scores previously
def get_weighted_recommendations(game_name, weighted_similarity_scores):
    game_index = gameGenre[gameGenre['name_game'] == game_name].index[0]
    similar_games = list(enumerate(weighted_similarity_scores[game_index]))
    similar_games = sorted(similar_games, key=lambda x: x[1], reverse=True)
    similar_games = similar_games[1:11]  # Obtener las 10 recomendaciones principales
    game_indices = [index[0] for index in similar_games]
    return gameGenre['name_game'].iloc[game_indices]

In [27]:
gameGenre[gameGenre['name_game'].str.contains('CAR', case=False, na=False)]

Unnamed: 0,id_game,name_game,genre_str,playtime_forever,playtime_normalized
1440,209080,GUNS OF ICARUS ONLINE,"Action, Indie, Simulation, Adventure",3425754.0,0.004249
1834,234630,PROJECT CARS,"Action, Casual, Indie, Simulation, Sports, Rac...",951783.0,0.001098
1549,215470,PRIMAL CARNAGE,"Action, Indie",908722.0,0.001043
7380,442080,RIDERS OF ICARUS,"Free to Play, RPG, Adventure, Massively Multip...",790044.0,0.000892
3169,293260,CARD HUNTER,"Strategy, Free to Play, RPG",761706.0,0.000856
2291,257730,INFINITY WARS ANIMATED TRADING CARD GAME,"Indie, Strategy, Free to Play, Massively Multi...",609582.0,0.000662
1746,228380,NEXT CAR GAME WRECKFEST,"Action, Indie, Simulation, Sports, Racing, Ear...",402053.0,0.000398
3929,321360,PRIMAL CARNAGE EXTINCTION,"Action, Indie",351430.0,0.000333
3184,293760,AUTOMATION THE CAR COMPANY TYCOON GAME,"Simulation, Strategy, Early Access",223966.0,0.000171
340,12360,FLATOUT ULTIMATE CARNAGE,Racing,161822.0,9.2e-05


In [28]:
# Get weighted recommendations for a specific game
game_name = 'FRONTLINESTM FUEL OF WARTM'
weighted_recommendations = get_weighted_recommendations(game_name, weighted_similarity)
print(weighted_recommendations)

161            NATURAL SELECTION 2
1297               ORCS MUST DIE 2
1468                     SANCTUM 2
1970                        VERDUN
1196                 ORCS MUST DIE
1377                      GUNPOINT
1522                    TOWER WARS
3255    GEMCRAFT   CHASING SHADOWS
4152                    REASSEMBLY
1456                CORTEX COMMAND
Name: name_game, dtype: object
