# 1. ETL, Extracción Transformación y Carga de Datos

En este proceso extraemos los datos necesarios y los limpiamos para tenerlos en formato correcto para nuestros fines.

In [1]:
import pandas as pd
import json
import ast
import warnings
from io import StringIO
import hashlib
import matplotlib.pyplot as plt

from typing import List, Dict
import base64, csv

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px
import seaborn as sns
import importlib

warnings.filterwarnings('ignore')

In [2]:

def showPie(columna):
  count_values = pd.Series(columna).value_counts()
  if len(count_values) > 15:
    count_values = count_values.iloc[0:15]
  datos = pd.DataFrame({"valor":count_values.index, "ocurrencia": count_values.values})

  plt.title(columna.name)
  plt.pie(datos["ocurrencia"], labels=datos['valor'], autopct='%1.1f%%')
  plt.show()

def concatenar(data_1, data_2,  axis=1):
  return pd.concat([data_1, data_2], axis=axis)

def contar_nulos(data):
  return data.isna().sum()

def mapear(columna: pd.Series, mapa={'NO': 0, 'SI':1}):
  return columna.map(mapa)

def showPiePx(columna, max=15):
  count_values = pd.Series(columna).value_counts()
  if len(count_values) > max:
    count_values = count_values.iloc[0:max]
  datos = pd.DataFrame({"valor":count_values.index, "ocurrencia": count_values.values})
  fig = px.pie(datos, values='ocurrencia', names='valor', title=columna.name)
  fig.update_traces(textposition='outside', textinfo='percent+label')
  fig.show()
  
def nulos_filas(data):
    return pd.DataFrame(data.isna().sum(axis=1).value_counts().sort_values(ascending=False).reset_index().values, columns=['cant_col_nulas', 'cantidad'])

def hash_fila(fila) -> str:
  return hashlib.md5(fila.astype(str).values.tobytes()).hexdigest()

In [3]:
URL_STEAM_GAMES = 'datasets/origin/output_steam_games.json'
URL_USERS_ITEMS = 'datasets/australian_users_items.json'
URL_USERS_REVIEWS = 'datasets/australian_user_reviews.json'

## 1. ETL de Dataset Steam Games

In [4]:
df_games_all = pd.read_json(URL_STEAM_GAMES, lines=True)

In [5]:
nulos_filas(df_games_all)

Unnamed: 0,cant_col_nulas,cantidad
0,13,88310
1,0,22530
2,1,6070
3,5,1940
4,3,733
5,4,391
6,2,349
7,6,121
8,10,1


Revisando los valores nulos a lo largo del eje 1, vemos que hay un número muy grande (88310) de filas completamente vacías, ya que tiene 13 columnas el dataset y la cuenta de nulos es igual a ese valor. Por polìtica se decide eliminar las filas que contienen más de la mitad de las columnas nulas.

In [6]:
df_games = df_games_all.drop(df_games_all[df_games_all.isna().sum(axis=1)>6].index)

In [7]:
df_games.describe(include='object').T

Unnamed: 0,count,unique,top,freq
publisher,24083.0,8239.0,Ubisoft,385.0
genres,28852.0,883.0,[Action],1880.0
app_name,32133.0,32094.0,Soundtrack,3.0
title,30085.0,30054.0,Soundtrack,3.0
url,32134.0,32134.0,http://store.steampowered.com/app/761140/Lost_...,1.0
release_date,30068.0,3582.0,2012-10-16,100.0
tags,31972.0,15395.0,"[Casual, Simulation]",1292.0
reviews_url,32133.0,32132.0,http://steamcommunity.com/app/612880/reviews/?...,2.0
specs,31465.0,4649.0,[Single-player],2794.0
price,30757.0,162.0,4.99,4278.0


Verificar duplicados

In [8]:
df_games['hash'] = df_games.apply(hash_fila, axis=1)

In [9]:
df_games[df_games['hash'].duplicated()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,hash


In [10]:
df_games = df_games.drop('hash', axis=1)

Analizar columnas utiles

Las columnas ```title``` y ```app_name``` parecen contener lo mismo, comparamos

In [11]:
df_games[['title','app_name']].isna().sum()

title       2049
app_name       1
dtype: int64

Las columnas ```url```, ```reviewes_url```, ```specs``` contienen información considerada superflua para el análisis, ```publiser``` duplica información, al igual que ```early_access```.

In [12]:
columnas_a_quitar = ['title','url','reviews_url', 'early_access', 'publisher', 'specs']

In [13]:
df_games.drop(columnas_a_quitar, axis=1, inplace=True)

Reorganizamos las columnas a comodidad

In [14]:
df_games = df_games[['id','app_name', 'genres', 'release_date', 'tags', 'price', 'developer']]

Trabajamos con las columnas

1. ```App_Name```: Buscamos nulos

In [15]:
df_games[df_games.app_name.isna()]

Unnamed: 0,id,app_name,genres,release_date,tags,price,developer
90890,317160.0,,"[Action, Indie]",2014-08-26,"[Action, Indie]",,


Buscando en la información duplicada que no usaremos para el análisis, pero la tenemos disponibles, conseguimos el valor en la columna ```title```

In [16]:
df_games.loc[90890, 'app_name'] = 'Duet'
df_games.loc[90890]

id                     317160.0
app_name                   Duet
genres          [Action, Indie]
release_date         2014-08-26
tags            [Action, Indie]
price                      None
developer                  None
Name: 90890, dtype: object

2. ```Id```: Buscamos nulos

In [17]:
df_games[df_games.id.isna()]

Unnamed: 0,id,app_name,genres,release_date,tags,price,developer
119271,,Batman: Arkham City - Game of the Year Edition,"[Action, Adventure]",2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",19.99,"Rocksteady Studios,Feral Interactive (Mac)"


Al igual que hicimos anteriormente encontramos el valor en columnas que duplican información

In [18]:
df_games.loc[119271,'id'] = 200260

Ahora consideramos que es apto resetear el índice

In [19]:
df_games.reset_index(inplace=True)

In [20]:
df_games.drop('index', axis=1, inplace=True)

In [21]:
df_games

Unnamed: 0,id,app_name,genres,release_date,tags,price,developer
0,761140.0,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,Kotoshiro
1,643980.0,Ironbound,"[Free to Play, Indie, RPG, Strategy]",2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,Secret Level SRL
2,670290.0,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]",2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",Free to Play,Poolians.com
3,767400.0,弹炸人2222,"[Action, Adventure, Casual]",2017-12-07,"[Action, Adventure, Casual]",0.99,彼岸领域
4,773570.0,Log Challenge,,,"[Action, Indie, Casual, Sports]",2.99,
...,...,...,...,...,...,...,...
32129,773640.0,Colony On Mars,"[Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.99,"Nikita ""Ghost_RUS"""
32130,733530.0,LOGistICAL: South Africa,"[Casual, Indie, Strategy]",2018-01-04,"[Strategy, Indie, Casual]",4.99,Sacada
32131,610660.0,Russian Roads,"[Indie, Racing, Simulation]",2018-01-04,"[Indie, Simulation, Racing]",1.99,Laush Dmitriy Sergeevich
32132,658870.0,EXIT 2 - Directions,"[Casual, Indie]",2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,"xropi,stev3ns"


Buscamos items duplicados

In [22]:
cuenta_duplicados = df_games.id.value_counts()
id_dups = cuenta_duplicados[cuenta_duplicados.values > 1].keys()

# df_games.loc[df_games['id'].isin(id_dups.to_list())]


In [23]:
to_b = []

for i in id_dups:
  to_b.append(df_games[df_games['id'] == i].id.idxmax())

df_games.drop(to_b, axis=0, inplace=True)

In [24]:
df_games = df_games.set_index(df_games['id'].astype(int))
df_games.id = df_games.id.astype(int)
df_games.rename_axis('index')

Unnamed: 0_level_0,id,app_name,genres,release_date,tags,price,developer
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
761140,761140,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,Kotoshiro
643980,643980,Ironbound,"[Free to Play, Indie, RPG, Strategy]",2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,Secret Level SRL
670290,670290,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]",2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",Free to Play,Poolians.com
767400,767400,弹炸人2222,"[Action, Adventure, Casual]",2017-12-07,"[Action, Adventure, Casual]",0.99,彼岸领域
773570,773570,Log Challenge,,,"[Action, Indie, Casual, Sports]",2.99,
...,...,...,...,...,...,...,...
773640,773640,Colony On Mars,"[Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.99,"Nikita ""Ghost_RUS"""
733530,733530,LOGistICAL: South Africa,"[Casual, Indie, Strategy]",2018-01-04,"[Strategy, Indie, Casual]",4.99,Sacada
610660,610660,Russian Roads,"[Indie, Racing, Simulation]",2018-01-04,"[Indie, Simulation, Racing]",1.99,Laush Dmitriy Sergeevich
658870,658870,EXIT 2 - Directions,"[Casual, Indie]",2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,"xropi,stev3ns"


``` Price ```

In [25]:
def isnumber(x):
    try:
        x = float(x)
        return x
    except:
        return 0

In [26]:
df_games.price = df_games.price.apply(isnumber)

In [27]:
# to_date = lambda x: pd.to_datetime(x, errors='coerce') if pd.notna(x) else pd.to_datetime('1900-01-01')
# to_date = lambda x: pd.to_datetime(x, errors='coerce').fillna(pd.to_datetime('1900-01-01'))
to_date = lambda x: pd.to_datetime(x, errors='coerce')



In [28]:
fechas = df_games['release_date'].apply(to_date)


In [29]:
fechas.unique()

<DatetimeArray>
['2018-01-04 00:00:00', '2017-07-24 00:00:00', '2017-12-07 00:00:00',
                 'NaT', '2018-01-03 00:00:00', '2017-12-22 00:00:00',
 '2017-12-23 00:00:00', '1997-06-30 00:00:00', '1998-11-08 00:00:00',
 '2016-11-25 00:00:00',
 ...
 '1999-11-01 00:00:00', '2001-06-01 00:00:00', '2001-12-01 00:00:00',
 '2002-01-01 00:00:00', '2002-08-28 00:00:00', '2003-11-01 00:00:00',
 '2004-03-16 00:00:00', '2004-03-01 00:00:00', '2016-11-19 00:00:00',
 '2018-10-01 00:00:00']
Length: 3463, dtype: datetime64[ns]

In [30]:
df_games['release_year'] = df_games['release_date'].apply(to_date)

In [31]:
df_games['release_year'] = df_games['release_year'].dt.year.fillna(-1).astype(int)
# df_games['release_year'] = df_games['release_year'].astype(int)

In [32]:
df_games

Unnamed: 0_level_0,id,app_name,genres,release_date,tags,price,developer,release_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
761140,761140,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,Kotoshiro,2018
643980,643980,Ironbound,"[Free to Play, Indie, RPG, Strategy]",2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",0.00,Secret Level SRL,2018
670290,670290,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]",2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",0.00,Poolians.com,2017
767400,767400,弹炸人2222,"[Action, Adventure, Casual]",2017-12-07,"[Action, Adventure, Casual]",0.99,彼岸领域,2017
773570,773570,Log Challenge,,,"[Action, Indie, Casual, Sports]",2.99,,-1
...,...,...,...,...,...,...,...,...
773640,773640,Colony On Mars,"[Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.99,"Nikita ""Ghost_RUS""",2018
733530,733530,LOGistICAL: South Africa,"[Casual, Indie, Strategy]",2018-01-04,"[Strategy, Indie, Casual]",4.99,Sacada,2018
610660,610660,Russian Roads,"[Indie, Racing, Simulation]",2018-01-04,"[Indie, Simulation, Racing]",1.99,Laush Dmitriy Sergeevich,2018
658870,658870,EXIT 2 - Directions,"[Casual, Indie]",2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,"xropi,stev3ns",2017


``` genres ``` y ``` tags ```

In [33]:
df_games['genres'].explode().unique()

array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', None, 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Video Production', 'Utilities',
       'Web Publishing', 'Education', 'Software Training',
       'Design &amp; Illustration', 'Audio Production', 'Photo Editing',
       'Accounting'], dtype=object)

In [34]:
df_games['genres_tags'] = df_games['genres'] + df_games['tags']

In [35]:
df_games['genres_tags']	= df_games['genres_tags'].apply(lambda x: pd.Series(x).drop_duplicates().tolist())

In [36]:
# generos_tags = []
# df_games['genres_tags'].fillna("", inplace=True)
# def to_set(x):
#     try:
#         x = list(set(x))
#     except:
#         x = []
#         pass
#     return x

# df_games['genres_tags'] = df_games['genres_tags'].apply(to_set)

In [37]:
df_games['genres_tags']	

id
761140        [Action, Casual, Indie, Simulation, Strategy]
643980    [Free to Play, Indie, RPG, Strategy, Card Game...
670290    [Casual, Free to Play, Indie, Simulation, Spor...
767400                          [Action, Adventure, Casual]
773570                                                [nan]
                                ...                        
773640                [Casual, Indie, Simulation, Strategy]
733530                            [Casual, Indie, Strategy]
610660                          [Indie, Racing, Simulation]
658870    [Casual, Indie, Puzzle, Singleplayer, Atmosphe...
681550                                                [nan]
Name: genres_tags, Length: 32132, dtype: object

In [38]:
print(df_games.query("id == 761140")['genres_tags'].values)

[list(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy'])]


In [39]:
# df_games.loc[1]

In [40]:
# df_games.loc[1, 'genres_tags']

In [41]:
generos_tags = []

for i in df_games.index:
    lista_gt = df_games.loc[i, 'genres_tags']
    try:
        generos_tags.append([df_games.loc[i, 'id'],*lista_gt])
    except:
        pass       

tuplas = []
for i in generos_tags:
    for ind, j in enumerate(i):
        if ind > 0:
            tupla = [i[0], j]
            tuplas.append(tupla)

In [84]:
df_juego_genero_tag = pd.DataFrame(tuplas, columns=['id_juego', 'genero_tag'])

# Elegimos el nro de generos para el dataset de análisis
nro_generos = 40

recorte = df_juego_genero_tag['genero_tag'].value_counts().reset_index().head(100)

df_juego_genero_tag = df_juego_genero_tag[df_juego_genero_tag.genero_tag.isin(recorte['genero_tag'])]

In [91]:
matriz_dummies = pd.get_dummies(df_juego_genero_tag, dtype=int, prefix='gen').groupby('id_juego').sum()

In [92]:
matriz_dummies

Unnamed: 0_level_0,gen_2D,gen_4 Player Local,gen_Action,gen_Action RPG,gen_Adventure,gen_Anime,gen_Arcade,gen_Atmospheric,gen_Building,gen_Bullet Hell,...,gen_Tower Defense,gen_Turn-Based,gen_Turn-Based Strategy,gen_Utilities,gen_Violent,gen_Visual Novel,gen_Walking Simulator,gen_War,gen_Web Publishing,gen_Zombies
id_juego,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
20,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
40,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028055,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2028056,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2028062,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2028103,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
items_unicos = pd.read_csv('id_items_unicos.csv')
items_unicos.columns = ['index', 'id_juego']

items_unicos

# matriz_dummies = pd.merge(matriz_dummies, items_unicos,left_on='id_juego', right_on='id_juego')

Unnamed: 0,index,id_juego
0,0,1250
1,1,22200
2,2,43110
3,3,251610
4,4,227300
...,...,...
3677,3677,307130
3678,3678,209120
3679,3679,220090
3680,3680,262850


In [45]:
# matriz_dummies_rev.to_csv('matriz_dummies_rev.csv')

In [87]:
# matriz_dummies.index = matriz_dummies.index.astype(int)

to_drop = matriz_dummies.sum()[matriz_dummies.sum() == 0].index.to_list()

matriz_dummies.drop(to_drop, axis=1, inplace=True)

In [88]:
matriz_dummies.rename_axis('index', inplace=True)

In [89]:
matriz_dummies.index = matriz_dummies.id_juego.astype(int)

AttributeError: 'DataFrame' object has no attribute 'id_juego'

In [93]:
matriz_dummies.columns

Index(['gen_2D', 'gen_4 Player Local', 'gen_Action', 'gen_Action RPG',
       'gen_Adventure', 'gen_Anime', 'gen_Arcade', 'gen_Atmospheric',
       'gen_Building', 'gen_Bullet Hell', 'gen_Casual', 'gen_Classic',
       'gen_Co-op', 'gen_Colorful', 'gen_Comedy', 'gen_Controller',
       'gen_Crafting', 'gen_Cute', 'gen_Dark', 'gen_Design & Illustration',
       'gen_Design &amp; Illustration', 'gen_Difficult', 'gen_Early Access',
       'gen_Education', 'gen_Exploration', 'gen_FPS', 'gen_Family Friendly',
       'gen_Fantasy', 'gen_Fast-Paced', 'gen_Female Protagonist',
       'gen_Fighting', 'gen_First-Person', 'gen_Free to Play', 'gen_Funny',
       'gen_Gore', 'gen_Great Soundtrack', 'gen_Hack and Slash',
       'gen_Hidden Object', 'gen_Historical', 'gen_Horror', 'gen_Indie',
       'gen_JRPG', 'gen_Local Co-Op', 'gen_Local Multiplayer',
       'gen_Management', 'gen_Massively Multiplayer', 'gen_Memes',
       'gen_Multiplayer', 'gen_Music', 'gen_Mystery', 'gen_Nudity',
       'gen_

In [50]:
matriz_dummies = matriz_dummies.drop(matriz_dummies.columns[[0, -1]], axis=1)

In [101]:
matriz_dummies.to_csv('matriz_dummies.csv')

In [102]:
matriz_dummies.sample(2)

Unnamed: 0_level_0,gen_2D,gen_4 Player Local,gen_Action,gen_Action RPG,gen_Adventure,gen_Anime,gen_Arcade,gen_Atmospheric,gen_Building,gen_Bullet Hell,...,gen_Tower Defense,gen_Turn-Based,gen_Turn-Based Strategy,gen_Utilities,gen_Violent,gen_Visual Novel,gen_Walking Simulator,gen_War,gen_Web Publishing,gen_Zombies
id_juego,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
326600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
723070,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
from sklearn.metrics.pairwise import cosine_similarity

def comparar(id_1, id_2):
    row1 = matriz_dummies.loc[id_1].values.reshape(1,-1)
    row2 = matriz_dummies.loc[id_2].values.reshape(1,-1)
    return cosine_similarity(row1, row2)    


In [99]:
def get_recommended(id_juego):
    lista = []
    # id_juego = matriz_dummies.sample().index

    for i in matriz_dummies.index.tolist():
        if i != id_juego:
            (a, b) = i, comparar(id_juego, i)
            if 0.5 < b[0][0] <= 1:
                lista.append((a, b[0][0]))
    
    return pd.DataFrame(lista, columns=['id_juego', 'similitud']).sort_values('similitud', ascending=False).head(5)
    

In [100]:
id_juego = 314010

# pd.DataFrame(lista, columns=['id_juego', 'similitud']).sort_values('similitud', ascending=False).head(5)

get_recommended(id_juego)

Unnamed: 0,id_juego,similitud
1905,440740,0.92582
709,319570,0.857143
2098,463250,0.857143
3568,632190,0.845154
2827,547180,0.845154


In [56]:
big = cosine_similarity(matriz_dummies,matriz_dummies)

In [57]:
big[0:5, 0: 5]

array([[1.        , 0.50709255, 0.66742381, 0.71713717, 0.43852901],
       [0.50709255, 1.        , 0.56407607, 0.60609153, 0.59299945],
       [0.66742381, 0.56407607, 1.        , 0.68376346, 0.58536941],
       [0.71713717, 0.60609153, 0.68376346, 1.        , 0.6289709 ],
       [0.43852901, 0.59299945, 0.58536941, 0.6289709 , 1.        ]])

In [58]:
generos_filtrados = df_juego_genero_tag['genero_tag'].value_counts().head(50).reset_index().head(38)['genero_tag'].to_list()

In [59]:
mask = df_juego_genero_tag['genero_tag'].isin(generos_filtrados)

In [60]:
df_juego_genero_tag['genero_tag'][mask]

0               Action
1               Casual
2                Indie
3           Simulation
4             Strategy
              ...     
159282          Casual
159283           Indie
159284          Puzzle
159285    Singleplayer
159286     Atmospheric
Name: genero_tag, Length: 110776, dtype: object

In [61]:
a_clustear = pd.get_dummies(df_juego_genero_tag['genero_tag'][mask], dtype='int')

In [62]:
a_clustear

Unnamed: 0,2D,Action,Adventure,Anime,Arcade,Atmospheric,Casual,Co-op,Difficult,Early Access,...,Retro,Sandbox,Sci-fi,Shooter,Simulation,Singleplayer,Sports,Story Rich,Strategy,Turn-Based
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159282,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [63]:
df_games.query('id == 12500')

Unnamed: 0_level_0,id,app_name,genres,release_date,tags,price,developer,release_year,genres_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12500,12500,PuzzleQuest: Challenge of the Warlords,[Casual],2007-10-10,"[Puzzle, Casual, Match 3, RPG, Fantasy, 2D, St...",9.99,Infinite Interactive,2007,"[Casual, Puzzle, Match 3, RPG, Fantasy, 2D, St..."


In [64]:
from sklearn.cluster import KMeans
import numpy as np

# Assuming you have your data stored in a variable called 'data'

# Create a KMeans instance with 38 clusters
kmeans = KMeans(n_clusters=38)

# Fit the KMeans model to your data
kmeans.fit(a_clustear)

# Get the cluster labels for each data point
cluster_labels = kmeans.labels_

# Get the cluster centers
cluster_centers = kmeans.cluster_centers_

In [65]:
preds = kmeans.predict(a_clustear)

preds.shape

(110776,)

In [66]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [67]:
df_games

Unnamed: 0_level_0,id,app_name,genres,release_date,tags,price,developer,release_year,genres_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
761140,761140,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,Kotoshiro,2018,"[Action, Casual, Indie, Simulation, Strategy]"
643980,643980,Ironbound,"[Free to Play, Indie, RPG, Strategy]",2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",0.00,Secret Level SRL,2018,"[Free to Play, Indie, RPG, Strategy, Card Game..."
670290,670290,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]",2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",0.00,Poolians.com,2017,"[Casual, Free to Play, Indie, Simulation, Spor..."
767400,767400,弹炸人2222,"[Action, Adventure, Casual]",2017-12-07,"[Action, Adventure, Casual]",0.99,彼岸领域,2017,"[Action, Adventure, Casual]"
773570,773570,Log Challenge,,,"[Action, Indie, Casual, Sports]",2.99,,-1,[nan]
...,...,...,...,...,...,...,...,...,...
773640,773640,Colony On Mars,"[Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.99,"Nikita ""Ghost_RUS""",2018,"[Casual, Indie, Simulation, Strategy]"
733530,733530,LOGistICAL: South Africa,"[Casual, Indie, Strategy]",2018-01-04,"[Strategy, Indie, Casual]",4.99,Sacada,2018,"[Casual, Indie, Strategy]"
610660,610660,Russian Roads,"[Indie, Racing, Simulation]",2018-01-04,"[Indie, Simulation, Racing]",1.99,Laush Dmitriy Sergeevich,2018,"[Indie, Racing, Simulation]"
658870,658870,EXIT 2 - Directions,"[Casual, Indie]",2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,"xropi,stev3ns",2017,"[Casual, Indie, Puzzle, Singleplayer, Atmosphe..."


In [68]:
df_recortado =df_games.query("release_year.notnull()")


In [69]:
df_recortado['release_year'] = df_recortado['release_year'].astype(int)

In [70]:
df_recortado.query("id == 12500")

Unnamed: 0_level_0,id,app_name,genres,release_date,tags,price,developer,release_year,genres_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12500,12500,PuzzleQuest: Challenge of the Warlords,[Casual],2007-10-10,"[Puzzle, Casual, Match 3, RPG, Fantasy, 2D, St...",9.99,Infinite Interactive,2007,"[Casual, Puzzle, Match 3, RPG, Fantasy, 2D, St..."


In [71]:
df_recortado.index = df_recortado.index.astype(int)

In [72]:
df_recortado['id'] = df_recortado['id'].astype(int)

In [73]:
df_recortado.drop('id', axis=1)

Unnamed: 0_level_0,app_name,genres,release_date,tags,price,developer,release_year,genres_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
761140,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,Kotoshiro,2018,"[Action, Casual, Indie, Simulation, Strategy]"
643980,Ironbound,"[Free to Play, Indie, RPG, Strategy]",2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",0.00,Secret Level SRL,2018,"[Free to Play, Indie, RPG, Strategy, Card Game..."
670290,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]",2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",0.00,Poolians.com,2017,"[Casual, Free to Play, Indie, Simulation, Spor..."
767400,弹炸人2222,"[Action, Adventure, Casual]",2017-12-07,"[Action, Adventure, Casual]",0.99,彼岸领域,2017,"[Action, Adventure, Casual]"
773570,Log Challenge,,,"[Action, Indie, Casual, Sports]",2.99,,-1,[nan]
...,...,...,...,...,...,...,...,...
773640,Colony On Mars,"[Casual, Indie, Simulation, Strategy]",2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.99,"Nikita ""Ghost_RUS""",2018,"[Casual, Indie, Simulation, Strategy]"
733530,LOGistICAL: South Africa,"[Casual, Indie, Strategy]",2018-01-04,"[Strategy, Indie, Casual]",4.99,Sacada,2018,"[Casual, Indie, Strategy]"
610660,Russian Roads,"[Indie, Racing, Simulation]",2018-01-04,"[Indie, Simulation, Racing]",1.99,Laush Dmitriy Sergeevich,2018,"[Indie, Racing, Simulation]"
658870,EXIT 2 - Directions,"[Casual, Indie]",2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,"xropi,stev3ns",2017,"[Casual, Indie, Puzzle, Singleplayer, Atmosphe..."


In [74]:
items_unicos = pd.read_csv('id_items_unicos.csv')
items_unicos.columns = ['index', 'id_juego']

items_unicos

Unnamed: 0,index,id_juego
0,0,1250
1,1,22200
2,2,43110
3,3,251610
4,4,227300
...,...,...
3677,3677,307130
3678,3678,209120
3679,3679,220090
3680,3680,262850


In [75]:
df_recortado_2 = pd.merge(items_unicos, df_recortado.drop('id', axis=1), left_on='id_juego', right_on='id')

In [82]:
df_recortado_2.query("app_name.str.contains('boid')")


Unnamed: 0,index,id_juego,app_name,genres,release_date,tags,price,developer,release_year,genres_tags
281,318,108600,Project Zomboid,"[Indie, RPG, Simulation, Early Access]",2013-11-08,"[Early Access, Survival, Zombies, Open World, ...",14.99,The Indie Stone,2013,"[Indie, RPG, Simulation, Early Access, Surviva..."


In [76]:
# lista_comp = []
# for i in matriz_dummies.index:
#     temp = []
#     for j in matriz_dummies.index:
#         temp.append(comparar(i,j))
#     lista_comp.append(temp)


In [77]:
# matriz = pd.DataFrame(lista_comp)

# matriz.to_csv('matrizon.csv')