In [1]:
# Librerias necesarias
import pandas as pd
import numpy as np
import json
import ast
import gzip

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Ruta de los datos:
ruta_games= 'Datasets originales/steam_games.json.gz'
ruta_user_reviews= 'Datasets originales/user_reviews.json.gz'
ruta_user_items= 'Datasets originales/users_items.json.gz'

#### Games

In [3]:
## Games:
games_orig = []
with gzip.open(ruta_games, 'rt', encoding='utf-8') as archivo_comprimido:
    for linea in archivo_comprimido:
        diccionario = json.loads(linea)
        games_orig.append(diccionario)

# Transformamos a DataFrame:
games_orig = pd.DataFrame(games_orig)

(120445, 13)

In [77]:
# Copia de los datos para no estar constantemente cargandolos para resetearlos
games = games_orig.copy()

In [None]:
games.tail()

Las transformaciones que realizaremos en este caso serán: 

1. Eliminar registros completamente vacíos
2. Eliminar columnas que no utilizaremos: ``publisher``, ``url``, ``reviews_url``, ``price``, ``early_access``, ``developer``. Eliminar la columna ``tags`` previo a rellenar los datos faltantes de genres con sus valores. ``app_name`` y ``title`` dicen lo mismo, dejaremos la columna que tenga menor cantidad de datos faltantes.
3. Desanidar registros que poseen valores con tipo de dato LISTA.
4. Eliminar registros duplicados.
5. Corregir el tipo de dato de cada columna.
6. Eliminar registros que poseen datos vacíos en columnas importantes

In [79]:
# 1. Eliminar registros completamente vacíos
print(f'tamaño inicial del dataframe: {len(games)} registros')
games.dropna(how='all', inplace= True, ignore_index=True)
print(f'tamaño final del dataframe: {len(games)} registros ')

tamaño inicial del dataframe: 120445 registros
tamaño final del dataframe: 32135 registros 


In [80]:
# 2. Eliminar columnas que no utilizaremos: ``publisher``, ``url``, ``reviews_url``, ``price``, ``early_access``, ``developer``.
# Eliminar la columna ``tags`` previo a rellenar los datos faltantes de ``genres`` con sus valores. 
# ``app_name`` y ``title`` dicen lo mismo, dejaremos "app_name" que tiene menos datos faltantes y le cambiaremos el nombre a "title"

print(f"El tamaño inicial de games era {games.shape}")

games['genres'] = games['genres'].combine_first(games['tags']) # a cada valor faltante de genres le hara corresponder lo que figure en tags
games['title']= games['app_name']
games.drop(columns=['publisher', 'url', 'reviews_url', 'price', 'early_access', 'developer', 'tags', 'app_name'], inplace=True)

print("Columnas sin utilidad eliminadas")
print(f"Ahora, el tamaño final es: {games.shape}")


El tamaño inicial de games era (32135, 13)
Columnas sin utilidad eliminadas
Ahora, el tamaño final es: (32135, 5)


In [88]:
games.sample()

Unnamed: 0,genres,title,release_date,specs,id
4769,[Simulation],FSX: Steam Edition - Embraer E-Jets 175 & 195 ...,2015-11-10,"[Single-player, Multi-player, Downloadable Con...",364324


In [89]:
# 3. Desanidar registros que poseen valores con tipo de dato LISTA: genres y specs.
games = games.explode(column=['genres'], ignore_index=True)
games = games.explode(column=['specs'], ignore_index=True)
games.shape

(408844, 5)

In [90]:
# 4. Eliminar registros duplicados.
games.drop_duplicates(inplace=True)
games.shape

(408840, 5)

In [91]:
# 5. Corregir el tipo de dato de cada columna.
    # genres OK
    # title OK
    # release_date 83245 NaN
    # specs OK
    #

<class 'pandas.core.frame.DataFrame'>
Index: 408840 entries, 0 to 408843
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   genres        408641 non-null  object
 1   title         408835 non-null  object
 2   release_date  325595 non-null  object
 3   specs         407194 non-null  object
 4   id            408829 non-null  object
dtypes: object(5)
memory usage: 18.7+ MB


In [96]:
games['release_date'] = pd.to_datetime(games['release_date'],  )


ValueError: time data "Soon.." doesn't match format "%Y-%m-%d", at position 4. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

#### Reviews

In [4]:
## Reviews:
user_reviews = []
with gzip.open(ruta_user_reviews, 'rt', encoding="utf-8") as archivo_comprimido:
    for linea in archivo_comprimido:
        user_reviews.append(ast.literal_eval(linea))
    
# Transformamos a DataFrame:
user_reviews_orig = pd.DataFrame(user_reviews)


(25799, 3)

In [None]:
# Copia de los datos para no estar constantemente cargandolos para resetearlos
user_reviews = user_reviews_orig.copy()

In [None]:
# Extraigo solo las columnas que voy a usar

reviews = user_reviews[['user_id','reviews']]
reviews.head()

Unnamed: 0,user_id,reviews
0,76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,"[{'funny': '3 people found this review funny',..."


In [None]:
# Desanidamos reviews:

reviews_explode = reviews.explode('reviews', ignore_index=True) # separo en filas todos todos los json que estan listados en cada registro
reviews_desanidado = pd.json_normalize(reviews_explode['reviews']) # transformo a tabla cada uno de los json que tenemos en cada registro
reviews = pd.concat([reviews_explode,reviews_desanidado], axis=1).drop(columns=['reviews']) # concateno ambos dataframes y dropeo la columna reviews que está anidada

#### Items

In [5]:
## Items:
user_items = []
with gzip.open(ruta_user_items, 'rt', encoding="utf-8") as archivo_comprimido:
    for linea in archivo_comprimido:
        user_items.append(ast.literal_eval(linea))
    
# Transformamos a DataFrame:
user_items_orig = pd.DataFrame(user_items)

(88310, 5)

In [None]:
# Copia de los datos para no estar constantemente cargandolos para resetearlos
user_items = user_items_orig.copy()

In [27]:
# Extraigo solo las columnas que voy a usar:
items = user_items[['user_id','items']]
items.head()

Unnamed: 0,user_id,items
0,76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [28]:
# Desanidamos items:

items_explode = items.explode('items', ignore_index=True) # separo en filas todos todos los json que estan listados en cada registro
items_desanidado = pd.json_normalize(items_explode['items']) # transformo a tabla cada uno de los json que tenemos en cada registro
items = pd.concat([items_explode,items_desanidado], axis=1).drop(columns=['items']) # concateno ambos dataframes y dropeo la columna items que está anidada

Ahora tenemos 3 tablas con las que trabajaremos: `games`, `reviews` e `items`

#### Transformaciones generales

In [54]:
games['title'].info()
games['app_name'].info()
# Son columnas similares. En este caso la columna que voy a dejar es app_name porque posee menor cantidad de datos vacios: 2.

<class 'pandas.core.series.Series'>
Index: 32135 entries, 88310 to 120444
Series name: title
Non-Null Count  Dtype 
--------------  ----- 
30085 non-null  object
dtypes: object(1)
memory usage: 502.1+ KB
<class 'pandas.core.series.Series'>
Index: 32135 entries, 88310 to 120444
Series name: app_name
Non-Null Count  Dtype 
--------------  ----- 
32133 non-null  object
dtypes: object(1)
memory usage: 502.1+ KB


In [63]:
# Prueba para rellenar valores faltantes de genres
prueba = games[['genres','tags']].loc[88313:88320]
prueba['genres'] = prueba['genres'].combine_first(prueba['tags'])
prueba

Unnamed: 0,genres,tags
88313,"[Action, Adventure, Casual]","[Action, Adventure, Casual]"
88314,"[Action, Indie, Casual, Sports]","[Action, Indie, Casual, Sports]"
88315,"[Action, Adventure, Simulation]","[Action, Adventure, Simulation, FPS, Shooter, ..."
88316,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]"
88317,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]"
88318,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]"
88319,"[Casual, Indie, Racing, Simulation]","[Indie, Casual, Simulation, Racing]"
88320,"[Action, Indie, Simulation, Early Access]","[Early Access, Action, Indie, Simulation, Surv..."


In [49]:
games['genres'] = games['genres'].combine_first(games['tags']) # a cada valor faltante de genres le hara corresponder lo que figure en tags
games.drop(columns=['publisher'])

<class 'pandas.core.series.Series'>
Index: 32135 entries, 88310 to 120444
Series name: title
Non-Null Count  Dtype 
--------------  ----- 
30085 non-null  object
dtypes: object(1)
memory usage: 502.1+ KB
<class 'pandas.core.series.Series'>
Index: 32135 entries, 88310 to 120444
Series name: app_name
Non-Null Count  Dtype 
--------------  ----- 
32133 non-null  object
dtypes: object(1)
memory usage: 502.1+ KB


In [40]:
# Tratamiento de registros vacíos
print(f"El tamaño inicial del dataset games es: \n{games.shape}")
print(f"El tamaño inicial del dataset reviews es: \n{reviews.shape}")
print(f"El tamaño inicial del dataset items es: \n{items.shape}")

games.dropna(how='all', inplace=True)
reviews.dropna(how='all', inplace=True)
items.dropna(how='all', inplace=True)

print(f"Removiendo los registros completamente vacíos nos quedaran con un tamaño de \n games: {games.shape}")
print(f"Removiendo los registros completamente vacíos nos quedaran con un tamaño de \n reviews: {reviews.shape}")
print(f"Removiendo los registros completamente vacíos nos quedaran con un tamaño de \n items: {items.shape}")

El tamaño inicial del dataset games es: 
(32135, 13)
El tamaño inicial del dataset reviews es: 
(59333, 8)
El tamaño inicial del dataset items es: 
(5170015, 5)
Removiendo los registros completamente vacíos nos quedaran con un tamaño de 
 games: (32135, 13)
Removiendo los registros completamente vacíos nos quedaran con un tamaño de 
 reviews: (59333, 8)
Removiendo los registros completamente vacíos nos quedaran con un tamaño de 
 items: (5170015, 5)


In [45]:
# Tratamiento de registros duplicados
print(f"El tamaño del dataset games previo a realizar limpieza de registros duplicados es: {games.shape}")
print(f"El tamaño del dataset reviews previo a realizar limpieza de registros duplicados es: {reviews.shape}")
print(f"El tamaño del dataset items previo a realizar limpieza de registros duplicados es: {items.shape}")

games.drop_duplicates(subset=['publisher', 'app_name', 'title', 'url', 'release_date', 'reviews_url', 'price', 'early_access', 'id','developer'])
reviews.drop_duplicates()
items.drop_duplicates()

print(f"El tamaño del dataset games luego de realizar limpieza de registros duplicados es: {games.shape}")
print(f"El tamaño del dataset reviews luego de realizar limpieza de registros duplicados es: {reviews.shape}")
print(f"El tamaño del dataset items luego de realizar limpieza de registros duplicados es: {items.shape}")

El tamaño del dataset games previo a realizar limpieza de registros duplicados es: (32135, 13)
El tamaño del dataset reviews previo a realizar limpieza de registros duplicados es: (59333, 8)
El tamaño del dataset items previo a realizar limpieza de registros duplicados es: (5170015, 5)
El tamaño del dataset games luego de realizar limpieza de registros duplicados es: (32135, 13)
El tamaño del dataset reviews luego de realizar limpieza de registros duplicados es: (59333, 8)
El tamaño del dataset items luego de realizar limpieza de registros duplicados es: (5170015, 5)


#### Las tablas que finalmente utilizaremos son: ***games***, ***reviews*** e ***items***

#### 3. Carga o disponibilización de datos

#### 3.1 PlayTimeGenre:  Debe devolver año con mas horas jugadas para dicho género.
Ejemplo de retorno: {"Año de lanzamiento con más horas jugadas para Género X" : 2013}

In [28]:
games.tail()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"
120444,,,Maze Run VR,,http://store.steampowered.com/app/681550/Maze_...,,"[Early Access, Adventure, Indie, Action, Simul...",http://steamcommunity.com/app/681550/reviews/?...,"[Single-player, Stats, Steam Leaderboards, HTC...",4.99,True,681550,


In [29]:
reviews.head()

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [30]:
items.head()

Unnamed: 0,user_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,10,Counter-Strike,6.0,0.0
1,76561197970982479,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,30,Day of Defeat,7.0,0.0
3,76561197970982479,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,50,Half-Life: Opposing Force,0.0,0.0


#### 3.2 UsersForGenre: Debe devolver el usuario que acumula más horas jugadas para el género dado y una lista de la acumulación de horas jugadas por año.
Ejemplo de retorno: {"Usuario con más horas jugadas para Género X" : us213ndjss09sdf, "Horas jugadas":[{Año: 2013, Horas: 203}, {Año: 2012, Horas: 100}, {Año: 2011, Horas: 23}]}

In [35]:
items[['user_id','item_id','playtime_forever']].head()

Unnamed: 0,user_id,item_id,playtime_forever
0,76561197970982479,10,6.0
1,76561197970982479,20,0.0
2,76561197970982479,30,7.0
3,76561197970982479,40,0.0
4,76561197970982479,50,0.0
