In [1]:
import pandas as pd
import json
import pyarrow
import ast

In [2]:
# Accumulates one parsed record per non-blank line of the input file.
data_list = []

# Path of the raw users/items dump.
file_path = 'DB Steam/users_items.json'

# Read the file line by line and parse each line on its own.
# ast.literal_eval is used rather than json.loads — presumably because the
# lines are Python-literal dicts (single quotes) and not strict JSON; confirm
# against the data file.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line holds no record; literal_eval would raise SyntaxError on it.
        if not line.strip():
            continue
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError, TypeError):
            # literal_eval signals malformed input with any of these exception
            # types; report the offending line and keep loading the rest.
            print(f"Error en la línea: {line}")
            continue
        data_list.append(json_data)

# Build the DataFrame once from the full list of parsed records.
dfItems = pd.DataFrame(data_list)

In [3]:
# Preview the first 10 rows of the loaded frame.
dfItems.head(10)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
5,MinxIsBetterThanPotatoes,371,76561198004744620,http://steamcommunity.com/id/MinxIsBetterThanP...,"[{'item_id': '50', 'item_name': 'Half-Life: Op..."
6,NitemarePK,304,76561197990951820,http://steamcommunity.com/id/NitemarePK,"[{'item_id': '240', 'item_name': 'Counter-Stri..."
7,themanwich,258,76561198035296505,http://steamcommunity.com/id/themanwich,"[{'item_id': '220', 'item_name': 'Half-Life 2'..."
8,maplemage,629,76561198026584251,http://steamcommunity.com/id/maplemage,"[{'item_id': '240', 'item_name': 'Counter-Stri..."
9,Wackky,0,76561198039117046,http://steamcommunity.com/id/Wackky,[]


In [4]:
# Print the (rows, columns) shape of the loaded frame.
print(dfItems.shape)

(88310, 5)


In [5]:
# Copy the per-user 'items_count' values (together with 'user_id') into their
# own DataFrame, which is later written out as parquet.
count_columns = ['user_id', 'items_count']
dfItems_count = dfItems[count_columns]

In [6]:
# Display the user_id / items_count subset to check the selection.
dfItems_count

Unnamed: 0,user_id,items_count
0,76561197970982479,277
1,js41637,888
2,evcentric,137
3,Riot-Punch,328
4,doctr,541
...,...,...
88305,76561198323066619,22
88306,76561198326700687,177
88307,XxLaughingJackClown77xX,0
88308,76561198329548331,7


In [7]:
# Persist the item counts as parquet; the positional index is not written.
items_count_path = 'DB Steam/items_count.parquet'
dfItems_count.to_parquet(items_count_path, index=False)

In [8]:
# Remove the columns that are no longer needed on the main frame before it is
# also written to parquet.
# drop(..., errors='ignore') returns a new frame and tolerates columns that are
# already gone, so re-running this cell does not raise KeyError the way
# repeated `del dfItems[...]` statements do.
dfItems = dfItems.drop(
    columns=['steam_id', 'items_count', 'user_url'],
    errors='ignore',
)

In [9]:
# Display the trimmed frame (user_id and items only).
dfItems

Unnamed: 0,user_id,items
0,76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...
88305,76561198323066619,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,[]
88308,76561198329548331,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [10]:
# Persist the trimmed frame as parquet; the positional index is not written.
items_path = 'DB Steam/items.parquet'
dfItems.to_parquet(items_path, index=False)

In [11]:
# Un-nest the 'items' column: each user's cell holds a list of per-item dicts.
# explode() emits one row per list element (repeating user_id), which is why it
# fits this step; the index is then renumbered from 0.
dfItems_sep = (
    dfItems
    .explode('items')
    .reset_index(drop=True)
)

In [12]:
# The frame now has one row per item rather than one row per user.
dfItems_sep


Unnamed: 0,user_id,items
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike..."
1,76561197970982479,"{'item_id': '20', 'item_name': 'Team Fortress ..."
2,76561197970982479,"{'item_id': '30', 'item_name': 'Day of Defeat'..."
3,76561197970982479,"{'item_id': '40', 'item_name': 'Deathmatch Cla..."
4,76561197970982479,"{'item_id': '50', 'item_name': 'Half-Life: Opp..."
...,...,...
5170010,76561198329548331,"{'item_id': '373330', 'item_name': 'All Is Dus..."
5170011,76561198329548331,"{'item_id': '388490', 'item_name': 'One Way To..."
5170012,76561198329548331,"{'item_id': '521570', 'item_name': 'You Have 1..."
5170013,76561198329548331,"{'item_id': '519140', 'item_name': 'Minds Eyes..."


In [13]:
 # Pull 'item_id' and 'playtime_forever' out of each entry in the 'items' column;
# these are the only two fields used from it.
def _item_field(key):
    """Return a function that reads `key` from a dict entry, or None otherwise.

    Entries that are not dicts, and dicts lacking `key`, both map to None
    (dict.get already returns None for a missing key).
    """
    def extract(entry):
        if not isinstance(entry, dict):
            return None
        return entry.get(key)
    return extract


item_entries = dfItems_sep['items']
dfItems_sep['item_id'] = item_entries.apply(_item_field('item_id'))

raw_playtime = item_entries.apply(_item_field('playtime_forever'))
# Scale by 1/60 and keep two decimals — presumably minutes -> hours; confirm
# the unit of 'playtime_forever' against the data source.
dfItems_sep['playtime'] = (raw_playtime / 60).round(2)

In [14]:
# Display the frame with the new item_id and playtime columns.
dfItems_sep

Unnamed: 0,user_id,items,item_id,playtime
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.10
1,76561197970982479,"{'item_id': '20', 'item_name': 'Team Fortress ...",20,0.00
2,76561197970982479,"{'item_id': '30', 'item_name': 'Day of Defeat'...",30,0.12
3,76561197970982479,"{'item_id': '40', 'item_name': 'Deathmatch Cla...",40,0.00
4,76561197970982479,"{'item_id': '50', 'item_name': 'Half-Life: Opp...",50,0.00
...,...,...,...,...
5170010,76561198329548331,"{'item_id': '373330', 'item_name': 'All Is Dus...",373330,0.00
5170011,76561198329548331,"{'item_id': '388490', 'item_name': 'One Way To...",388490,0.05
5170012,76561198329548331,"{'item_id': '521570', 'item_name': 'You Have 1...",521570,0.07
5170013,76561198329548331,"{'item_id': '519140', 'item_name': 'Minds Eyes...",519140,0.05


In [15]:
# The needed fields have already been extracted from 'items', so the column is
# removed. drop(..., errors='ignore') makes the cell safe to re-run: a second
# `del dfItems_sep['items']` would raise KeyError because the column is gone.
dfItems_sep = dfItems_sep.drop(columns=['items'], errors='ignore')

In [16]:
# Display the final frame (user_id, item_id, playtime) before saving it.
dfItems_sep

Unnamed: 0,user_id,item_id,playtime
0,76561197970982479,10,0.10
1,76561197970982479,20,0.00
2,76561197970982479,30,0.12
3,76561197970982479,40,0.00
4,76561197970982479,50,0.00
...,...,...,...
5170010,76561198329548331,373330,0.00
5170011,76561198329548331,388490,0.05
5170012,76561198329548331,521570,0.07
5170013,76561198329548331,519140,0.05


In [17]:
# After the visual check above, store the final frame as parquet for more
# efficient use of resources; the positional index is not written.
items_sep_path = 'DB Steam/items_sep.parquet'
dfItems_sep.to_parquet(items_sep_path, index=False)