In [21]:
import json
import pandas as pd

# Location of the file that holds one JSON document per line
ruta_archivo = '../PI_ML_OPS_data util/output_steam_games.json'

# Collects every line that decodes successfully
lista_objetos_json = []

# Walk the file line by line; a line that is not valid JSON is reported
# and skipped instead of aborting the whole load
with open(ruta_archivo, mode='r', encoding="utf-8") as archivo:
    for linea in archivo:
        try:
            registro = json.loads(linea)
        except json.JSONDecodeError as e:
            print(f"Error al decodificar el JSON en la línea: {linea.strip()}. {e}")
        else:
            lista_objetos_json.append(registro)

# Build a pandas DataFrame from the decoded objects for easier inspection
data = pd.DataFrame(lista_objetos_json)




In [22]:
# Drop only the rows in which every single column is NaN;
# rows with at least one value are kept
data_cleaned = data.dropna(axis='index', how='all')

# Print the first records to check that the file was read correctly
print(data_cleaned.head(n=5))

              publisher                                             genres  \
88310         Kotoshiro      [Action, Casual, Indie, Simulation, Strategy]   
88311  Making Fun, Inc.               [Free to Play, Indie, RPG, Strategy]   
88312      Poolians.com  [Casual, Free to Play, Indie, Simulation, Sports]   
88313              彼岸领域                        [Action, Adventure, Casual]   
88314               NaN                                                NaN   

                      app_name                    title  \
88310      Lost Summoner Kitty      Lost Summoner Kitty   
88311                Ironbound                Ironbound   
88312  Real Pool 3D - Poolians  Real Pool 3D - Poolians   
88313                  弹炸人2222                  弹炸人2222   
88314            Log Challenge                      NaN   

                                                     url release_date  \
88310  http://store.steampowered.com/app/761140/Lost_...   2018-01-04   
88311  http://store.steampower

In [23]:
# Per-column count of missing values in the raw frame (before any cleaning)
print(data.isnull().sum())

publisher       96362
genres          91593
app_name        88312
title           90360
url             88310
release_date    90377
tags            88473
reviews_url     88312
specs           88980
price           89687
early_access    88310
id              88312
developer       91609
dtype: int64


In [24]:
# Per-column count of missing values once the all-NaN rows have been dropped
print(data_cleaned.isnull().sum())

publisher       8052
genres          3283
app_name           2
title           2050
url                0
release_date    2067
tags             163
reviews_url        2
specs            670
price           1377
early_access       0
id                 2
developer       3299
dtype: int64


In [25]:
# Columns that, per the data-dictionary review, the future endpoints will not need
columns_to_drop = [
    'publisher',
    'url',
    'reviews_url',
    'early_access',
]

data_cleaned = data_cleaned.drop(labels=columns_to_drop, axis='columns')


In [26]:
# Show the current composition of the DataFrame (columns, non-null counts, dtypes)
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32135 entries, 88310 to 120444
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28852 non-null  object
 1   app_name      32133 non-null  object
 2   title         30085 non-null  object
 3   release_date  30068 non-null  object
 4   tags          31972 non-null  object
 5   specs         31465 non-null  object
 6   price         30758 non-null  object
 7   id            32133 non-null  object
 8   developer     28836 non-null  object
dtypes: object(9)
memory usage: 2.5+ MB


In [30]:
# Drop the rows that have a null in 'app_name' or in 'release_date'
data_cleaned = data_cleaned.dropna(subset=['app_name', 'release_date'])

# Check the result. DataFrame.info() writes its summary to stdout itself and
# returns None, so it must not be wrapped in print() (that printed a stray "None").
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30067 entries, 88310 to 120443
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28833 non-null  object
 1   app_name      30067 non-null  object
 2   title         30067 non-null  object
 3   release_date  30067 non-null  object
 4   tags          29906 non-null  object
 5   specs         29398 non-null  object
 6   price         28821 non-null  object
 7   id            30066 non-null  object
 8   developer     28818 non-null  object
dtypes: object(9)
memory usage: 2.3+ MB
None


In [32]:
# Print the dtype pandas assigned to the 'price' column
print(data_cleaned.dtypes['price'])

object


In [33]:
# Number of rows whose 'price' value equals 0
count_zeros = data_cleaned['price'].eq(0).sum()

# Report the count
print(f"Número de filas con 'price' igual a 0: {count_zeros}")

Número de filas con 'price' igual a 0: 0


In [37]:
# Labels that mean "this game costs nothing". They are compared in lower case
# because the raw data spells them with inconsistent capitalisation
# (e.g. 'Free To Play' and 'Free to Play'); an exact, case-sensitive match
# missed those rows and they ended up as NaN instead of 0.
free_labels = ['free to play', 'free to use']

# Case- and whitespace-insensitive view of the column, used only to build the mask
price_text = data_cleaned['price'].astype(str).str.strip().str.lower()
is_free = price_text.isin(free_labels)

# Replace the free labels with 0 and convert the column to a numeric dtype.
# Any other non-numeric label is still coerced to NaN by errors='coerce'.
# The column is replaced via assign() rather than `.loc[:, 'price'] = ...`,
# which on recent pandas versions may write in place and keep the object dtype.
data_cleaned = data_cleaned.assign(
    price=pd.to_numeric(data_cleaned['price'].mask(is_free, 0), errors='coerce')
)

# Print the column dtypes after the conversion
print(data_cleaned.dtypes)


genres           object
app_name         object
title            object
release_date     object
tags             object
specs            object
price           float64
id               object
developer        object
dtype: object


In [31]:
# Display the cleaned DataFrame as the cell output for a visual check
data_cleaned

Unnamed: 0,genres,app_name,title,release_date,tags,specs,price,id,developer
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],4.99,761140,Kotoshiro
88311,"[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",Free To Play,643980,Secret Level SRL
88312,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",Free to Play,670290,Poolians.com
88313,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,2017-12-07,"[Action, Adventure, Casual]",[Single-player],0.99,767400,彼岸领域
88315,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,2018-01-04,"[Action, Adventure, Simulation, FPS, Shooter, ...","[Single-player, Steam Achievements]",3.99,772540,Trickjump Games Ltd
...,...,...,...,...,...,...,...,...,...
120439,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,2018-01-04,"[Action, Indie, Casual, Violent, Adventure]","[Single-player, Steam Achievements, Steam Cloud]",1.99,745400,Bidoniera Games
120440,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,2018-01-04,"[Strategy, Indie, Casual, Simulation]","[Single-player, Steam Achievements]",1.99,773640,"Nikita ""Ghost_RUS"""
120441,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,2018-01-04,"[Strategy, Indie, Casual]","[Single-player, Steam Achievements, Steam Clou...",4.99,733530,Sacada
120442,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,2018-01-04,"[Indie, Simulation, Racing]","[Single-player, Steam Achievements, Steam Trad...",1.99,610660,Laush Dmitriy Sergeevich


In [38]:
# Boolean mask: True for every row whose 'title' already appeared in an earlier row
duplicates = data_cleaned['title'].duplicated(keep='first')

# Print the rows flagged as repeated titles
duplicated_rows = data_cleaned.loc[duplicates]
print(duplicated_rows)

                                          genres  \
89819                 [Adventure, Casual, Indie]   
100232  [Indie, Massively Multiplayer, Strategy]   
101963                [Action, Adventure, Indie]   
102883                                  [Action]   
105108                     [Adventure, Strategy]   
106374   [Action, Adventure, Casual, Indie, RPG]   
107374                                  [Casual]   
108615                      [Casual, Simulation]   
108661                                   [Indie]   
109299          [Adventure, Free to Play, Indie]   
109300                 [Adventure, Free to Play]   
109458                                  [Casual]   
110210                              [Simulation]   
111127                                       NaN   
111706                                       NaN   
112212          [Adventure, Free to Play, Indie]   
112716                [Action, Adventure, Indie]   
113877                           [Action, Indie]   
113955      

In [40]:
# Print the 'title' column (pandas truncates the middle of long Series)
print(data_cleaned.loc[:, 'title'])

88310          Lost Summoner Kitty
88311                    Ironbound
88312      Real Pool 3D - Poolians
88313                      弹炸人2222
88315        Battle Royale Trainer
                    ...           
120439                Kebab it Up!
120440              Colony On Mars
120441    LOGistICAL: South Africa
120442               Russian Roads
120443         EXIT 2 - Directions
Name: title, Length: 30067, dtype: object


In [45]:
# Inspect every row that shares the title 'Black Rose'
is_black_rose = data_cleaned['title'].eq('Black Rose')
data_duplicated = data_cleaned.loc[is_black_rose]
print(data_duplicated)

                                  genres    app_name       title release_date  \
111208                          [Action]  Black Rose  Black Rose   2016-06-02   
112212  [Adventure, Free to Play, Indie]  Black Rose  Black Rose   2016-03-12   

                                                     tags  \
111208                                           [Action]   
112212  [Free to Play, Horror, Indie, Adventure, Survi...   

                                                    specs  price      id  \
111208  [Single-player, Downloadable Content, Steam Ac...   0.99  464510   
112212        [Single-player, Partial Controller Support]    NaN  453890   

                     developer  
111208                 TAMSOFT  
112212  Sir Bedlam Productions  


In [17]:
# Persist the cleaned DataFrame as CSV, without writing the index column
data_cleaned.to_csv(
    path_or_buf='../PI_ML_OPS_data util/steam_games.csv',
    index=False,
)

In [24]:
# True when at least one 'id' value occurs in more than one row
data_cleaned.duplicated(subset=['id']).any()

True

In [32]:
# Look up the Python type stored in 'release_date' for the row labelled 120443
valor_fecha = data_cleaned.loc[120443, 'release_date']
tipo_dato = type(valor_fecha)
print("Tipo de dato:", tipo_dato)

Tipo de dato: <class 'str'>


In [18]:
# Select the identifying columns of the row(s) whose 'id' is the string '10'
filtered_data = data_cleaned.loc[
    data_cleaned['id'] == '10',
    ['id', 'app_name', 'title'],
]
print(filtered_data)


        id        app_name           title
120416  10  Counter-Strike  Counter-Strike
