# PROCESO ETL PARA (steam_games, user_reviews, user_items)

#### Cargamos las librerías necesarias

In [11]:
import pandas as pd
import gzip
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#### Función que separa el año del campo 'release_date'

In [12]:
def separar_anio(fecha):
    """Replace the 'release_date' column of *fecha* with the release year.

    Values are parsed as dates; anything that cannot be parsed becomes
    missing and is then filled with the most frequent parsed date (the mode)
    before the year is extracted.

    Args:
        fecha: DataFrame with a 'release_date' column. It is modified in
            place.

    Returns:
        The same DataFrame object, with 'release_date' holding the year as a
        nullable 'Int64' column.
    """
    # Parse once; unparseable values become NaT instead of raising.
    fechas = pd.to_datetime(fecha["release_date"], errors="coerce")

    # Fill the gaps with the most common date. mode() is empty when nothing
    # could be parsed; in that case the years are simply left as <NA>.
    moda = fechas.mode()
    if not moda.empty:
        fechas = fechas.fillna(moda.iloc[0])

    # Assign through the frame itself (not a chained inplace fillna) so the
    # update can never land on a temporary copy of the column.
    fecha["release_date"] = fechas.dt.year.astype("Int64")
    return fecha

'\ndfgames = dfgames_with_dummies\n# Reemplazar los valores no válidos por NaN / Replace invalid values \u200b\u200bwith NaN\ndfgames["release_date"] = pd.to_datetime(dfgames["release_date"], errors=\'coerce\')\n# Filtrar el DataFrame para obtener los registros válidos / Filter the DataFrame to get valid records\ndf_filtered = dfgames.dropna(subset=["release_date"])\n# Convertir la columna "release_date" a tipo datetime / Convert "release_date" column to datetime type\ndf_filtered["release_date"] = pd.to_datetime(df_filtered["release_date"])\n# Rellenando los valores faltantes en las columna\'release_date\' con la moda / Filling missing values \u200b\u200bin \'release_date\' columns with mode\ncolumns_fill_mode = [\'release_date\']\nfor col in columns_fill_mode:\n    dfgames[col].fillna(dfgames[col].mode()[0], inplace=True)\n# Modelado y eliminación de la columna \'release_date\' / Modeling and removing \'release_date\' column\ndfgames[\'year\'] = dfgames[\'release_date\'].dt.year.asty

#### Función que hace el análisis de sentimiento según la escala

In [None]:
def _analizador_vader():
    """Return a shared VADER analyzer, creating it on first use."""
    if not hasattr(_analizador_vader, "_instancia"):
        _analizador_vader._instancia = SentimentIntensityAnalyzer()
    return _analizador_vader._instancia


def get_sentiment_score(text, analyzer=None):
    """Classify a review text as negative (0), neutral (1) or positive (2).

    Args:
        text: The review. Anything that is not a non-empty string (None, NaN,
            numbers, '') is treated as neutral.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a dict with a 'compound' entry. When omitted, a shared
            ``SentimentIntensityAnalyzer`` is used.

    Returns:
        0 when the compound score is <= -0.05, 2 when it is >= 0.05, and 1
        otherwise.
    """
    # Missing, empty and non-string values carry no sentiment: neutral.
    if not isinstance(text, str) or text == '':
        return 1

    # Reuse one analyzer across calls instead of building one per review.
    sia = analyzer if analyzer is not None else _analizador_vader()
    compound_score = sia.polarity_scores(text)['compound']

    # The positive cut-off is +0.05. Comparing against -0.05 here would label
    # every neutral text as positive and make the neutral branch unreachable.
    if compound_score >= 0.05:
        return 2                            # Good score
    if compound_score <= -0.05:
        return 0                            # Bad score
    return 1                                # Neutral

## ETL para 'steam_games'

#### Cargamos el archivo json en un dataframe

In [13]:
# Load the Steam games dataset as line-delimited JSON. The cells below use
# `df`, so one of the two loads has to be active.
# Alternative location of the same load:
#df = pd.read_json('../Datasets_originales/steam_games.json.gz', orient='records', lines=True, convert_dates=True)
df = pd.read_json('dataset_finales/steam_games_organizados.json', orient='records', lines=True, convert_dates=True)

#### Hacemos una revisión general de los datos

In [14]:
df

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


#### Elimino las filas que tienen todas valores nulos NaN

In [15]:
df.dropna(how='all', inplace=True)

In [16]:
df.shape

(32135, 13)

##### Selecciono las columnas 'publisher', 'developer', 'app_name', 'title' para comparar valores y revisar cuáles puedo eliminar
- 'publisher' tiene la misma información que 'developer' solo que 'developer' tiene menos nulos. Elimino 'publisher'
- 'title' tiene la misma información que 'app_name' solo que 'app_name' tiene menos nulos. Elimino 'title'

In [17]:
selected_columns = ['publisher', 'developer', 'app_name', 'title']
result = df[selected_columns]
result.head(10)

Unnamed: 0,publisher,developer,app_name,title
88310,Kotoshiro,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty
88311,"Making Fun, Inc.",Secret Level SRL,Ironbound,Ironbound
88312,Poolians.com,Poolians.com,Real Pool 3D - Poolians,Real Pool 3D - Poolians
88313,彼岸领域,彼岸领域,弹炸人2222,弹炸人2222
88314,,,Log Challenge,
88315,Trickjump Games Ltd,Trickjump Games Ltd,Battle Royale Trainer,Battle Royale Trainer
88316,,Poppermost Productions,SNOW - All Access Basic Pass,SNOW - All Access Basic Pass
88317,Poppermost Productions,Poppermost Productions,SNOW - All Access Pro Pass,SNOW - All Access Pro Pass
88318,Poppermost Productions,Poppermost Productions,SNOW - All Access Legend Pass,SNOW - All Access Legend Pass
88319,RewindApp,RewindApp,Race,Race


##### Elimino variables que no se necesitan 

In [None]:
# Columns removed from the games table before the remaining cleaning steps.
columnas_a_eliminar = [
    'publisher', 'reviews_url', 'url', 'tags',
    'specs', 'price', 'early_access', 'title',
]
df.drop(columns=columnas_a_eliminar, inplace=True)

In [19]:
df= df.dropna(subset=['developer']) #Elimino los valores nulos en la variable 'developer'

In [16]:
df.info() #Reviso el estado de las variables que quedan

<class 'pandas.core.frame.DataFrame'>
Index: 28836 entries, 88310 to 120443
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        28682 non-null  object 
 1   app_name      28836 non-null  object 
 2   release_date  28818 non-null  object 
 3   id            28835 non-null  float64
 4   developer     28836 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.3+ MB


Elimino los valores nulos en la variable 'genres' pues no hay forma de obtenerlos de otra variable

In [20]:
df= df.dropna(subset=['genres'])

Elimino los valores nulos de la variable 'release_date' pues esta es relevante en las funciones de la API, elimino los restantes valores nulos

In [21]:
df= df.dropna()

Extraigo el año de la variable 'release_date' usando la función (separar_anio)

In [None]:
separar_anio(df)

- Verifico que el procedimiento haya quedado bien con el año y la frecuencia

In [103]:
# Count how many games fall on each 'release_date' value and show the five
# most frequent ones.
tabla_frecuencia = df['release_date'].value_counts().to_frame().reset_index()
tabla_frecuencia.columns = ['release_date', 'frecuencia']
registros = tabla_frecuencia.head(5)
registros
#tabla_frecuencia.head(30)

Unnamed: 0,release_date,frecuencia
0,2017,9185
1,2016,6653
2,2015,4751
3,2014,2721
4,2013,1437


### Creamos variables dummy para los géneros 'genres'

In [26]:
def _generos_a_texto(valor):
    """Return the genres of one row as a single comma-separated string."""
    if isinstance(valor, str):
        # Already converted (e.g. the cell was run twice): joining a string
        # would split it into single characters.
        return valor
    if isinstance(valor, (list, tuple)):
        return ', '.join(valor)
    # Missing value: an empty string yields no dummy columns at all, whereas
    # filling with the text '[]' would be joined into '[, ]' and create
    # bogus '[' and ']' genres.
    return ''


df['genres'] = df['genres'].apply(_generos_a_texto)        # Genre list -> comma-separated string

In [28]:
dummy_generos = df['genres'].str.get_dummies(', ')          #Genero variables ficticias para el campo 'genres'

In [31]:
df_con_dummy = pd.concat([df, dummy_generos], axis=1)       #Concateno el Dataframe original con las variables ficticias

In [33]:
df = df_con_dummy                                           #Cambio el nombre de Dataframe

In [35]:
df['id'] = df['id'].astype(int)                             #Cambio el tipo del campo 'id' de float a int

Exportamos el dataframe a un archivo CSV

In [32]:
#df.to_csv('../Datasets_organizados/steam_games_organizados.csv',encoding='UTF-8',index=False) # --> guardo el dataset en archivo .csv
df.to_csv('dataset_finales/steam_games_organizados.csv',encoding='UTF-8',index=False)

Generamos un archivo comprimido

In [33]:
import gzip
#with open('../Datasets_organizados/steam_games_organizados.csv', 'rb') as f_in, gzip.open('../Datasets_organizados/steam_games_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)  

#with open('dataset_finales/steam_games_organizados.csv', 'rb') as f_in, gzip.open('dataset_finales/steam_games_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)      

#### Elimino la columna 'genres'

In [34]:
df.drop(['genres'],axis=1, inplace=True)

In [84]:
df_games = df

## ETL para 'australian_user_reviews'

-Cargamos el archivo con los reviews de los usuarios

In [55]:
import ast
data_list = []
# Path of the user reviews file. It must be assigned here: leaving both
# assignments commented out makes the `open` call below fail with NameError.
# Alternative location:
#file_path = '../Datasets_originales/australian_user_reviews.json'
file_path = 'dataset_finales/australian_user_reviews.json'

# Read the file line by line; each line is parsed as a Python literal.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for lines
            # that are not valid Python, e.g. blank or truncated ones. Report
            # the line and keep going instead of aborting the whole load.
            print(f"Error en la línea: {line}")
            continue
        data_list.append(json_data)

# Build a DataFrame from the list of parsed records
df_reviews = pd.DataFrame(data_list)

#### Damos una vista general de la información del Dataframe

In [56]:
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


Revisamos las dimensiones del Dataframe (filas, columnas)

In [57]:
df_reviews.shape

(25799, 3)

#### Vemos un detalle de un review para entender qué tipo y cuanta información contiene

In [58]:
detalle_item = df_reviews.loc[4140, 'reviews'][0]           #Vemos el detalle del registro 4140 y el review 0
detalle_item

{'funny': '',
 'posted': 'Posted December 8, 2013.',
 'last_edited': '',
 'item_id': '221380',
 'helpful': '1 of 1 people (100%) found this review helpful',
 'recommend': True,
 'review': 'Extremely good RTS game that, although over 10 years old, is far superior to many modern AAA titles.'}

Desanidamos la variable 'reviews'

In [59]:
# Give every review its own row; the user columns are repeated for each one.
df_desanidados = df_reviews.explode('reviews')

# Expand each 'reviews' entry into its own columns and place them next to the
# user columns, leaving out the original nested column.
# NOTE(review): entries that are not dicts (presumably NaN produced by users
# with an empty review list) are likely the source of the extra column
# named 0 -- confirm before relying on it.
campos_review = df_desanidados['reviews'].apply(pd.Series)
columnas_usuario = df_desanidados.drop(columns='reviews')
df_desanidados2 = pd.concat([columnas_usuario, campos_review], axis=1)

#### Extraemos el año de la columna 'posted'

In [64]:
df_desanidados2['posted_year'] = df_desanidados2['posted'].str.extract(r'(\d{4})')

#### Eliminamos la columna 'posted'

In [65]:
df_desanidados2.drop('posted' , axis = 1, inplace = True)

Previsualizamos

In [66]:
df_desanidados2

Unnamed: 0,user_id,user_url,funny,last_edited,item_id,helpful,recommend,review,0,posted_year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,22200,No ratings yet,True,It's unique and worth a playthrough.,,2011
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,,2011
1,js41637,http://steamcommunity.com/id/js41637,,,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,,2014
1,js41637,http://steamcommunity.com/id/js41637,,,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,,2013
...,...,...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,70,No ratings yet,True,a must have classic from steam definitely wort...,,
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,362890,No ratings yet,True,this game is a perfect remake of the original ...,,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,730,No ratings yet,True,:D,,


#### Realizamos el análisis de sentimiento en la variable 'review'

In [69]:
# Score the raw values first. Casting to str beforehand turns missing reviews
# into the text 'nan', so they would be analysed as text instead of reaching
# the neutral branch that get_sentiment_score reserves for empty/NaN input.
df_desanidados2['sentiment_score'] = df_desanidados2['review'].apply(get_sentiment_score)   # Apply get_sentiment_score to the 'review' column
df_desanidados2['review'] = df_desanidados2['review'].astype(str)                           # Convert the 'review' column to str

- Revisamos los datos y vemos que todo quedó Ok

In [71]:
df_desanidados2

Unnamed: 0,user_id,user_url,funny,last_edited,item_id,helpful,recommend,review,0,posted_year,sentiment_score
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,,2011,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,22200,No ratings yet,True,It's unique and worth a playthrough.,,2011,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,,2011,2
1,js41637,http://steamcommunity.com/id/js41637,,,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,,2014,2
1,js41637,http://steamcommunity.com/id/js41637,,,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,,2013,2
...,...,...,...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,70,No ratings yet,True,a must have classic from steam definitely wort...,,,2
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,362890,No ratings yet,True,this game is a perfect remake of the original ...,,,2
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,,,2
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,730,No ratings yet,True,:D,,,2


Exportamos el dataframe a un archivo CSV

In [48]:
#df_desanidados2.to_csv('../Datasets_organizados/australian_user_reviews_organizados.csv',encoding='UTF-8',index=False) # Guardo el dataset en .csv
#df_desanidados2.to_csv('dataset_finales/australian_user_reviews_organizados.csv',encoding='UTF-8',index=False) # Guardo el dataset en .csv

Generamos un archivo comprimido

In [49]:
import gzip
#with open('../Datasets_organizados/australian_user_reviews_organizados.csv', 'rb') as f_in, gzip.open('../Datasets_organizados/australian_user_reviews_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)

#with open('dataset_finales/australian_user_reviews_organizados.csv', 'rb') as f_in, gzip.open('dataset_finales/australian_user_reviews_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)    

In [73]:
df_review = df_desanidados2

## ETL para 'australian_users_items'

Cargamos el archivo 'australian_users_items.json'

In [75]:
import ast
data_list = []
# Path of the user items file. It must be assigned here: with both
# assignments commented out the `open` call below either fails with NameError
# or, in a live session, silently reuses a `file_path` left over from an
# earlier cell.
# Alternative location:
#file_path = '../Datasets_originales/australian_users_items.json'
file_path = 'dataset_finales/australian_users_items.json'

# Read the file line by line; each line is parsed as a Python literal.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for lines
            # that are not valid Python, e.g. blank or truncated ones. Report
            # the line and keep going instead of aborting the whole load.
            print(f"Error en la línea: {line}")
            continue
        data_list.append(json_data)

df_items = pd.DataFrame(data_list)                                  # Build a DataFrame from the list of parsed records

- Revisamos el Dataframe

In [76]:
df_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


- Reviso la estructura de un valor del campo anidado 'items' para verificar tipo de datos y cantidad

In [77]:
detalle_item = df_items.loc[4140, 'items'][0]                   #Reviso el registro 4140 campo 0
detalle_item

{'item_id': '240',
 'item_name': 'Counter-Strike: Source',
 'playtime_forever': 755,
 'playtime_2weeks': 0}

- Desanidamos el campo 'items' en sus datos relevantes

In [78]:
def _dato_primer_item(items, clave):
    """Return *clave* from the first entry of *items*, or 0 for an empty list."""
    if len(items) > 0:
        return items[0].get(clave)
    return 0


# NOTE(review): only the first element of each user's 'items' list is read
# here; the remaining items of that user are not carried over. Confirm this
# is intended.
for columna, clave in (
    ('playtime_forever', 'playtime_forever'),
    ('playtime_2weeks', 'playtime_2weeks'),
    ('id', 'item_id'),
):
    df_items[columna] = df_items['items'].apply(_dato_primer_item, args=(clave,))

# Both playtime columns are stored as integers.
for columna in ('playtime_forever', 'playtime_2weeks'):
    df_items[columna] = df_items[columna].astype(int)

In [79]:
df_items['id'] = df_items['id'].astype(int)                 #Convertimos el campo 'id' a entero

- Elimino las columnas que no se requieren

In [80]:
df_items.drop(['user_url'], axis=1, inplace=True)  
df_items.drop(['items'], axis=1, inplace=True)

- Reviso los datos del nuevo Dataframe

In [81]:
df_items

Unnamed: 0,user_id,items_count,steam_id,playtime_forever,playtime_2weeks,id
0,76561197970982479,277,76561197970982479,6,0,10
1,js41637,888,76561198035864385,0,0,10
2,evcentric,137,76561198007712555,923,0,1200
3,Riot-Punch,328,76561197963445855,0,0,10
4,doctr,541,76561198002099482,1131,0,300
...,...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,0,0,413850
88306,76561198326700687,177,76561198326700687,0,0,11020
88307,XxLaughingJackClown77xX,0,76561198328759259,0,0,0
88308,76561198329548331,7,76561198329548331,677,677,304930


- Exportamos el dataframe a un archivo CSV

In [66]:
#df_items.to_csv('../Datasets_organizados/australian_users_items_organizados.csv',encoding='UTF-8',index=False) # Guardamos el dataset en .csv
#df_items.to_csv('dataset_finales/australian_users_items_organizados.csv',encoding='UTF-8',index=False) # Guardamos el dataset en .csv

- Generamos un archivo comprimido

In [67]:
#with open('../Datasets_organizados/australian_users_items_organizados.csv', 'rb') as f_in, gzip.open('../Datasets_organizados/australian_users_items_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)

#with open('dataset_finales/australian_users_items_organizados.csv', 'rb') as f_in, gzip.open('dataset_finales/australian_users_items_organizados.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)    

## Creación de un único Dataset

#### Unimos tablas mediante el 'id'

In [86]:
df_games.shape                                                          #Verificamos estructura general del dataset1 

(28663, 27)

In [87]:
df_items.shape                                                          #Verificamos estructura general del dataset2

(88310, 6)

In [88]:
df_union = pd.merge(df_games, df_items, on='id', how='inner')           #Unimos los dos dataset mediante la llave 'id'

In [89]:
df_union.shape                                                          #Verificamos la estructura del nuevo dataset

(62956, 32)

- Guardamos este primer dataset con dos uniones en un archivo CSV

In [71]:
#df_union.to_csv('../Datasets_organizados/DF_final.csv',encoding='UTF-8',index=False) # Guardamos dataset en .csv
#df_union.to_csv('dataset_finales/DF_final.csv',encoding='UTF-8',index=False) # Guardamos dataset en .csv

- Agregamos el dataset 'reviews' a la unión anterior

In [94]:
# Inner join on 'user_id' that adds the review columns to the games/items
# union. This assignment has to run: the following cells read `df_final`.
df_final = pd.merge(df_union, df_review, on='user_id', how='inner')

In [95]:
df_final.shape                                                                      # Verificamos la estructura del dataset final 

(52945, 42)

- Borramos las columnas que no vamos a usar

In [98]:
# Keep an independent snapshot of the merged data. A plain assignment would
# only alias the same object, so the in-place drop below would alter it too.
df_final2 = df_final.copy()
# Remove the column labelled 0 (integer label, not a name) from the final table.
df_final.drop(columns=[0], inplace=True)

- Exportamos el dataset final a un archivo CSV

In [100]:
#df_final.to_csv('../Datasets_organizados/Dataset_final.csv',encoding='UTF-8',index=False) #Guardamos el dataset en .cvs 
#df_final.to_csv('dataset_finales/Dataset_final.csv',encoding='UTF-8',index=False) #Guardamos el dataset en .cvs 

- Generamos un archivo comprimido

In [101]:
#with open('../Datasets_organizados/Dataset_final.csv', 'rb') as f_in, gzip.open('../Datasets_organizados/Dataset_final.csv.gz', 'wb') as f_out:  
#    f_out.writelines(f_in)

#with open('dataset_finales/Dataset_final.csv', 'rb') as f_in, gzip.open('dataset_finales/Dataset_final.csv.gz', 'wb') as f_out:  
#    f_out.writelines(f_in)    

## Creamos otro Dataset sin las columnas genres para procesos EDA

In [84]:
# Genre indicator columns that are left out of the numeric dataset.
columnas_genero = [
    'Action', 'Adventure', 'Audio Production', 'Casual',
    'Design &amp; Illustration', 'Early Access', 'Education',
    'Free to Play', 'Indie', 'Animation &amp; Modeling',
    'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing',
    'Simulation', 'Software Training', 'Sports', 'Strategy',
    'Utilities', 'Video Production', 'Web Publishing',
]
# Keep only the int/float columns, then discard the genre indicators.
df_num = df_final.select_dtypes(include=(int, float)).drop(columns=columnas_genero)

- Exportamos el dataset a un archivo CSV

In [85]:
#df_final.to_csv('../Datasets_organizados/Dataset_final_Numeros.csv',encoding='UTF-8',index=False) #Guardamos el dataset en .cvs
#df_final.to_csv('dataset_finales/Dataset_final_Numeros.csv',encoding='UTF-8',index=False) #Guardamos el dataset en .cvs

- Generamos un archivo comprimido

In [86]:
#with open('../Datasets_organizados/Dataset_final_Numeros.csv', 'rb') as f_in, gzip.open('../Datasets_organizados/Dataset_final_Numeros.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)

#with open('dataset_finales/Dataset_final_Numeros.csv', 'rb') as f_in, gzip.open('dataset_finales/Dataset_final_Numeros.csv.gz', 'wb') as f_out:  # --- > releo el archivo para comprimirlo a formato gzip
#    f_out.writelines(f_in)    