In [5]:
import pandas as pd
import numpy as np
import json
import gzip
import ast
import pyarrow as pa
import pyarrow.parquet as pq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from textblob import TextBlob

### Crearemos los Dataframe

In [4]:
# Lista vacía para almacenar los datos leídos del archivo, crear el Dataframe con el comprimido 'steam_games.json.gz'
data1 = []

# Abrir el archivo comprimido 'steam_games.json.gz' en modo lectura
with gzip.open('steam_games.json.gz', 'rb') as archivo1:
    # Iterar sobre cada línea del archivo
    for linea in archivo1:
        # Cargar cada línea como un objeto JSON y añadirlo a la lista 'data1'
        data1.append(json.loads(linea))

# Crear un DataFrame de pandas ('df_games') a partir de la lista de datos obtenidos
df_games = pd.DataFrame(data1)

# Convertir la columna 'price' a valores numéricos, reemplazando 'Free To Play' con 0
df_games['price'] = pd.to_numeric(df_games['price'], errors='coerce').fillna(0)

# Escribir el DataFrame limpio como un archivo Parquet
nombre_archivo_parquet = '..\\data\\steam_games.parquet'

# Convertir el DataFrame en una tabla Arrow
tabla_arrow = pa.Table.from_pandas(df_games)

# Escribir la tabla Arrow como un archivo Parquet
pq.write_table(tabla_arrow, nombre_archivo_parquet)

In [3]:
# Lista vacía para almacenar los datos leídos del archivo, crear el Dataframe con el comprimido 'user_reviews.json.gz'
data2 = []

# Abrir el archivo comprimido 'user_reviews.json.gz' en modo lectura binaria
with gzip.open('user_reviews.json.gz', 'rb') as archivo2:
    # Iterar sobre cada línea del archivo
    for linea in archivo2:
        # Convertir la línea de bytes a un diccionario usando ast.literal_eval y utf-8 decoding
        data2.append(ast.literal_eval(linea.decode('utf-8')))

# Crear un DataFrame de pandas ('df_reviews') a partir de la lista de datos obtenidos
df_reviews = pd.DataFrame(data2)

# Escribir el DataFrame como un archivo Parquet
nombre_archivo_parquet = '..\\data\\user_reviews.parquet'

# Convertir el DataFrame en una tabla Arrow
tabla_arrow = pa.Table.from_pandas(df_reviews)

# Escribir la tabla Arrow como un archivo Parquet
pq.write_table(tabla_arrow, nombre_archivo_parquet)

In [4]:
# Lista vacía para almacenar los datos leídos del archivo, crear el Dataframe con el comprimido 'users_items.json.gz'
data3 = []

# Abrir el archivo comprimido 'users_items.json.gz' en modo lectura binaria
with gzip.open('users_items.json.gz', 'rb') as archivo3:
    # Iterar sobre cada línea del archivo
    for linea in archivo3:
        # Convertir la línea de bytes a un diccionario usando ast.literal_eval y utf-8 decoding
        data3.append(ast.literal_eval(linea.decode('utf-8')))

# Crear un DataFrame de pandas ('df_items') a partir de la lista de datos obtenidos
df_items = pd.DataFrame(data3)

# Escribir el DataFrame como un archivo Parquet
nombre_archivo_parquet = '..\\data\\users_items.parquet'

# Convertir el DataFrame en una tabla Arrow
tabla_arrow = pa.Table.from_pandas(df_items)

# Escribir la tabla Arrow como un archivo Parquet
pq.write_table(tabla_arrow, nombre_archivo_parquet)

### ETL al Dataframe 'df_games'

In [5]:
df_games = pd.read_parquet('..\\data\\steam_games.parquet')

In [6]:
# Observar que existen filas que completamente nulas
df_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,0.0,,,
1,,,,,,,,,,0.0,,,
2,,,,,,,,,,0.0,,,
3,,,,,,,,,,0.0,,,
4,,,,,,,,,,0.0,,,


In [7]:
# Contar los valores nulos por columna
nulos_por_columna = df_games.isnull().sum(axis=0)
nulos_por_columna

publisher       96362
genres          91593
app_name        88312
title           90360
url             88310
release_date    90377
tags            88473
reviews_url     88312
specs           88980
price               0
early_access    88310
id              88312
developer       91609
dtype: int64

In [8]:
# Identificar las filas completamente Nulas, considerando que para esta, el valor de la columna 'price' es 0
condicion_filtrado = (df_games['price'] == 0) & (df_games.drop(columns=['price']).isnull().all(axis=1))

'''
Eliminar las filas que cumplen las condiciones expuestas anteriormente, con ~ creamos 
este nuevo Dataframe con los valores que no cumplen la condicion
'''
df_games = df_games[~condicion_filtrado]

In [9]:
# Contar los valores nulos por columna luego de eliminar filas completamente nulas
nulos_por_columna = df_games.isnull().sum(axis=0)
nulos_por_columna

publisher       8052
genres          3283
app_name           2
title           2050
url                0
release_date    2067
tags             163
reviews_url        2
specs            670
price              0
early_access       0
id                 2
developer       3299
dtype: int64

In [10]:
df_games.head(2)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.0,False,643980,Secret Level SRL


In [11]:
# Filtrar el DataFrame para mostrar las filas donde 'app_name' es nula para comparar con 'title'
app_name_nulo = df_games[df_games['app_name'].isnull()]
app_name_nulo

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,False,,
90890,,"[Action, Indie]",,,http://store.steampowered.com/app/317160/_/,2014-08-26,"[Action, Indie]",http://steamcommunity.com/app/317160/reviews/?...,"[Single-player, Game demo]",0.0,False,317160.0,


In [12]:
''' 
Eliminar las filas donde 'app_name' es nula ya que en estas mismas el valor de 'title' 
es tambien nulo, y no existe otra forma de identificar los titulos de estos Games
'''
df_games = df_games.dropna(subset=['app_name'])

In [13]:
# Seleccionar solo las columnas deseadas del DataFrame original
columnas = ['genres', 'app_name', 'release_date', 'id']
df_games = df_games[columnas]

In [14]:
# Desempacar las listas que conforman la columna 'genres' en valores unicos, solo para consultar
df_expanded = df_games['genres'].explode()
valores_unicos = df_expanded.unique()
valores_unicos

array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', None, 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Video Production', 'Utilities',
       'Web Publishing', 'Education', 'Software Training',
       'Design &amp; Illustration', 'Audio Production', 'Photo Editing',
       'Accounting'], dtype=object)

In [15]:
df_games.reset_index(drop=True, inplace=True)

In [18]:
df_games.head(2)

Unnamed: 0,genres,app_name,release_date,id
0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,761140
1,"[Free to Play, Indie, RPG, Strategy]",Ironbound,2018-01-04,643980


In [19]:
# Contar los valores nulos por columna luego de eliminar filas completamente nulas
nulos_por_columna = df_games.isnull().sum(axis=0)
nulos_por_columna

genres          3282
app_name           0
release_date    2066
id                 1
dtype: int64

In [28]:
# Eliminar filas con valores nulos en la columna 'genres' Representa el 10% del DF
df_games = df_games.dropna(subset=['genres', 'release_date', 'id'])

In [30]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28832 entries, 0 to 32131
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28832 non-null  object
 1   app_name      28832 non-null  object
 2   release_date  28832 non-null  object
 3   id            28832 non-null  object
dtypes: object(4)
memory usage: 1.1+ MB


In [31]:
# Guardar el Dataframe trabajado como .parquet con compresión gzip
ruta_archivo1 = "..\\data\\steam_games_etl_comprimido.parquet"
df_games.to_parquet(ruta_archivo1, compression='gzip')

### ETL a Dataframe 'df_items'

In [2]:
df_items = pd.read_parquet('..\\data\\users_items.parquet')

In [3]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [4]:
# Contar los valores nulos por columna
nulos_por_columna = df_items.isnull().sum(axis=0)
nulos_por_columna

user_id        0
items_count    0
steam_id       0
user_url       0
items          0
dtype: int64

In [5]:
df_items.head(2)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."


In [6]:
# Utilizar la función explode para expandir la lista de diccionarios en la columna 'items'
df_exploded = df_items.explode('items')

# Normalizar la columna 'items' ahora que se ha expandido
df_items_normalized = pd.json_normalize(df_exploded['items'])

# Agregar la columna 'user_id' para mantener la asociación con la fila original
df_items_normalized['user_id'] = df_exploded['user_id'].values

# Mostrar el DataFrame resultante
df_items_normalized

Unnamed: 0,item_id,item_name,playtime_2weeks,playtime_forever,user_id
0,10,Counter-Strike,0.0,6.0,76561197970982479
1,20,Team Fortress Classic,0.0,0.0,76561197970982479
2,30,Day of Defeat,0.0,7.0,76561197970982479
3,40,Deathmatch Classic,0.0,0.0,76561197970982479
4,50,Half-Life: Opposing Force,0.0,0.0,76561197970982479
...,...,...,...,...,...
5170010,373330,All Is Dust,0.0,0.0,76561198329548331
5170011,388490,One Way To Die: Steam Edition,3.0,3.0,76561198329548331
5170012,521570,You Have 10 Seconds 2,4.0,4.0,76561198329548331
5170013,519140,Minds Eyes,3.0,3.0,76561198329548331


In [7]:
# Contar los valores nulos por columna del df_items_normalized
nulos_por_columna = df_items_normalized.isnull().sum(axis=0)
nulos_por_columna

item_id             16806
item_name           16806
playtime_2weeks     16806
playtime_forever    16806
user_id                 0
dtype: int64

In [8]:
''' Eliminar los nulos de las columnas item_id, item_name, playtime_2weeks, 
playtime_forever. Ya que estas filas no contiene ninguna otra informacion.
Solo nos quedaremos con los datos del desempacado de la columna 'items' y los nombraremos 'df_items'
'''
df_items = df_items_normalized.dropna(subset=['item_id'])
nulos_por_columna = df_items.isnull().sum(axis=0)
nulos_por_columna

item_id             0
item_name           0
playtime_2weeks     0
playtime_forever    0
user_id             0
dtype: int64

In [10]:
# Verificar valores iguales a cero en la columna 'playtime_forever'
filas_con_cero_playtime = df_items[df_items['playtime_forever'] == 0]

In [12]:
''' 
Eliminar las filas con valor igual a cero en 'playtime_forever', con esto lograremos 
reducir 1.884.769 filas en el Datafra, que no nos da ninguna información
'''
df_items = df_items[df_items['playtime_forever'] != 0]

In [13]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3285246 entries, 0 to 5170013
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           object 
 1   item_name         object 
 2   playtime_2weeks   float64
 3   playtime_forever  float64
 4   user_id           object 
dtypes: float64(2), object(3)
memory usage: 150.4+ MB


In [15]:
# Resetear índices del DataFrame
df_items.reset_index(drop=True, inplace=True)

In [16]:
df_items

Unnamed: 0,item_id,item_name,playtime_2weeks,playtime_forever,user_id
0,10,Counter-Strike,0.0,6.0,76561197970982479
1,30,Day of Defeat,0.0,7.0,76561197970982479
2,300,Day of Defeat: Source,0.0,4733.0,76561197970982479
3,240,Counter-Strike: Source,0.0,1853.0,76561197970982479
4,3830,Psychonauts,0.0,333.0,76561197970982479
...,...,...,...,...,...
3285241,304930,Unturned,677.0,677.0,76561198329548331
3285242,227940,Heroes & Generals,43.0,43.0,76561198329548331
3285243,388490,One Way To Die: Steam Edition,3.0,3.0,76561198329548331
3285244,521570,You Have 10 Seconds 2,4.0,4.0,76561198329548331


In [17]:
# Guardar el Dataframe trabajado como .parquet con compresión gzip
ruta_archivo2 = "..\\data\\users_items_etl_comprimido.parquet"
df_items.to_parquet(ruta_archivo2, compression='gzip')

### ETL a Dataframe 'df_reviews'

In [76]:
df_reviews = pd.read_parquet('..\\data\\user_reviews.parquet')

In [77]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [78]:
# Contar los valores nulos por columna
nulos_por_columna = df_reviews.isnull().sum(axis=0)
nulos_por_columna

user_id     0
user_url    0
reviews     0
dtype: int64

In [79]:
df_reviews.head(2)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'helpful': 'No ratings yet', 'i..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'helpful': '15 of 20 people (75..."


In [80]:
# Expandir la columna 'reviews' en filas separadas 
df_expandido = df_reviews.explode('reviews').reset_index(drop=True)  # Resetear el índice para tener un índice único

# Normalizar la columna 'reviews' en nuevas columnas
df_dicc = pd.json_normalize(df_expandido['reviews'])

# Concatenar las nuevas columnas con el DataFrame original 'df_reviews'
df_reviews = pd.concat([df_expandido.drop('reviews', axis=1), df_dicc], axis=1)

In [81]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59333 non-null  object
 1   user_url     59333 non-null  object
 2   funny        59305 non-null  object
 3   helpful      59305 non-null  object
 4   item_id      59305 non-null  object
 5   last_edited  59305 non-null  object
 6   posted       59305 non-null  object
 7   recommend    59305 non-null  object
 8   review       59305 non-null  object
dtypes: object(9)
memory usage: 4.1+ MB


In [82]:
# Contar los valores nulos por columna
nulos_por_columna = df_reviews.isnull().sum(axis=0)
nulos_por_columna

user_id         0
user_url        0
funny          28
helpful        28
item_id        28
last_edited    28
posted         28
recommend      28
review         28
dtype: int64

In [83]:
# Filtrar las filas donde la columna item_id es nula
filas_nulas = df_reviews[df_reviews['funny'].isnull()]
filas_nulas.head()

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review
137,gdxsd,http://steamcommunity.com/id/gdxsd,,,,,,,
177,76561198094224872,http://steamcommunity.com/profiles/76561198094...,,,,,,,
2559,76561198021575394,http://steamcommunity.com/profiles/76561198021...,,,,,,,
10080,cmuir37,http://steamcommunity.com/id/cmuir37,,,,,,,
13767,Jaysteeny,http://steamcommunity.com/id/Jaysteeny,,,,,,,


In [84]:
'''
Eliminar filas completas donde la columna funny es nula
no exite manera de recuperar el valor de esta columna en estas filas
esto en total representa el 0.1% de los registros del Dataframe
'''
df_reviews = df_reviews.dropna(subset=['funny'])


# Reiniciar el índice del DataFrame
df_reviews = df_reviews.reset_index(drop=True)

In [85]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...


In [86]:
# Seleccionar solo las columnas deseadas del DataFrame original
columnas = ['user_id', 'last_edited', 'posted', 'review']
df_reviews = df_reviews[columnas]

In [87]:
df_reviews

Unnamed: 0,user_id,last_edited,posted,review
0,76561197970982479,,"Posted November 5, 2011.",Simple yet with great replayability. In my opi...
1,76561197970982479,,"Posted July 15, 2011.",It's unique and worth a playthrough.
2,76561197970982479,,"Posted April 21, 2011.",Great atmosphere. The gunplay can be a bit chu...
3,js41637,,"Posted June 24, 2014.",I know what you think when you see this title ...
4,js41637,,"Posted September 8, 2013.",For a simple (it's actually not all that simpl...
...,...,...,...,...
59300,76561198312638244,,Posted July 10.,a must have classic from steam definitely wort...
59301,76561198312638244,,Posted July 8.,this game is a perfect remake of the original ...
59302,LydiaMorley,,Posted July 3.,had so much fun plaing this and collecting res...
59303,LydiaMorley,,Posted July 20.,:D


In [88]:
# Crear la columna 'intermediate_date' con los valores de 'last_edited' o 'posted' en caso que la primera sea nula o vacia
df_reviews['intermediate_date'] = df_reviews['posted'].fillna(df_reviews['last_edited'])
df_reviews

Unnamed: 0,user_id,last_edited,posted,review,intermediate_date
0,76561197970982479,,"Posted November 5, 2011.",Simple yet with great replayability. In my opi...,"Posted November 5, 2011."
1,76561197970982479,,"Posted July 15, 2011.",It's unique and worth a playthrough.,"Posted July 15, 2011."
2,76561197970982479,,"Posted April 21, 2011.",Great atmosphere. The gunplay can be a bit chu...,"Posted April 21, 2011."
3,js41637,,"Posted June 24, 2014.",I know what you think when you see this title ...,"Posted June 24, 2014."
4,js41637,,"Posted September 8, 2013.",For a simple (it's actually not all that simpl...,"Posted September 8, 2013."
...,...,...,...,...,...
59300,76561198312638244,,Posted July 10.,a must have classic from steam definitely wort...,Posted July 10.
59301,76561198312638244,,Posted July 8.,this game is a perfect remake of the original ...,Posted July 8.
59302,LydiaMorley,,Posted July 3.,had so much fun plaing this and collecting res...,Posted July 3.
59303,LydiaMorley,,Posted July 20.,:D,Posted July 20.


In [89]:
# Extraer el año de la columna 'intermediate_date'
df_reviews['date'] = df_reviews['intermediate_date'].str.extract(r'(\d{4})')
df_reviews.head(2)

Unnamed: 0,user_id,last_edited,posted,review,intermediate_date,date
0,76561197970982479,,"Posted November 5, 2011.",Simple yet with great replayability. In my opi...,"Posted November 5, 2011.",2011
1,76561197970982479,,"Posted July 15, 2011.",It's unique and worth a playthrough.,"Posted July 15, 2011.",2011


In [90]:
# Eliminar las columnas 'last_edited', 'posted' y 'intermediate_date'
df_reviews = df_reviews.drop(columns=['last_edited', 'posted', 'intermediate_date'])

In [91]:
# Contar los valores nulos por columna
nulos_por_columna = df_reviews.isnull().sum(axis=0)
nulos_por_columna

user_id        0
review         0
date       10119
dtype: int64

In [92]:
# Eliminar las filas donde el valor de la columna 'date' es nulo
df_reviews = df_reviews.dropna(subset=['date'])

In [6]:
# Descargar recursos necesarios de NLTK para hacer el Preprocesamietno de Text y posterior Analisis Sentimental
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
# Inicializar lemmatizer y lista de stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Crear Función para preprocesar texto
def preprocess_text(text):
    # Tokenizar el texto en palabras
    tokens = word_tokenize(text.lower())

    # Eliminar signos de puntuación y palabras vacías
    tokens = [token for token in tokens if token not in string.punctuation and token not in stop_words]

    # Lematizar cada palabra
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Reconstruir el texto preprocesado
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Aplicar la función de preprocesamiento a la columna 'review' de 'df_reviews'
df_reviews['reviews_preprocessed'] = df_reviews['review'].apply(preprocess_text)

In [95]:
# Crear Función para asignar valores de sentimiento en la escala de 0 a 2
def valor_sentimiento(sentimiento):
    if sentimiento is None:
        return 1  # Neutral o falta de análisis
    elif sentimiento < 0:
        return 0  # Negativo
    else:
        return 2  # Positivo

# Cargar DataFrame con las reseñas de los usuarios
reviews = df_reviews['reviews_preprocessed']

# Aplicar análisis de sentimiento y asignar valores en la escala
valores_sentimiento = []
for review in reviews:
    blob = TextBlob(str(review))  # Convertir la reseña a cadena en caso de que sea de otro tipo
    sentimiento = blob.sentiment.polarity
    valor = valor_sentimiento(sentimiento)
    valores_sentimiento.append(valor)

# Agregar los valores de sentimiento al DataFrame original
df_reviews['sentiment_analysis'] = valores_sentimiento

In [96]:
# Eliminar las columnas 'review' y 'reviews_preprocessed' del DataFrame
df_reviews = df_reviews.drop(['review', 'reviews_preprocessed'], axis=1)

In [97]:
df_reviews.head(2)

Unnamed: 0,user_id,date,sentiment_analysis
0,76561197970982479,2011,2
1,76561197970982479,2011,2


In [104]:
# Ver los valores nulos en la columna 'sentiment_analysis'
valores_unicos = df_reviews['sentiment_analysis'].unique()
valores_unicos

array([2, 0], dtype=int64)

In [105]:
# Verificar los valores vacíos en la columna 'sentiment_analysis'
valores_vacios = df_reviews['sentiment_analysis'].isna().sum()
valores_vacios

0

In [106]:
# Ver los valores nulos en la columna 'sentiment_analysis'
valores_nulos = df_reviews['sentiment_analysis'].isnull().sum()
valores_nulos

0

In [108]:
# Guardar el Dataframe trabajado como .parquet con compresión gzip
ruta_archivo3 = "..\\data\\users_reviews_etl_comprimido.parquet"
df_reviews.to_parquet(ruta_archivo3, compression='gzip')