En este notebook se presenta un conjunto de pasos para realizar el proceso de Extracción, Carga y Transformación sobre el conjunto de datos user_reviews, con el propósito de construir un MVP (Minimum Viable Product) o Producto Mínimo Viable.

Ingesta de Datos (Extracción) Dataset User Reviews

In [19]:
import pandas as pd
import numpy as np
import ast
import gzip
import re

In [20]:
# Load the user_reviews dataset from its gzip-compressed file
ruta_archivo = '..\\Datasets\\user_reviews.json.gz'

# Accumulates one parsed record per successfully read line
json_datos = []

# Open the compressed file in text mode and parse it line by line
with gzip.open(ruta_archivo, 'rt', encoding='utf-8') as archivo_json:
    for linea in archivo_json:
        try:
            # Each line is evaluated as a Python literal rather than parsed as
            # strict JSON — presumably because the records use Python-style
            # quoting/booleans; confirm against the raw file.
            json_data = ast.literal_eval(linea)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for text
            # that is not a valid literal; catching both keeps the
            # skip-and-report behaviour instead of aborting the whole load.
            print(f"ERROR al leer o extraer : {linea}")
            continue
        # Only reached when the line parsed correctly
        json_datos.append(json_data)

In [21]:
# Build a DataFrame from the list of parsed records
df_reviews = pd.DataFrame(data=json_datos)

# Report the dataset dimensions (rows, columns) and preview the first two rows
n_filas, n_variables = df_reviews.shape
print(f'Filas : {n_filas}, Variables : {n_variables}')
df_reviews.head(n=2)

Filas : 25799, Variables : 3


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."


In [22]:
# Summarize the DataFrame (column dtypes and non-null counts) to check for missing values
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [23]:
# First step: un-nest the 'reviews' column so every element of each list gets its own row
df_reviews = df_reviews.explode(column='reviews')

# Check the resulting dimensions and preview the un-nested 'reviews' values
n_filas, n_variables = df_reviews.shape
print(f'Filas : {n_filas}, Variables : {n_variables}')
df_reviews.head(n=2)

Filas : 59333, Variables : 3


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."


In [24]:
# Inspect every 'reviews' entry stored under index label 0
# (the label repeats after explode, so this yields a Series, not a single value)
df_reviews["reviews"].loc[0]

0    {'funny': '', 'posted': 'Posted November 5, 20...
0    {'funny': '', 'posted': 'Posted July 15, 2011....
0    {'funny': '', 'posted': 'Posted April 21, 2011...
Name: reviews, dtype: object

In [25]:
# Expand each 'reviews' element into its own set of columns and place those
# columns next to the remaining columns of df_reviews (with 'reviews' removed).
columnas_usuario = df_reviews.drop(columns=['reviews'])
columnas_review = df_reviews['reviews'].apply(pd.Series)
tmp_df_reviews = pd.concat([columnas_usuario, columnas_review], axis=1)
df_reviews = tmp_df_reviews

In [26]:
# Display the current contents of the DataFrame
df_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,
...,...,...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D,


In [27]:
# Report the dataset dimensions (rows, columns) and preview the first two rows
n_filas, n_variables = df_reviews.shape
print(f'Filas : {n_filas}, Variables : {n_variables}')
df_reviews.head(n=2)

Filas : 59333, Variables : 10


Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,


In [28]:
# There is a column labelled 0; rename it so it can be reused for sentiment_analysis
nuevos_nombres = {0: 'sentiment_analysis'}
df_reviews.rename(columns=nuevos_nombres, inplace=True)

In [29]:
# Check the result of the rename: row count, column count and a two-row preview
n_filas, n_variables = df_reviews.shape
print(f'Filas : {n_filas}, Variables : {n_variables}')
df_reviews.head(n=2)

Filas : 59333, Variables : 10


Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,


In [30]:
# Remove, in place, the columns that contribute no information to this analysis
variables_a_excluir = [
    'funny',
    'posted',
    'last_edited',
    'helpful',
]
df_reviews.drop(columns=variables_a_excluir, inplace=True)

In [18]:
# Initialize the sentiment_analysis column to 0. The only values this column may take are
# 0 (negative), 1 (neutral) and 2 (positive); when the written review is missing it must be 1.
# NOTE(review): this cell's execution count (18) is out of sequence with its neighbours (30, 31),
# so it appears to come from an earlier run — confirm it still belongs at this position.
df_reviews['sentiment_analysis'] = 0

# Show how the DataFrame looks after the initialization
df_reviews.head(2)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,0


In [31]:
# Count how many item_id values are missing (True) versus present (False)
item_id_es_nulo = df_reviews['item_id'].isna()
item_id_es_nulo.value_counts()

item_id
False    59305
True        28
Name: count, dtype: int64

In [32]:
# Rows without an item_id cannot be related to any other dataset, so drop them in place
df_reviews.dropna(subset=['item_id'], inplace=True)

# Report the dimensions that remain after the removal
n_filas, n_variables = df_reviews.shape
print(f'Filas : {n_filas}, Variables : {n_variables}')

Filas : 59305, Variables : 6


Análisis de sentimientos

In [33]:
# Librería de procesamiento del texto para Python que permite realizar tareas de Procesamiento del Lenguaje Natural 
# como análisis morfológico, extracción de entidades, análisis de opinión, traducción automática, etc.

from textblob import TextBlob

In [34]:
def analisis_sentimiento(review: str) -> int:
    """
    Classify the sentiment of a review text as an integer code (0, 1 or 2).

    The classification is based on the sign of
    ``TextBlob(review).sentiment.polarity``.

    Parameters
    ----------
    review : str
        Review text. A value that is not a string (e.g. ``None`` or ``NaN``),
        or a string that is empty or contains only whitespace, is treated as
        a missing review.

    Returns
    -------
    int
        0 : the polarity is negative
        1 : the polarity is exactly zero (neutral), or the review is missing/empty
        2 : the polarity is positive
    """
    # A missing review must map to neutral. The isinstance check covers
    # None/NaN, for which a plain len() call would raise TypeError.
    if not isinstance(review, str) or not review.strip():
        return 1

    # Compute the polarity once and branch on its sign
    polaridad = TextBlob(review).sentiment.polarity
    if polaridad < 0:
        return 0
    if polaridad == 0:
        return 1
    return 2


In [35]:
# Score every review text with analisis_sentimiento and store the codes in sentiment_analysis
puntajes_sentimiento = df_reviews['review'].apply(analisis_sentimiento)
df_reviews['sentiment_analysis'] = puntajes_sentimiento

In [36]:
# Inspect the values now held by the sentiment_analysis column
df_reviews.head(2)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,2


In [37]:
# Remove, in place, the columns that are no longer needed
variables_a_excluir = [
    "user_url",
    "review",
]
df_reviews.drop(columns=variables_a_excluir, inplace=True)

In [38]:
# Write the DataFrame to a snappy-compressed parquet file, without the index
ruta_archivo = '..\\Datasets\\user_review.parquet'
df_reviews.to_parquet(
    path=ruta_archivo,
    index=False,
    compression='snappy',
)

In [39]:
# Write the DataFrame to a ';'-separated, UTF-8 encoded CSV file, without the index
ruta_archivo = '..\\Datasets\\user_review.csv'
df_reviews.to_csv(
    path_or_buf=ruta_archivo,
    encoding='utf-8',
    index=False,
    sep=';',
)