En este Notebook se presenta un conjunto de pasos para realizar el proceso de Extracción, Carga y Transformación del conjunto de datos a utilizar en el MVP (Minimum Viable Product) o Producto Mínimo Viable.

In [1]:
import pandas as pd
import numpy as np
import ast
import gzip
import re

In [2]:
# Load the user_reviews.json.gz dataset.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
ruta_archivo = '../Datasets/user_reviews.json.gz'

json_datos = []

# Read the gzip-compressed file in text mode, one record per line.
with gzip.open(ruta_archivo, 'rt', encoding='utf-8') as archivo_json:
    for linea in archivo_json:
        try:
            # Each line is evaluated as a Python literal (a string that
            # represents a dictionary), not parsed with a JSON parser.
            json_data = ast.literal_eval(linea)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for text that is not valid
            # Python (including blank lines) and ValueError for non-literal
            # content; report the offending line and keep loading the rest.
            print(f"ERROR al leer o extraer : {linea}")
            continue
        json_datos.append(json_data)

In [3]:
# Build a DataFrame from the list of parsed records (json_datos)
df_reviews = pd.DataFrame(json_datos)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [4]:
# Check the DataFrame dimensions (rows, columns)
print(f'Filas : {df_reviews.shape[0]}, Columnas : {df_reviews.shape[1]}')

Filas : 25799, Columnas : 3


In [5]:
# Count the null values in each column of the DataFrame
df_reviews.isna().sum()

user_id     0
user_url    0
reviews     0
dtype: int64

Transformación de los datos

In [6]:
# First step: un-nest the lists of dictionaries stored in the reviews column,
# so that each list element gets its own row (the original row label is repeated).
df_reviews = df_reviews.explode('reviews')

In [7]:
# Look at the reviews column after exploding it
df_reviews.head(3)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."


In [8]:
# Expand every element of the reviews column into its own set of columns
# (via pd.Series) and attach them, side by side, to the remaining columns.
tmp_df_reviews = pd.concat(
    [
        df_reviews.drop(columns=['reviews']),
        df_reviews['reviews'].apply(pd.Series),
    ],
    axis=1,
)
df_reviews = tmp_df_reviews

In [9]:
# Look at the DataFrame now that the review fields are separate columns
df_reviews.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,


In [10]:
# The expansion left a column labelled 0; reuse it as sentiment_analysis.
# NOTE(review): column 0 presumably comes from rows whose reviews value is not a
# dictionary (e.g. NaN produced by exploding an empty list) — confirm.
df_reviews.rename(columns= {0: 'sentiment_analysis'}, inplace= True)

In [11]:
# Look at the renamed column
df_reviews.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,


In [12]:
# Drop the columns that provide no useful information for this dataset
variables_a_excluir = ['funny', 'posted', 'last_edited', 'helpful']
df_reviews.drop(columns = variables_a_excluir, axis=1, inplace=True)

In [13]:
# Look at the DataFrame after dropping those columns
df_reviews.head()

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,http://steamcommunity.com/id/js41637,251610,True,I know what you think when you see this title ...,
1,js41637,http://steamcommunity.com/id/js41637,227300,True,For a simple (it's actually not all that simpl...,


In [14]:
# Initialise the sentiment_analysis column with 0. The only values this variable may
# take are '0' (negative), '1' (neutral) and '2' (positive). When the analysis is not
# possible because the written review is missing, it must take the value 1.
df_reviews['sentiment_analysis'] = 0

In [15]:
# Confirm that sentiment_analysis now holds the initial value 0
df_reviews.head(2)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,0


In [16]:
# Count how many item_id values are null (True) versus present (False)
df_reviews['item_id'].isna().value_counts()

item_id
False    59305
True        28
Name: count, dtype: int64

In [17]:
# Remove, in place, the rows whose item_id is null, then show the resulting shape
df_reviews.dropna(subset= ['item_id'], inplace=True)
df_reviews.shape

(59305, 6)

Análisis de sentimientos

In [18]:
# Librería de procesamiento del texto para Python que permite realizar tareas de Procesamiento del Lenguaje Natural 
# como análisis morfológico, extracción de entidades, análisis de opinión, traducción automática, etc.

from textblob import TextBlob

In [19]:
def analisis_sentimiento(review: str) -> int:
    """Classify a review's sentiment as 0 (negative), 1 (neutral) or 2 (positive).

    The label is derived from the sign of the polarity score that TextBlob
    computes for the text.

    Args:
        review: Text of the review. Missing values (``None``, ``NaN`` or any
            other non-string) and empty or whitespace-only strings are
            tolerated and treated as "no written review".

    Returns:
        0 if the polarity is negative, 1 if it is exactly zero or there is no
        text to analyse, 2 if the polarity is positive.
    """
    # When the written review is absent the label must default to neutral (1).
    # The isinstance check keeps None/NaN from raising TypeError.
    if not isinstance(review, str) or not review.strip():
        return 1

    # Read the polarity once; a distinct local name avoids shadowing the function.
    polaridad = TextBlob(review).sentiment.polarity
    if polaridad < 0:
        # Negative review
        return 0
    if polaridad == 0:
        # Neutral review
        return 1
    # Positive review
    return 2


In [20]:
# Overwrite sentiment_analysis with the result of applying analisis_sentimiento
# to each value of the review column
df_reviews['sentiment_analysis'] = df_reviews['review'].apply(analisis_sentimiento)

In [21]:
# Look at the values assigned to sentiment_analysis
df_reviews.head(3)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,True,Great atmosphere. The gunplay can be a bit chu...,2


In [22]:
# Drop the columns that are not kept in the final dataset (user_url and the review text)
variables_a_excluir = ["user_url", "review"]
df_reviews.drop(columns = variables_a_excluir, axis=1, inplace=True)

In [23]:
# Write the processed reviews to a parquet file.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# with backslashes, POSIX systems treat the whole string as a single file name.
ruta_archivo = '../Datasets/user_review.parquet'
df_reviews.to_parquet(ruta_archivo)