# **EDUARDO PEREZ CHAVARRIA. FT17**
## Proyecto individual
### ETL 2. Archivo a analizar:  Users_review

In [1]:
import pandas as pd
import gzip
import ast
import re
# Restaurar la configuración predeterminada de pandas para la visualización
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')


## **1. Carga de archivo "user_reviews"**

In [2]:
# Establecemos la ruta a la base de datos.
file_path = "bases/user_reviews.json.gz"

# Creamos una lista para almacenar las filas como diccionarios.
filas_review = []

# Tiene formato gz, entonces descomprimimos primero y cargamos datos en diccionario.
with gzip.open(file_path, "rt", encoding="utf-8") as file:
    for line in file:
        # Utilizamos ast.literal_eval en lugar de json.loads
        data = ast.literal_eval(line)
        filas_review.append(data)

# Convertimos la lista de datos en un DataFrame.
df_usrev = pd.DataFrame(filas_review)

# Muestra el DataFrame.
df_usrev



Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


## **2.Exploración de valores nulos (df_usrev original)**

Exploramos el porcentaje de nulos en las columnas

In [3]:
porcentaje_nulos = df_usrev.isna().mean() * 100
print(porcentaje_nulos)

user_id     0.0
user_url    0.0
reviews     0.0
dtype: float64


## **3. Verificamos duplicados**

In [4]:
duplicados_por_id = (lambda df, columna: df[df.duplicated(subset=columna, keep=False)])(df_usrev, "user_id")

# Mostrar resultados
print("filas duplicadas por id")
duplicados_por_id

filas duplicadas por id


Unnamed: 0,user_id,user_url,reviews
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."
50,Rivtex,http://steamcommunity.com/id/Rivtex,"[{'funny': '', 'posted': 'Posted December 23, ..."
83,76561198094224872,http://steamcommunity.com/profiles/76561198094...,[]
119,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,"[{'funny': '', 'posted': 'Posted August 29, 20..."
147,relesprit,http://steamcommunity.com/id/relesprit,"[{'funny': '', 'posted': 'Posted December 27, ..."
...,...,...,...
17819,76561198076474887,http://steamcommunity.com/profiles/76561198076...,"[{'funny': '', 'posted': 'Posted April 12.', '..."
17916,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
18028,76561198075591109,http://steamcommunity.com/profiles/76561198075...,"[{'funny': '', 'posted': 'Posted December 26, ..."
18234,76561198092022514,http://steamcommunity.com/profiles/76561198092...,"[{'funny': '', 'posted': 'Posted July 3.', 'la..."


Se observa una cantidad importante de duplicados. Vamos a identificar si las reviews de esos users id duplicados son identicas.

In [5]:
def obtener_info_reviews(df, columna_id, columna_reviews):
    # Filtrar solo duplicados
    duplicados = df[df.duplicated(subset=columna_id, keep=False)]

    # Contadores
    duplicados_totales = 0
    identicos = 0
    diferentes = 0

    # Lista para almacenar los índices de revisiones diferentes
    indices_reviews_diferentes = []

    # Crear un diccionario para almacenar las revisiones por user_id
    reviews_por_id = {}

    # Iterar sobre las filas duplicadas
    for index, row in duplicados.iterrows():
        user_id = row[columna_id]
        review = row[columna_reviews]

        # Si el user_id ya está en el diccionario, comparar las revisiones
        if user_id in reviews_por_id:
            duplicados_totales += 1
            if reviews_por_id[user_id] == review:
                identicos += 1
            else:
                diferentes += 1
                indices_reviews_diferentes.append(index)
        else:
            reviews_por_id[user_id] = review

    # Imprimir resultados
    print(f"Total de duplicados: {duplicados_totales}")
    print(f"Duplicados con revisiones idénticas: {identicos}")
    print(f"Duplicados con revisiones diferentes: {diferentes}")

    return indices_reviews_diferentes

indices_reviews_diferentes = obtener_info_reviews(df_usrev, "user_id", "reviews")


print("Índices de filas con revisiones diferentes:")
print(indices_reviews_diferentes)


Total de duplicados: 314
Duplicados con revisiones idénticas: 313
Duplicados con revisiones diferentes: 1
Índices de filas con revisiones diferentes:
[9027]


debido a que todos, salvo uno, son identicos, nos quedaremos solamente con el primer user_id que se encuentre.

In [6]:
def eliminar_duplicados(df, columna_id):
    # Eliminar duplicados manteniendo solo la primera aparición
    df_sin_duplicados = df.drop_duplicates(subset=columna_id, keep='first')

    return df_sin_duplicados


df_usrev = eliminar_duplicados(df_usrev, "user_id")


Comprobamos que no hay ya duplicados

In [7]:
indices_reviews_diferentes = obtener_info_reviews(df_usrev, "user_id", "reviews")

Total de duplicados: 0
Duplicados con revisiones idénticas: 0
Duplicados con revisiones diferentes: 0


##**4. Exploramos el tipo de datos que tienen las columnas**

In [8]:
resumen_tipos_columnas = df_usrev.apply(lambda x: {"tipo_dato": type(x.iloc[0]), "primeros_valores": x.iloc[:5].tolist()})

# Mostrar el resumen
for columna, resumen in resumen_tipos_columnas.items():
    print(f"Columna: {columna}")
    print(f"Tipo de dato: {resumen['tipo_dato']}")
    print(f"Primeros valores: {resumen['primeros_valores']}")
    print("\n")


Columna: user_id
Tipo de dato: <class 'str'>
Primeros valores: ['76561197970982479', 'js41637', 'evcentric', 'doctr', 'maplemage']


Columna: user_url
Tipo de dato: <class 'str'>
Primeros valores: ['http://steamcommunity.com/profiles/76561197970982479', 'http://steamcommunity.com/id/js41637', 'http://steamcommunity.com/id/evcentric', 'http://steamcommunity.com/id/doctr', 'http://steamcommunity.com/id/maplemage']


Columna: reviews
Tipo de dato: <class 'list'>
Primeros valores: [[{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}, {'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet

##**5. Desaniadamos la columna de las reviews y obtenemos el df para trabajar**

Observamos que las reviews se encuentran anidadas, así que las convertimos en filas, de tal modo que se duplique la información del user_id y de user_url para cada item_id al que se hizo review

In [9]:
dfreviewsOpen = df_usrev.explode('reviews')
df_expandido_long = pd.concat([dfreviewsOpen.drop(['reviews'], axis=1), dfreviewsOpen['reviews'].apply(pd.Series)], axis=1)
df_expandido_long

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,
...,...,...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D,


Eliminar los df secundarios y dejar unicamente df_usrev con los cambios hechos

In [10]:
# Elimina el DataFrame existente si ya existe
if "df_usrev" in globals():
    del df_usrev

# Lista de columnas a conservar
columnas_a_utilizar = ["user_id",	"user_url",	"funny",	"posted",	"last_edited",	"item_id",	"helpful",	"recommend",	"review"]

# Crea el nuevo DataFrame df_usrev
df_usrev = df_expandido_long[columnas_a_utilizar].copy()

# Muestra el nuevo DataFrame df_usrev
print(df_usrev)

# Elimina el DataFrame existente si ya existe
if "dfreviewsOpen" in globals():
    del dfreviewsOpen
# Elimina el DataFrame existente si ya existe
if "df_expandido_long" in globals():
    del df_expandido_long


                 user_id                                           user_url  \
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1                js41637               http://steamcommunity.com/id/js41637   
1                js41637               http://steamcommunity.com/id/js41637   
...                  ...                                                ...   
25797  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
25797  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   

                                  funny            

##**6. Obtener información general sobre las columnas y los tipos de datos del df desanidado"**

Vamos a obtener informacion general sobre el nuevo df, y sus valores unicos

In [11]:
for columna in df_usrev.columns:
    unique_values_count = df_usrev[columna].value_counts()

    # Imprimir el resumen para cada columna
    print(f"\nResumen para la columna: {columna}")
    print(f"Tipo de dato: {df_usrev[columna].dtype}")
    print(f"Primeros valores: {df_usrev[columna].head().tolist()}")
    print("\nValores únicos y su frecuencia:")
    print(unique_values_count)



Resumen para la columna: user_id
Tipo de dato: object
Primeros valores: ['76561197970982479', '76561197970982479', '76561197970982479', 'js41637', 'js41637']

Valores únicos y su frecuencia:
user_id
DeadNiggaStorage     10
flatwhite            10
76561198038741062    10
jefftequilla         10
76561198091682568    10
                     ..
76561198081938549     1
76561198085470909     1
76561198085144241     1
mastrix8              1
76561198071781219     1
Name: count, Length: 25485, dtype: int64

Resumen para la columna: user_url
Tipo de dato: object
Primeros valores: ['http://steamcommunity.com/profiles/76561197970982479', 'http://steamcommunity.com/profiles/76561197970982479', 'http://steamcommunity.com/profiles/76561197970982479', 'http://steamcommunity.com/id/js41637', 'http://steamcommunity.com/id/js41637']

Valores únicos y su frecuencia:
user_url
http://steamcommunity.com/id/DeadNiggaStorage           10
http://steamcommunity.com/id/flatwhite                  10
http://steam

## **7. Exploración de valores nulos df desanidado"**

Se observa que la mayor cantidad de datos en funny y last_edited son vacios. Por lo cual sería buena idea eliminar esas columnas. También los primeros valores son ' ', vamos a cambiar esos valores por NA y luego contarlos para tomar una decisión

In [12]:
#Remplazamos los vacios con Nas
df_usrev.replace('', None, inplace=True)

In [13]:
# Cuenta la cantidad de NaN en cada columna
nan_counts = df_usrev.isna().sum()

# Calcula el porcentaje de NaN en cada columna
nan_percentages = (nan_counts / len(df_usrev)) * 100

# Crea un DataFrame con los resultados
nan_table = pd.DataFrame({"Columna": nan_counts.index, "Porcentaje NaN": nan_percentages.values})


# Muestra la tabla
print(nan_table)



       Columna  Porcentaje NaN
0      user_id        0.000000
1     user_url        0.000000
2        funny       86.297855
3       posted        0.047898
4  last_edited       89.672928
5      item_id        0.047898
6      helpful        0.047898
7    recommend        0.047898
8       review        0.099217


primero eliminamos funny y last_edited, como recien se mencionó.

In [14]:
df_usrev = df_usrev.drop(['funny', 'last_edited'], axis=1)

Ahora, como la cantindad de faltantes es identica en las "posted", "item_id", "helpful", "recommend", "review" (aqui no es identica pero hay a la par de los demas) se infiere que cuando hay Na en una de ellas hay NANs en las demás, entonces vamos a borrar aquellos registros donde estas columnas tienen todas NAs

In [15]:
columns_to_check = ["posted", "item_id", "helpful", "recommend", "review"]

# Elimina filas donde todas las columnas seleccionadas son NaN
df_usrev = df_usrev.dropna(subset=columns_to_check, how="all")
print(df_usrev)



                 user_id                                           user_url  \
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1                js41637               http://steamcommunity.com/id/js41637   
1                js41637               http://steamcommunity.com/id/js41637   
...                  ...                                                ...   
25797  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
25797  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   

                          posted item_id  \
0      

volvemos a explorar los NANs. Debe haber aprox el 5% en reviews

In [16]:
nan_counts = df_usrev.isna().sum()
nan_percentages = (nan_counts / len(df_usrev)) * 100
nan_table = pd.DataFrame({"Columna": nan_counts.index, "Porcentaje NaN": nan_percentages.values})
print(nan_table)


     Columna  Porcentaje NaN
0    user_id        0.000000
1   user_url        0.000000
2     posted        0.000000
3    item_id        0.000000
4    helpful        0.000000
5  recommend        0.000000
6     review        0.051343


eliminamos esos valores para obtener un df limpio de ese tipo de datos

In [17]:
df_usrev = df_usrev.dropna(subset=["review"])
nan_counts = df_usrev.isna().sum()
nan_percentages = (nan_counts / len(df_usrev)) * 100
nan_table = pd.DataFrame({"Columna": nan_counts.index, "Porcentaje NaN": nan_percentages.values})
print(nan_table)


     Columna  Porcentaje NaN
0    user_id             0.0
1   user_url             0.0
2     posted             0.0
3    item_id             0.0
4    helpful             0.0
5  recommend             0.0
6     review             0.0


##**8. Procesar la columna con información sobre fecha"**

observamos el df para ver si nos falta modificar algo de importancia. Se nota que la columna posted, que indica la fecha en que se publica la review, tiene un formato no estandar.  

In [18]:
df_usrev

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,No ratings yet,True,It's unique and worth a playthrough.
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
1,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 10.,70,No ratings yet,True,a must have classic from steam definitely wort...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 8.,362890,No ratings yet,True,this game is a perfect remake of the original ...
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 3.,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 20.,730,No ratings yet,True,:D


Esa columna la convertiremos a datetime, primero quitaremos el "Posted" con el que inician las filas. Para mantener las filas que no se puedan coerecionar y no perder informacion cambiaremos los NaNs por "Sin dato"

In [19]:
# Eliminar "Posted " de las entradas en la columna 'posted'
df_usrev["posted"] = df_usrev["posted"].str.replace("Posted ", "")

# Convertir lo restante a fecha y manejar errores con 'coerce' para obtener NaN en caso de fallo
df_usrev["posted"] = pd.to_datetime(df_usrev["posted"], dayfirst=True, errors="coerce")

# Contar la cantidad de valores nulos en 'posted'
cantidad_nulos_en_posted = df_usrev["posted"].isnull().sum()

# Imprimir la cantidad de valores nulos
print("Cantidad de valores nulos en 'posted' después de la conversión:", cantidad_nulos_en_posted)


Cantidad de valores nulos en 'posted' después de la conversión: 9929


extremos el año de posted y creamos una nueva columna llamada "year" para almacenarlo

In [20]:
df_usrev["year"] = df_usrev["posted"].dt.year.astype("object")
# Imputar valores NaN en la columna 'year' con "Sin dato"
df_usrev['year'].fillna("Sin dato", inplace=True)

Observamos antes que la cantidad de datos que se conviertieron en nulos es importante, casi un 15% del df. Son muchos como para perderlos. Podemos probar utilizar la media del año en que se publican. Antes veamos la frecuencia de los valores en "year".

In [21]:
frecuencia_anios = df_usrev['year'].value_counts()

# Imprimir la frecuencia de los años
print("Frecuencia de los años:")
print(frecuencia_anios)

#crear df filtrado para calculo de estadisticos descriptivos
df_usrev_filtered = df_usrev[df_usrev['year'] != 'Sin dato']

# Calcular la media, mediana y moda (sin dato excluido)
media_anios = df_usrev_filtered['year'].astype(float).mean()
mediana_anios = df_usrev_filtered['year'].astype(float).median()
moda_anios = df_usrev_filtered['year'].astype(float).mode()[0]

# Calcular el error estándar
error_estandar_anios = df_usrev_filtered['year'].astype(float).std()

# Imprimir resultados
print("Estadísticas de Años (excluyendo 'Sin dato'):")
print(f"Media: {media_anios}")
print(f"Mediana: {mediana_anios}")
print(f"Moda: {moda_anios}")
print(f"Error Estándar: {error_estandar_anios}")

Frecuencia de los años:
year
2014.0      21821
2015.0      18146
Sin dato     9929
2013.0       6707
2012.0       1201
2011.0        530
2010.0         66
Name: count, dtype: int64
Estadísticas de Años (excluyendo 'Sin dato'):
Media: 2014.148191702255
Mediana: 2014.0
Moda: 2014.0
Error Estándar: 0.8426742164117421


Como se puede notar. Hay muchos valores acumulados en 2014. Imputar valores en la media inflaria ese numero. Parece ser mejor optar por dejar "Sin dato" para que las casi 10k filas sirvan a otros analisis y excluirlas de calculos numericos cuando sea pertinente para la columna year.

## **9. modificaciones finales**

veamos el df hasta el momento

In [22]:
df_usrev

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review,year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-11-05,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011.0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-07-15,22200,No ratings yet,True,It's unique and worth a playthrough.,2011.0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2011-04-21,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,2011.0
1,js41637,http://steamcommunity.com/id/js41637,2014-06-24,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014.0
1,js41637,http://steamcommunity.com/id/js41637,2013-09-08,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,2013.0
...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,NaT,70,No ratings yet,True,a must have classic from steam definitely wort...,Sin dato
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,NaT,362890,No ratings yet,True,this game is a perfect remake of the original ...,Sin dato
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,NaT,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,Sin dato
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,NaT,730,No ratings yet,True,:D,Sin dato


Antes vimos que helpfull la mayoría de los usuarios no han vertido evaluaciones, podemos borrarla ya que no nos aporta mucha información. User_url tampoco parece ser necesario para las funciones o el modelo. En el caso de posted ocurre algo parecido una vez que se extrajo el año y está en year.

In [23]:
# Lista de columnas a eliminar
columnas_a_eliminar = ["helpful", "user_url", "posted"]

# Eliminar las columnas en el DataFrame original
df_usrev.drop(columnas_a_eliminar, axis=1, inplace=True)

# Mostrar el DataFrame resultante
df_usrev


Unnamed: 0,user_id,item_id,recommend,review,year
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2011.0
0,76561197970982479,22200,True,It's unique and worth a playthrough.,2011.0
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2011.0
1,js41637,251610,True,I know what you think when you see this title ...,2014.0
1,js41637,227300,True,For a simple (it's actually not all that simpl...,2013.0
...,...,...,...,...,...
25797,76561198312638244,70,True,a must have classic from steam definitely wort...,Sin dato
25797,76561198312638244,362890,True,this game is a perfect remake of the original ...,Sin dato
25798,LydiaMorley,273110,True,had so much fun plaing this and collecting res...,Sin dato
25798,LydiaMorley,730,True,:D,Sin dato


In [24]:
print(df_usrev.info())

<class 'pandas.core.frame.DataFrame'>
Index: 58400 entries, 0 to 25798
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    58400 non-null  object
 1   item_id    58400 non-null  object
 2   recommend  58400 non-null  object
 3   review     58400 non-null  object
 4   year       58400 non-null  object
dtypes: object(5)
memory usage: 2.7+ MB
None


guardamos la base

In [25]:
data_curado = 'bases/users_review_curado.csv'
df_usrev.to_csv(data_curado, index=False, encoding='utf-8')

print(f'Has guardado tu base:  {data_curado}')

Has guardado tu base:  bases/users_review_curado.csv


importamos

In [26]:
data_curado = 'bases/users_review_curado.csv'

# Importar el archivo CSV como DataFrame
df_usrev_importado = pd.read_csv(data_curado, encoding='utf-8')

# Mostrar información del DataFrame importado
print(df_usrev_importado.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58400 entries, 0 to 58399
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    58400 non-null  object
 1   item_id    58400 non-null  int64 
 2   recommend  58400 non-null  bool  
 3   review     58400 non-null  object
 4   year       58400 non-null  object
dtypes: bool(1), int64(1), object(3)
memory usage: 1.8+ MB
None
