# <h1 align=center> **PROYECTO INDIVIDUAL 01 - MLOPS** </h1>
# <h2 align=center> **ETL - Extraction Transformation and Load** </h1>



In [1]:
#importacion de liberias a utilizar

import json
import pandas as pd
import pyarrow as pa
import ast
import pyarrow.parquet as pq


#### Procesamiento del dataset Steam_games.json

In [2]:
#Convertir la lista de objetos JSON en un DataFrame

registros = []
with open("steam_games.json\output_steam_games.json", "r", encoding="Latin-1") as file:
    for linea in file:
        try:
            objeto_json = json.loads(linea)
            registros.append(objeto_json)
        except json.JSONDecodeError:
            print(f"Error de formato JSON en: {linea}")

df_games = pd.DataFrame(registros)
df_games.head()


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [3]:
df_games.info() # resumen del dataset con cantidad de columnas, tipos de datos y non-nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


In [4]:
df_games.drop(columns=["publisher","title","url","early_access","reviews_url","specs"], inplace=True) # limpieza de columnas no importantes para este caso

In [5]:
df_games.dropna(subset=["id"], inplace=True) # eliminamos los valores nulos de la columna id

In [6]:
df_games.reset_index(drop=True, inplace=True) # reseteamos los indices

In [7]:
df_games.head() # Vemos de nuevo el df sin los nulls en la columna id

Unnamed: 0,genres,app_name,release_date,tags,price,id,developer
0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140,Kotoshiro
1,"[Free to Play, Indie, RPG, Strategy]",Ironbound,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,643980,Secret Level SRL
2,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",Free to Play,670290,Poolians.com
3,"[Action, Adventure, Casual]",弹炸人2222,2017-12-07,"[Action, Adventure, Casual]",0.99,767400,彼岸领域
4,,Log Challenge,,"[Action, Indie, Casual, Sports]",2.99,773570,


In [8]:
df_games.isnull().sum() # miramos los nulos

genres          3282
app_name           1
release_date    2066
tags             162
price           1377
id                 0
developer       3298
dtype: int64

In [9]:
df_games['release_year'] = df_games['release_date'].str.extract(r'(\d{4})') #Extraemos el año de la columna release_date con la expresion regular r'(\d{4})'


In [10]:
df_games.drop(columns=["release_date"],inplace=True) # eliminamos la columna release_date ya que se sacamos la info a utilizar, el anio

In [11]:
df_games.info() # validamos la columnas restantes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28851 non-null  object
 1   app_name      32132 non-null  object
 2   tags          31971 non-null  object
 3   price         30756 non-null  object
 4   id            32133 non-null  object
 5   developer     28835 non-null  object
 6   release_year  29966 non-null  object
dtypes: object(7)
memory usage: 1.7+ MB


In [12]:
#Crear dummies para la columna genres
df_games['genres'] = df_games['genres'].fillna('[]')  # Rellenar los valores faltantes con una lista vacía
df_games['genres'] = df_games['genres'].apply(lambda x: ', '.join(x))  # Convertir la lista de géneros a una cadena separada por comas

#Crear variables ficticias para los géneros
dummy_genres = df_games['genres'].str.get_dummies(', ') 

#Concatenar las variables ficticias con el DataFrame original
df_games = pd.concat([df_games, dummy_genres], axis=1)
df_games.head()

Unnamed: 0,genres,app_name,tags,price,id,developer,release_year,Accounting,Action,Adventure,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,[,]
0,"Action, Casual, Indie, Simulation, Strategy",Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140,Kotoshiro,2018.0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,"Free to Play, Indie, RPG, Strategy",Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,643980,Secret Level SRL,2018.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,"Casual, Free to Play, Indie, Simulation, Sports",Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...",Free to Play,670290,Poolians.com,2017.0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,"Action, Adventure, Casual",弹炸人2222,"[Action, Adventure, Casual]",0.99,767400,彼岸领域,2017.0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,"[, ]",Log Challenge,"[Action, Indie, Casual, Sports]",2.99,773570,,,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [13]:
df_games.drop(columns=["[","]","genres"],inplace=True) # eliminamos la columna genres y los [] que no nos sirven

In [14]:
df_games.isnull().sum() #Verificamos cuanto nulos hay aun 

app_name                        1
tags                          162
price                        1377
id                              0
developer                    3298
release_year                 2167
Accounting                      0
Action                          0
Adventure                       0
Animation &amp; Modeling        0
Audio Production                0
Casual                          0
Design &amp; Illustration       0
Early Access                    0
Education                       0
Free to Play                    0
Indie                           0
Massively Multiplayer           0
Photo Editing                   0
RPG                             0
Racing                          0
Simulation                      0
Software Training               0
Sports                          0
Strategy                        0
Utilities                       0
Video Production                0
Web Publishing                  0
dtype: int64

In [15]:
df_games.fillna('0',inplace=True) # los reemplazamos con 0 
df_games.isnull().sum()

app_name                     0
tags                         0
price                        0
id                           0
developer                    0
release_year                 0
Accounting                   0
Action                       0
Adventure                    0
Animation &amp; Modeling     0
Audio Production             0
Casual                       0
Design &amp; Illustration    0
Early Access                 0
Education                    0
Free to Play                 0
Indie                        0
Massively Multiplayer        0
Photo Editing                0
RPG                          0
Racing                       0
Simulation                   0
Software Training            0
Sports                       0
Strategy                     0
Utilities                    0
Video Production             0
Web Publishing               0
dtype: int64

In [16]:
df_games['release_year'] = df_games['release_year'].astype(int) # sin valores nan, cambiamos el tipo de dato de la columna año de lanzamiento 

In [17]:
df_games = df_games.rename(columns={'id': 'item_id'}) # renombramos la columna id por item_id

In [18]:
string_prices = df_games[df_games['price'].apply(lambda x: isinstance(x, str))] # verificar en la columna precio si hay valores en str
print(string_prices['price'].value_counts())

price
0                                1377
Free                              905
Free to Play                      520
Free To Play                      462
Free Mod                            4
Free Demo                           3
Play for Free!                      2
Third-party                         2
Play Now                            2
Starting at $499.00                 1
Free Movie                          1
Free to Try                         1
Starting at $449.00                 1
Install Theme                       1
Play the Demo                       1
Free HITMAN™ Holiday Pack           1
Play WARMACHINE: Tactics Demo       1
Install Now                         1
Free to Use                         1
Name: count, dtype: int64


In [19]:
df_games.loc[df_games['price'].apply(lambda x: isinstance(x, str)), 'price'] = 0 # convertir los valores de la columna price que son strings a cero


In [51]:
df_games.info() # revisa si aun hay valores tipo string

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   app_name                   32133 non-null  object
 1   tags                       32133 non-null  object
 2   price                      32133 non-null  object
 3   item_id                    32133 non-null  object
 4   developer                  32133 non-null  object
 5   release_year               32133 non-null  int32 
 6   Accounting                 32133 non-null  int64 
 7   Action                     32133 non-null  int64 
 8   Adventure                  32133 non-null  int64 
 9   Animation &amp; Modeling   32133 non-null  int64 
 10  Audio Production           32133 non-null  int64 
 11  Casual                     32133 non-null  int64 
 12  Design &amp; Illustration  32133 non-null  int64 
 13  Early Access               32133 non-null  int64 
 14  Educat

In [21]:
games=df_games.to_csv('games.csv',index=False) # creamos un csv con el df_games final

In [22]:
games = pd.read_csv('games.csv') #leer el archivo CSV en un DataFrame de pandas

table = pa.Table.from_pandas(games) # convertir el DataFrame de pandas a una tabla de PyArrow

pq.write_table(table, 'games.parquet') # Escribir la tabla en un archivo Parquet. 

#Parquet por su ventaja sobre los archivo csv en terminos de espacio en almacenamiento y velocidad de lectura y escritura. 

In [23]:
table = pq.read_table('games.parquet') # Leer el archivo Parquet en una tabla de PyArrow

games_parquet = table.to_pandas() # Convertir la tabla de PyArrow a un DataFrame de pandas

#### Procesamiento del dataset users_items.json

In [24]:
filas = list()
with open("users_items.json\\australian_users_items.json", "r", encoding="Latin-1") as archivo:
    for linea in archivo.readlines():
        filas.append(ast.literal_eval(linea))

df_items = pd.DataFrame(filas)
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [25]:
df_items = df_items.explode("items").reset_index() # Creamos una nueva fila para cada elemento de la lista de la columna items
df_items = df_items.drop(columns="index")

In [26]:
df_items = pd.concat([df_items, pd.json_normalize(df_items['items'])], axis=1) # unimos las columnas resultantes del explode
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '20', 'item_name': 'Team Fortress ...",20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '30', 'item_name': 'Day of Defeat'...",30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '40', 'item_name': 'Deathmatch Cla...",40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '50', 'item_name': 'Half-Life: Opp...",50,Half-Life: Opposing Force,0.0,0.0


In [27]:
df_items.isnull().sum() # verificamos los nulos

user_id                 0
items_count             0
steam_id                0
user_url                0
items               16806
item_id             16806
item_name           16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

In [28]:
df_items = df_items.dropna() # eliminamos los nulos
df_items.isnull().sum() # y volvemos a verificar

user_id             0
items_count         0
steam_id            0
user_url            0
items               0
item_id             0
item_name           0
playtime_forever    0
playtime_2weeks     0
dtype: int64

In [29]:
df_items.info() # vemos el estus de las data y los tipo de datos por columna

<class 'pandas.core.frame.DataFrame'>
Index: 5153209 entries, 0 to 5170013
Data columns (total 9 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   steam_id          object 
 3   user_url          object 
 4   items             object 
 5   item_id           object 
 6   item_name         object 
 7   playtime_forever  float64
 8   playtime_2weeks   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 393.2+ MB


In [30]:
df_items.drop(["user_url","items"], axis=1, inplace=True) # eliminamos columnas que no se utilizaran

In [31]:
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,50,Half-Life: Opposing Force,0.0,0.0


In [32]:
items= df_items.to_csv('items.csv',index=False) # convertimos a csv
items= pd.read_csv('items.csv') # leemos el archivo csv creado

In [33]:
items.head()

Unnamed: 0,user_id,items_count,steam_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,50,Half-Life: Opposing Force,0.0,0.0


In [34]:
items = pd.read_csv('items.csv') # Leer el archivo CSV en un DataFrame de pandas

table = pa.Table.from_pandas(items) # Convertir el DataFrame de pandas a una tabla de PyArrow

pq.write_table(table, 'items.parquet') # Escribir la tabla en un archivo Parquet

In [35]:
table = pq.read_table('items.parquet') # Leer el archivo Parquet en una tabla de PyArrow

items_parquet = table.to_pandas() # Convertir la tabla de PyArrow a un DataFrame de pandas

items_parquet # Imprimir el DataFrame


Unnamed: 0,user_id,items_count,steam_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...
5153204,76561198329548331,7,76561198329548331,346330,BrainBread 2,0.0,0.0
5153205,76561198329548331,7,76561198329548331,373330,All Is Dust,0.0,0.0
5153206,76561198329548331,7,76561198329548331,388490,One Way To Die: Steam Edition,3.0,3.0
5153207,76561198329548331,7,76561198329548331,521570,You Have 10 Seconds 2,4.0,4.0


#### Procesamiento del dataset user_reviews.json

In [36]:
filas = list()
with open("user_reviews.json\\australian_user_reviews.json", "r",encoding="Latin-1") as archivo:
    for linea in archivo.readlines():
        filas.append(ast.literal_eval(linea))

df_reviews = pd.DataFrame(filas)
df_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [37]:
df_reviews = df_reviews.explode("reviews").reset_index() # Hacemos explode en la columa reviews para separar ese diccionario
df_reviews = df_reviews.drop(columns="index")
df_reviews = pd.concat([df_reviews, pd.json_normalize(df_reviews['reviews'])], axis=1)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...",,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011....",,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011...",,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....",,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2...",,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [38]:
df_reviews.info() # Observamos los nombres de las columnas su tipo de dato y los no nulos.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59333 non-null  object
 1   user_url     59333 non-null  object
 2   reviews      59305 non-null  object
 3   funny        59305 non-null  object
 4   posted       59305 non-null  object
 5   last_edited  59305 non-null  object
 6   item_id      59305 non-null  object
 7   helpful      59305 non-null  object
 8   recommend    59305 non-null  object
 9   review       59305 non-null  object
dtypes: object(10)
memory usage: 4.5+ MB


In [39]:
nulos_reviews = df_reviews.isna().sum() # Verificamos nulos
nulos_reviews

user_id         0
user_url        0
reviews        28
funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

In [40]:
df_reviews['year'] = df_reviews['posted'].str.extract(r'(\d{4})') ## Utilizar una expresión regular para extraer el año
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews,funny,posted,last_edited,item_id,helpful,recommend,review,year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...",,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011....",,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,2011
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011...",,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,2011
3,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....",,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014
4,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2...",,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,2013


In [41]:
df_reviews["year"].isna().value_counts() # revisando cantidad de nulos en year

year
False    49186
True     10147
Name: count, dtype: int64

In [42]:
df_reviews = df_reviews.dropna(subset=['year']) # Eliminamos los valores vacíos de la columna año


In [43]:
df_reviews.isnull().sum() # revision de nulos en el dataset

user_id        0
user_url       0
reviews        0
funny          0
posted         0
last_edited    0
item_id        0
helpful        0
recommend      0
review         0
year           0
dtype: int64

#### Análisis de sentimientos

In [44]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(review):
    
    if isinstance(review, str):
        sentiment = analyzer.polarity_scores(review)

        if sentiment['compound'] >= 0.05:
            return 2
        elif sentiment['compound'] <= -0.05:
            return 0
        else:
            return 1
    else:
        return 1
        
#Aplica la función de análisis de sentimiento a la columna 'reviews' si existe al menos una reseña

df_reviews['sentiment_analysis'] = df_reviews['review'].apply(analyze_sentiment)


In [45]:
df_reviews["sentiment_analysis"].value_counts()

sentiment_analysis
2    31657
1     9926
0     7603
Name: count, dtype: int64

In [46]:
df_reviews.drop(["reviews",'user_url', 'funny', 'last_edited',"posted","review"], axis=1, inplace=True) # Eliminamos columnas irrelevantes de reviews

In [47]:
df_reviews.info() # status de las columnas que utilizaremos posteriormente para el analisis

<class 'pandas.core.frame.DataFrame'>
Index: 49186 entries, 0 to 59304
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             49186 non-null  object
 1   item_id             49186 non-null  object
 2   helpful             49186 non-null  object
 3   recommend           49186 non-null  object
 4   year                49186 non-null  object
 5   sentiment_analysis  49186 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.6+ MB


In [48]:
reviews= df_reviews.to_csv('reviews.csv',index=False) # reconvertir a csv

In [49]:
reviews = pd.read_csv('reviews.csv') # Leer el archivo CSV en un DataFrame de pandas

table = pa.Table.from_pandas(reviews) # Convertir el DataFrame de pandas a una tabla de PyArrow

pq.write_table(table, 'reviews.parquet') # Escribir la tabla en un archivo Parquet

In [50]:
table = pq.read_table('reviews.parquet') # Leer el archivo Parquet en una tabla de PyArrow

reviews_parquet = table.to_pandas() # Convertir la tabla de PyArrow a un DataFrame de pandas

reviews_parquet # Convertir la tabla de PyArrow a un DataFrame de pandas


Unnamed: 0,user_id,item_id,helpful,recommend,year,sentiment_analysis
0,76561197970982479,1250,No ratings yet,True,2011,2
1,76561197970982479,22200,No ratings yet,True,2011,2
2,76561197970982479,43110,No ratings yet,True,2011,2
3,js41637,251610,15 of 20 people (75%) found this review helpful,True,2014,2
4,js41637,227300,0 of 1 people (0%) found this review helpful,True,2013,2
...,...,...,...,...,...,...
49181,wayfeng,730,1 of 1 people (100%) found this review helpful,True,2015,1
49182,76561198251004808,253980,No ratings yet,True,2015,2
49183,72947282842,730,No ratings yet,True,2015,0
49184,ApxLGhost,730,No ratings yet,True,2015,2
