In [2]:
# Importacion de Librerias
import pandas as pd
import ast
import utils
import re
import gzip

In [3]:
# Se lee el dataset que contiene las reviews que hacen los usuarios y se guarda en un dataframe

# Carga del dataframe con el archivo user_reviews.json.gz
data = []
with gzip.open('dataset/user_reviews.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            # Suponiendo que las líneas contienen datos JSON válidos
            json_data = eval(line.strip())  # Suponiendo una estructura JSON simple, use ast.literal_eval() para mayor seguridad
            data.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

df_reviews = pd.DataFrame(data)
df_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [4]:
# Observamos la estructura del archivo
df_reviews.info()
df_reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [5]:
# Mostrar los nombres de las columnas disponibles del DataFrame user_reviews
print(df_reviews.columns)

# Mostrar sus tipos de datos
for column in df_reviews.columns:
    print(f"Etiqueta: {column}, Tipo de dato: {df_reviews[column].dtype}")

Index(['user_id', 'user_url', 'reviews'], dtype='object')
Etiqueta: user_id, Tipo de dato: object
Etiqueta: user_url, Tipo de dato: object
Etiqueta: reviews, Tipo de dato: object


In [6]:
# Verificar duplicados
tiene_duplicados = df_reviews['user_id'].duplicated().any()
duplicados = df_reviews[df_reviews['user_id'].duplicated()]['user_id']
print(duplicados)

456             bokkkbokkk
1182            ImSeriouss
1456     76561198062039159
1477     76561198045009232
1746          nitr0ticwolf
               ...        
17819    76561198076474887
17916          yolofaceguy
18028    76561198075591109
18234    76561198092022514
18309       SuchGayMuchWow
Name: user_id, Length: 314, dtype: object


In [7]:
# Confirmamos la data
df_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [8]:
# La columna 'reviews' se presenta anidada, siendo una lista con uno o mas diccionarios como elementos. 
# Se decide separar por diccionario

# Se transforma a columnas cada elemento de las listas
df_reviews_separado = pd.json_normalize(df_reviews['reviews'])
df_reviews_separado.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


In [9]:
# Se oberva que user_id' y 'user_url' desaparecen, lo se concatena con el dataframe anterior

# Se agrega el 'user_id' y 'user_url' a las columnas separadas 
df_reviews_separado = pd.concat([df_reviews[['user_id', 'user_url']], df_reviews_separado], axis=1)
df_reviews_separado.head()

Unnamed: 0,user_id,user_url,0,1,2,3,4,5,6,7,8,9
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,evcentric,http://steamcommunity.com/id/evcentric,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,doctr,http://steamcommunity.com/id/doctr,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,maplemage,http://steamcommunity.com/id/maplemage,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


Ahora que se tienen los diccionarios por columnas, con el usuario que genera dicha información, se genera un registro por cada diccionario, manteniendo en cada caso el usuario que lo genera.

In [10]:
# Se adiciona un registro por cada diccionario usando pd.melt
df_reviews_separado = pd.melt(df_reviews_separado, id_vars=['user_id', 'user_url'], 
                       value_vars=list(range(9)),
                       value_name='reviews')
df_reviews_separado.head()

Unnamed: 0,user_id,user_url,variable,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,js41637,http://steamcommunity.com/id/js41637,0,"{'funny': '', 'posted': 'Posted June 24, 2014...."
2,evcentric,http://steamcommunity.com/id/evcentric,0,"{'funny': '', 'posted': 'Posted February 3.', ..."
3,doctr,http://steamcommunity.com/id/doctr,0,"{'funny': '', 'posted': 'Posted October 14, 20..."
4,maplemage,http://steamcommunity.com/id/maplemage,0,"{'funny': '3 people found this review funny', ..."


In [11]:
# Obsservamos si tenemos nulos
df_reviews_separado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232191 entries, 0 to 232190
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   232191 non-null  object
 1   user_url  232191 non-null  object
 2   variable  232191 non-null  object
 3   reviews   59036 non-null   object
dtypes: object(4)
memory usage: 7.1+ MB


In [12]:
# Se eliminan las filas con valor None de la columna reviews
df_reviews_separado = df_reviews_separado.dropna()

# Obsservamos si tenemos nulos
df_reviews_separado.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59036 entries, 0 to 232129
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   59036 non-null  object
 1   user_url  59036 non-null  object
 2   variable  59036 non-null  object
 3   reviews   59036 non-null  object
dtypes: object(4)
memory usage: 2.3+ MB


In [13]:
# Se transforma  cada diccionario en columna.
# Se separan por columnas cada una de las claves de 'reviews'
df_reviews = df_reviews_separado['reviews'].apply(pd.Series, dtype='object')
df_reviews = df_reviews.add_prefix('reviews_')
df_reviews.head()

Unnamed: 0,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [14]:
# Se concatena nuevamente 'user_id' y 'user_url'
# Se une con el 'user_id' y 'user_url'
df_reviews = pd.concat([df_reviews_separado[['user_id', 'user_url']], df_reviews], axis=1)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [15]:
# Se reemplazar espacios vacios con nulos
df_reviews.replace('', None, inplace=True)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [16]:
# Se eliminan las columnas reviews_funny y reviews_last_edited que poseen nulos y que no se consideran necesarios
df_reviews = df_reviews.drop(columns=['reviews_funny', 'reviews_last_edited'])
df_reviews.columns

Index(['user_id', 'user_url', 'reviews_posted', 'reviews_item_id',
       'reviews_helpful', 'reviews_recommend', 'reviews_review'],
      dtype='object')

In [17]:
# Confirmanos la data
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,Posted February 3.,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [18]:
# Se implemente una función, que a partir de un texto, extraiga la fecha

def extraer_fecha(texto):
    match = re.search(r'(\w+\s\d{1,2},\s\d{4})', texto)
    if match:
        fecha_str = match.group(1)
        try:
            fecha_dt = pd.to_datetime(fecha_str)
            return fecha_dt.strftime('%Y-%m-%d')
        except:
            return 'Invalido'
    else:
        return 'Invalido'

In [19]:
# Se extrae la fecha de la columna reviews_posted 
df_reviews['reviews_date'] = df_reviews['reviews_posted'].apply(extraer_fecha)
df_reviews['reviews_date']

0         2011-11-05
1         2014-06-24
2           Invalido
3         2013-10-14
4         2014-04-15
             ...    
231919    2014-08-15
231921    2014-08-02
232047    2015-07-31
232127    2015-12-20
232129      Invalido
Name: reviews_date, Length: 59036, dtype: object

In [20]:
# Se analiza la cantidad de invalidos en la columna reviews_date
df_reviews[df_reviews['reviews_date'] == 'Invalido']

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,reviews_date
2,evcentric,http://steamcommunity.com/id/evcentric,Posted February 3.,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...,Invalido
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,Posted May 20.,730,0 of 1 people (0%) found this review helpful,True,ZIKA DO BAILE,Invalido
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,Posted July 24.,730,No ratings yet,True,BEST GAME IN THE BLOODY WORLD,Invalido
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,Posted June 16.,252950,0 of 1 people (0%) found this review helpful,True,love it,Invalido
10,76561198077246154,http://steamcommunity.com/profiles/76561198077...,Posted June 11.,440,No ratings yet,True,mt bom,Invalido
...,...,...,...,...,...,...,...,...
224181,steamxanbunny,http://steamcommunity.com/id/steamxanbunny,Posted April 12.,394690,No ratings yet,True,I cannot say much right now due to the game no...,Invalido
226733,kamineyyy,http://steamcommunity.com/id/kamineyyy,Posted March 28.,234140,No ratings yet,True,"Oh what a day .., What a lovely day to play th...",Invalido
228737,76561198071901614,http://steamcommunity.com/profiles/76561198071...,Posted May 17.,376210,10 of 28 people (36%) found this review helpful,True,░░░░░░░░░░░█▀▀░░█░░░░░░░░░░░▄▀▀▀▀░░░░░█▄▄░░░░░...,Invalido
229859,76561198082767148,http://steamcommunity.com/profiles/76561198082...,Posted January 3.,730,No ratings yet,False,got VACed,Invalido


In [21]:
# Se decide elimninar estos datos Invalidos
df_reviews = df_reviews[df_reviews['reviews_date'] != 'Invalido']
df_reviews

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,reviews_date
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24
3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14
4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15
5,Wackky,http://steamcommunity.com/id/Wackky,"Posted May 5, 2014.",249130,7 of 8 people (88%) found this review helpful,True,This game is Marvellous.,2014-05-05
...,...,...,...,...,...,...,...,...
231901,76561198138691719,http://steamcommunity.com/profiles/76561198138...,"Posted December 30, 2015.",332310,No ratings yet,True,Normally I would hardly play a lego based game...,2015-12-30
231919,SKELETRONPRIMEISOP,http://steamcommunity.com/id/SKELETRONPRIMEISOP,"Posted August 15, 2014.",440,No ratings yet,True,TF2 is alot of fun and its really good but the...,2014-08-15
231921,76561198141079508,http://steamcommunity.com/profiles/76561198141...,"Posted August 2, 2014.",304930,No ratings yet,True,Fun game with friends,2014-08-02
232047,ShadowYT100,http://steamcommunity.com/id/ShadowYT100,"Posted July 31, 2015.",265630,No ratings yet,True,So Fun!! :D,2015-07-31


In [22]:
# se Renombra la columna reviews_review reviews
df_reviews = df_reviews.rename(columns={'reviews_review': 'reviews'})
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews,reviews_date
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24
3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14
4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15
5,Wackky,http://steamcommunity.com/id/Wackky,"Posted May 5, 2014.",249130,7 of 8 people (88%) found this review helpful,True,This game is Marvellous.,2014-05-05


In [23]:
# Se elimina la columa reviews_posted por considerarse que no es necearia para la Api
df_reviews = df_reviews.drop('reviews_posted', axis=1)
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48936 entries, 0 to 232127
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            48936 non-null  object
 1   user_url           48936 non-null  object
 2   reviews_item_id    48936 non-null  object
 3   reviews_helpful    48936 non-null  object
 4   reviews_recommend  48936 non-null  bool  
 5   reviews            48909 non-null  object
 6   reviews_date       48936 non-null  object
dtypes: bool(1), object(6)
memory usage: 2.7+ MB


In [24]:
# En la columna reviews se elimnan los nulos
df_reviews = df_reviews.dropna(subset=['reviews'])
# Se verifican nulos
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48909 entries, 0 to 232127
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            48909 non-null  object
 1   user_url           48909 non-null  object
 2   reviews_item_id    48909 non-null  object
 3   reviews_helpful    48909 non-null  object
 4   reviews_recommend  48909 non-null  bool  
 5   reviews            48909 non-null  object
 6   reviews_date       48909 non-null  object
dtypes: bool(1), object(6)
memory usage: 2.7+ MB


In [25]:
# Se verifica la data
df_reviews

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews,reviews_date
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05
1,js41637,http://steamcommunity.com/id/js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24
3,doctr,http://steamcommunity.com/id/doctr,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14
4,maplemage,http://steamcommunity.com/id/maplemage,211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15
5,Wackky,http://steamcommunity.com/id/Wackky,249130,7 of 8 people (88%) found this review helpful,True,This game is Marvellous.,2014-05-05
...,...,...,...,...,...,...,...
231901,76561198138691719,http://steamcommunity.com/profiles/76561198138...,332310,No ratings yet,True,Normally I would hardly play a lego based game...,2015-12-30
231919,SKELETRONPRIMEISOP,http://steamcommunity.com/id/SKELETRONPRIMEISOP,440,No ratings yet,True,TF2 is alot of fun and its really good but the...,2014-08-15
231921,76561198141079508,http://steamcommunity.com/profiles/76561198141...,304930,No ratings yet,True,Fun game with friends,2014-08-02
232047,ShadowYT100,http://steamcommunity.com/id/ShadowYT100,265630,No ratings yet,True,So Fun!! :D,2015-07-31


In [26]:
df_reviews.columns

Index(['user_id', 'user_url', 'reviews_item_id', 'reviews_helpful',
       'reviews_recommend', 'reviews', 'reviews_date'],
      dtype='object')

In [27]:
"""  Se adiciona una colunmna o etiqueta sentiment_analysis asignando valores aplicando por medio de la 
 librearia textlob para análisis de sentimientos con NLP, que como se indica en la rubrica, se debe aplicar la siguiente escala:
 debe tomar el valor '0' si es malo, '1' si es neutral y '2' si es positivo. Lo anterior para facilitar el trabajo de los modelos
 de machine learning y el análisis de datos """
    
from textblob import TextBlob


# Definir la función de análisis de sentimientos
def analizar_sentimiento(review):
    if not review:
        return 1  # Neutral if the review is empty or None
    analisis = TextBlob(review)
    if analisis.sentiment.polarity < 0:
        return 0  # Malo
    elif analisis.sentiment.polarity == 0:
        return 1  # Neutral
    else:
        return 2  # Positivo

# Aplicar el análisis de sentimiento directamente
df_reviews['sentiment_analysis'] = df_reviews['reviews'].apply(analizar_sentimiento)


In [29]:
# Se verifica el estado de la nueva etiqueta
df_reviews

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews,reviews_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05,2
1,js41637,http://steamcommunity.com/id/js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24,2
3,doctr,http://steamcommunity.com/id/doctr,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14,2
4,maplemage,http://steamcommunity.com/id/maplemage,211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15,1
5,Wackky,http://steamcommunity.com/id/Wackky,249130,7 of 8 people (88%) found this review helpful,True,This game is Marvellous.,2014-05-05,0
...,...,...,...,...,...,...,...,...
231901,76561198138691719,http://steamcommunity.com/profiles/76561198138...,332310,No ratings yet,True,Normally I would hardly play a lego based game...,2015-12-30,0
231919,SKELETRONPRIMEISOP,http://steamcommunity.com/id/SKELETRONPRIMEISOP,440,No ratings yet,True,TF2 is alot of fun and its really good but the...,2014-08-15,2
231921,76561198141079508,http://steamcommunity.com/profiles/76561198141...,304930,No ratings yet,True,Fun game with friends,2014-08-02,0
232047,ShadowYT100,http://steamcommunity.com/id/ShadowYT100,265630,No ratings yet,True,So Fun!! :D,2015-07-31,2


In [30]:
# Finalizando, se exportar el dataset copia a formato parquet
df_reviews.to_parquet('dataset/df_user_reviews_final.parquet', engine='pyarrow')