In [30]:
import pandas as pd
import ast

# Path to the file that holds the user reviews (one record per line).
ruta_archivo = '../PI_ML_OPS_data util/australian_user_reviews.json'

# Each line is parsed as a Python literal with ast.literal_eval
# (presumably because the file is not strict JSON -- TODO confirm).
lista_objetos_json = []

with open(ruta_archivo, "r", encoding="utf-8") as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line at the end of the file);
        # ast.literal_eval would otherwise abort the whole load with a SyntaxError.
        if not line.strip():
            continue
        lista_objetos_json.append(ast.literal_eval(line))


# Build a pandas DataFrame from the parsed records for easier inspection.
data = pd.DataFrame(lista_objetos_json)

# Show the first records of the DataFrame.
print(data.head())

             user_id                                           user_url  \
0  76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1            js41637               http://steamcommunity.com/id/js41637   
2          evcentric             http://steamcommunity.com/id/evcentric   
3              doctr                 http://steamcommunity.com/id/doctr   
4          maplemage             http://steamcommunity.com/id/maplemage   

                                             reviews  
0  [{'funny': '', 'posted': 'Posted November 5, 2...  
1  [{'funny': '', 'posted': 'Posted June 24, 2014...  
2  [{'funny': '', 'posted': 'Posted February 3.',...  
3  [{'funny': '', 'posted': 'Posted October 14, 2...  
4  [{'funny': '3 people found this review funny',...  


In [31]:
# Drop only the rows in which every column is NaN; rows that still carry
# some data are kept.
data_cleaned = data.dropna(axis=0, how='all')

# Preview the first rows to confirm the data was read correctly.
print(data_cleaned.head(5))

             user_id                                           user_url  \
0  76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1            js41637               http://steamcommunity.com/id/js41637   
2          evcentric             http://steamcommunity.com/id/evcentric   
3              doctr                 http://steamcommunity.com/id/doctr   
4          maplemage             http://steamcommunity.com/id/maplemage   

                                             reviews  
0  [{'funny': '', 'posted': 'Posted November 5, 2...  
1  [{'funny': '', 'posted': 'Posted June 24, 2014...  
2  [{'funny': '', 'posted': 'Posted February 3.',...  
3  [{'funny': '', 'posted': 'Posted October 14, 2...  
4  [{'funny': '3 people found this review funny',...  


In [32]:
# Count the missing values per column in the raw DataFrame.
print(data.isnull().sum())

user_id     0
user_url    0
reviews     0
dtype: int64


In [33]:
# Count the missing values per column after the dropna step.
print(data_cleaned.isnull().sum())

user_id     0
user_url    0
reviews     0
dtype: int64


In [34]:
# Summarize the current structure of the DataFrame (columns, non-null counts, dtypes).
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [35]:
# Display the cleaned DataFrame as the cell's output (last-expression display).
data_cleaned

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [42]:
# Inspect the list of review dicts stored under index label 8.
data_cleaned['reviews'][8]

[{'funny': '5 people found this review funny',
  'posted': 'Posted February 1, 2015.',
  'last_edited': '',
  'item_id': '72850',
  'helpful': '3 of 3 people (100%) found this review helpful',
  'recommend': True,
  'review': 'Killed the Emperor, nobody cared and got away with it. Accidentally killed a chicken and everybody decided to gang up on me. 10/10'},
 {'funny': '1 person found this review funny',
  'posted': 'Posted June 20, 2014.',
  'last_edited': '',
  'item_id': '440',
  'helpful': '3 of 3 people (100%) found this review helpful',
  'recommend': True,
  'review': '10/10 would eat your money for hats and keys'}]

In [25]:
from textblob import TextBlob

def analyze_sentiment(text, negative_threshold=-0.2, positive_threshold=0.04):
    """Classify the sentiment of a text as 0 (negative), 1 (neutral) or 2 (positive).

    Args:
        text: Text to analyze. Anything that is not a non-blank string
            (None, other types, empty or whitespace-only strings) is
            classified as neutral without invoking TextBlob.
        negative_threshold: Polarity at or below this value is negative.
            Defaults to the original hard-coded -0.2.
        positive_threshold: Polarity at or above this value is positive.
            Defaults to the original hard-coded 0.04.

    Returns:
        int: 0 for negative, 1 for neutral, 2 for positive.
    """
    # TextBlob rejects non-string input, and blank text carries no sentiment,
    # so both are reported as neutral up front.
    if not isinstance(text, str) or not text.strip():
        return 1

    # Polarity of the text: -1 is negative, 0 is neutral, 1 is positive.
    sentiment_score = TextBlob(text).sentiment.polarity

    # Map the continuous polarity onto the three classes.
    if sentiment_score <= negative_threshold:
        return 0  # Negative sentiment
    if sentiment_score >= positive_threshold:
        return 2  # Positive sentiment
    return 1  # Neutral sentiment

def analyze_sentiments_for_reviews(reviews):
    """Classify the overall sentiment of all reviews written by one user.

    The texts under the 'review' key of each entry are joined with single
    spaces and the combined text is passed to ``analyze_sentiment``.

    Args:
        reviews: List of review dicts; each may contain a 'review' key with
            the review text.

    Returns:
        The value returned by ``analyze_sentiment`` for the combined text,
        or 1 (neutral) when there are no reviews or no usable review text.
    """
    # No reviews at all: fall back to the neutral class.
    if not reviews:
        return 1  # Neutral sentiment

    # Keep only real text; a missing or non-string 'review' value (e.g. None)
    # would otherwise make ' '.join raise a TypeError.
    review_texts = [
        review['review']
        for review in reviews
        if isinstance(review, dict) and isinstance(review.get('review'), str)
    ]
    combined_text = ' '.join(review_texts)

    # Nothing to analyze (no 'review' keys, or only blank texts): neutral.
    if not combined_text.strip():
        return 1  # Neutral sentiment

    # Run the sentiment analysis on the combined text.
    return analyze_sentiment(combined_text)

# Compute one sentiment class per row (user) from that row's list of reviews.
data_cleaned['sentiment_analysis'] = data_cleaned['reviews'].map(analyze_sentiments_for_reviews)


In [26]:
# Show the per-user sentiment class column.
data_cleaned['sentiment_analysis']

0        2
1        1
2        2
3        2
4        2
        ..
25794    2
25795    1
25796    1
25797    2
25798    2
Name: sentiment_analysis, Length: 25799, dtype: int64

In [27]:
# Frequency of each distinct value in the 'sentiment_analysis' column.
sentiment_counts = data_cleaned.sentiment_analysis.value_counts()

# Print the resulting counts.
print(sentiment_counts)

sentiment_analysis
2    15560
1     8480
0     1759
Name: count, dtype: int64


In [43]:
from textblob import TextBlob

def analyze_sentiment(review_text):
    """Classify a single review text by the sign of its TextBlob polarity.

    Args:
        review_text: Text of the review; a falsy value (empty string, None)
            is treated as neutral.

    Returns:
        int: 2 when the polarity is above zero, 1 when it is exactly zero
        (or the text is empty), 0 otherwise.
    """
    # Empty text: neutral, TextBlob is not invoked.
    if not review_text:
        return 1

    polarity = TextBlob(review_text).sentiment.polarity

    # Sign of the polarity decides the class.
    if polarity > 0:
        return 2  # Positive
    return 1 if polarity == 0 else 0  # Neutral / Negative

def _add_review_sentiment(reviews):
    """Return new review dicts, each with a 'sentiment_analysis' key added."""
    return [
        {**review_dict, 'sentiment_analysis': analyze_sentiment(review_dict.get('review', ''))}
        for review_dict in reviews
    ]


# Rebuild the 'reviews' column with annotated copies of every review dict.
# New dicts are created instead of mutating the existing ones in place: the
# dict objects stored in the column may be shared with other frames/lists built
# from the same parsed records, and in-place edits would silently change those too.
# Iterating the column directly also avoids the per-row overhead of iterrows().
data_cleaned['reviews'] = data_cleaned['reviews'].apply(_add_review_sentiment)

# Check the resulting DataFrame.
print(data_cleaned.head())


             user_id                                           user_url  \
0  76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1            js41637               http://steamcommunity.com/id/js41637   
2          evcentric             http://steamcommunity.com/id/evcentric   
3              doctr                 http://steamcommunity.com/id/doctr   
4          maplemage             http://steamcommunity.com/id/maplemage   

                                             reviews  
0  [{'funny': '', 'posted': 'Posted November 5, 2...  
1  [{'funny': '', 'posted': 'Posted June 24, 2014...  
2  [{'funny': '', 'posted': 'Posted February 3.',...  
3  [{'funny': '', 'posted': 'Posted October 14, 2...  
4  [{'funny': '3 people found this review funny',...  


In [69]:
# Inspect the list of review dicts stored under index label 16.
data_cleaned['reviews'][16]

[{'funny': '',
  'posted': 'Posted December 19, 2014.',
  'last_edited': '',
  'item_id': '33440',
  'helpful': '1 of 3 people (33%) found this review helpful',
  'recommend': False,
  'review': "This Game Doesn't Work",
  'sentiment_analysis': 0}]

In [28]:

# Write the modified DataFrame to a CSV file, leaving out the index column.
ruta_salida = '../PI_ML_OPS_data util/user_reviews.csv'
data_cleaned.to_csv(ruta_salida, index=False)