In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

%load_ext autoreload
%autoreload 2
import utils

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the three cleaned parquet datasets (reviews, games, user items).
df_reviews, df_games, df_items = (
    pd.read_parquet(ruta)
    for ruta in (
        '../data/user_reviews_limpo.parquet',
        '../data/steam_games_limpo.parquet',
        '../data/user_items_limpo.parquet',
    )
)

## Análisis de sentimientos

Se pide crear una nueva columna llamada 'sentiment_analysis' que reemplace a 'reviews_review' donde se realice un análisis de sentimiento de los comentarios con la siguiente escala:

* 0 si es malo,
* 1 si es neutral o está sin review,
* 2 si es positivo.
    
Dado que el objetivo de este proyecto es realizar una prueba de concepto, consiguiendo un producto mínimo viable, se realiza un análisis de sentimiento básico utilizando TextBlob que es una biblioteca de procesamiento de lenguaje natural (NLP) en Python. El objetivo de esta metodología es asignar un valor numérico a un texto, en este caso a los comentarios que los usuarios dejaron para un juego determinado, para representar si el sentimiento expresado en el texto es negativo, neutral o positivo.

Esta metodología toma una reseña de texto como entrada, utiliza TextBlob para calcular la polaridad de sentimiento y luego clasifica la reseña como negativa, neutral o positiva en función de la polaridad calculada. En este caso, se consideraron las polaridades por defecto del modelo, el cual utiliza los umbrales -0.2 y 0.2: son negativas las polaridades por debajo de -0.2, positivas las que están por encima de 0.2 y neutrales las que quedan entre ambos valores.

In [3]:
# Score each review text with the project helper and store the result as a new column.
# NOTE(review): utils.sentiment_analysis is not visible here; presumably it returns
# 0/1/2 following the scale described in the markdown above — confirm.
puntajes = df_reviews['reviews_review'].apply(utils.sentiment_analysis)
df_reviews = df_reviews.assign(sentiment_analysis=puntajes)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,reviews_date,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05,1
1,js41637,http://steamcommunity.com/id/js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24,1
2,evcentric,http://steamcommunity.com/id/evcentric,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...,Formato inválido,2
3,doctr,http://steamcommunity.com/id/doctr,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013-10-14,2
4,maplemage,http://steamcommunity.com/id/maplemage,211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014-04-15,1


In [4]:
# The reviews are now encoded numerically, so the raw text column is dropped.
df_reviews = df_reviews.drop('reviews_review', axis=1)
df_reviews.columns

Index(['user_id', 'user_url', 'reviews_item_id', 'reviews_helpful',
       'reviews_recommend', 'reviews_date', 'sentiment_analysis'],
      dtype='object')

In [5]:
# Rename the games key column 'id' to 'item_id' so it matches the key used in the later merge.
nuevos_nombres = dict(id='item_id')

df_games_rename = df_games.rename(nuevos_nombres, axis='columns')
df_games_rename.head()

Unnamed: 0,genres,price,early_access,item_id,release_anio,publisher,app_name,title,developer
0,Action,4.99,0.0,761140.0,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Casual,4.99,0.0,761140.0,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Indie,4.99,0.0,761140.0,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Simulation,4.99,0.0,761140.0,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Strategy,4.99,0.0,761140.0,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [6]:
# Cast the merge key from float to int and keep the result. Calling astype()
# without assigning it back left 'item_id' as float in df_games_rename.
df_games_rename = df_games_rename.assign(
    item_id=df_games_rename['item_id'].astype(int)
)
df_games_rename['item_id']

0        761140
0        761140
0        761140
0        761140
0        761140
          ...  
32132    610660
32132    610660
32132    610660
32133    658870
32133    658870
Name: item_id, Length: 71551, dtype: int32

### Se crea un df_datosLimpios
Para elegir posteriormente los datos a exportar

In [7]:
# Inner-join the user items with the user reviews on 'user_id'.
# NOTE(review): the join key is only 'user_id', so every review row (and its
# sentiment_analysis) is paired with ALL items the user owns, not only with the
# reviewed game (compare 'item_id' and 'reviews_item_id' in the output).
# Confirm this cross-pairing is intended before relying on per-game sentiment.
df_datosLimpios = pd.merge(df_items, df_reviews, how='inner', on='user_id')
# Only the last expression of a cell is displayed, so the former bare
# `df_datosLimpios.head()` statement was dead code and has been removed.
df_datosLimpios

Unnamed: 0,item_id,item_name,playtime_forever,steam_id,items_count,user_id,user_url_x,user_url_y,reviews_item_id,reviews_helpful,reviews_recommend,reviews_date,sentiment_analysis
0,10,Counter-Strike,6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,2011-11-05,1
1,10,Counter-Strike,6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,http://steamcommunity.com/profiles/76561197970...,22200,No ratings yet,True,2011-07-15,2
2,10,Counter-Strike,6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,http://steamcommunity.com/profiles/76561197970...,43110,No ratings yet,True,2011-04-21,1
3,20,Team Fortress Classic,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,http://steamcommunity.com/profiles/76561197970...,1250,No ratings yet,True,2011-11-05,1
4,20,Team Fortress Classic,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,http://steamcommunity.com/profiles/76561197970...,22200,No ratings yet,True,2011-07-15,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5983155,57300,Amnesia: The Dark Descent,0,76561198135905916,69,codex131,http://steamcommunity.com/id/codex131,http://steamcommunity.com/id/codex131,730,No ratings yet,True,Formato inválido,2
5983156,57300,Amnesia: The Dark Descent,0,76561198135905916,69,codex131,http://steamcommunity.com/id/codex131,http://steamcommunity.com/id/codex131,440,No ratings yet,True,Formato inválido,2
5983157,417860,Emily is Away,36,76561198135905916,69,codex131,http://steamcommunity.com/id/codex131,http://steamcommunity.com/id/codex131,273110,1 of 2 people (50%) found this review helpful,True,Formato inválido,1
5983158,417860,Emily is Away,36,76561198135905916,69,codex131,http://steamcommunity.com/id/codex131,http://steamcommunity.com/id/codex131,730,No ratings yet,True,Formato inválido,2


In [8]:
# Attach the game metadata to each item/review row, matching on 'item_id'.
df_datosLimpios2 = df_datosLimpios.merge(df_games_rename, on='item_id', how='inner')

In [9]:
# Inspect the column names available after both merges.
df_datosLimpios2.columns

Index(['item_id', 'item_name', 'playtime_forever', 'steam_id', 'items_count',
       'user_id', 'user_url_x', 'user_url_y', 'reviews_item_id',
       'reviews_helpful', 'reviews_recommend', 'reviews_date',
       'sentiment_analysis', 'genres', 'price', 'early_access', 'release_anio',
       'publisher', 'app_name', 'title', 'developer'],
      dtype='object')

### Creo el DataFrame a exportar (`df_data`)

In [10]:
# Columns kept for the exported dataset.
columnas_export = [
    'release_anio', 'genres', 'playtime_forever', 'user_id', 'item_id',
    'item_name', 'sentiment_analysis', 'reviews_recommend', 'reviews_date',
]
# Take an explicit copy so the column assignments in later cells act on an
# independent frame instead of a slice of df_datosLimpios2 (which triggers
# pandas' SettingWithCopyWarning).
df_data = df_datosLimpios2[columnas_export].copy()
df_data

Unnamed: 0,release_anio,genres,playtime_forever,user_id,item_id,item_name,sentiment_analysis,reviews_recommend,reviews_date
0,2000,Action,6,76561197970982479,10,Counter-Strike,1,True,2011-11-05
1,2000,Action,6,76561197970982479,10,Counter-Strike,2,True,2011-07-15
2,2000,Action,6,76561197970982479,10,Counter-Strike,1,True,2011-04-21
3,2000,Action,0,js41637,10,Counter-Strike,1,True,2014-06-24
4,2000,Action,0,js41637,10,Counter-Strike,1,True,2013-09-08
...,...,...,...,...,...,...,...,...,...
11854181,2016,Indie,0,inven,433920,Aveyond 4: Shadow Of The Mist,1,True,2015-04-06
11854182,2016,RPG,0,inven,433920,Aveyond 4: Shadow Of The Mist,1,True,2015-04-06
11854183,2015,Action,226,76561198134165301,352760,Kaiju Panic,1,True,Formato inválido
11854184,2015,Indie,226,76561198134165301,352760,Kaiju Panic,1,True,Formato inválido


In [11]:
# Parse 'reviews_date' and derive the review year from it.
# errors='coerce' turns unparseable values (e.g. 'Formato inválido') into NaT,
# so their year becomes NaN.
reviews_fecha = pd.to_datetime(df_data['reviews_date'], errors='coerce')
# assign() builds a new frame instead of writing into a sliced one, which
# avoids the chained-assignment warning.
df_data = df_data.assign(
    reviews_date=reviews_fecha,
    reviews_anio=reviews_fecha.dt.year,
)

In [12]:
# Display the frame to check the parsed dates and the new 'reviews_anio' column.
df_data

Unnamed: 0,release_anio,genres,playtime_forever,user_id,item_id,item_name,sentiment_analysis,reviews_recommend,reviews_date,reviews_anio
0,2000,Action,6,76561197970982479,10,Counter-Strike,1,True,2011-11-05,2011.0
1,2000,Action,6,76561197970982479,10,Counter-Strike,2,True,2011-07-15,2011.0
2,2000,Action,6,76561197970982479,10,Counter-Strike,1,True,2011-04-21,2011.0
3,2000,Action,0,js41637,10,Counter-Strike,1,True,2014-06-24,2014.0
4,2000,Action,0,js41637,10,Counter-Strike,1,True,2013-09-08,2013.0
...,...,...,...,...,...,...,...,...,...,...
11854181,2016,Indie,0,inven,433920,Aveyond 4: Shadow Of The Mist,1,True,2015-04-06,2015.0
11854182,2016,RPG,0,inven,433920,Aveyond 4: Shadow Of The Mist,1,True,2015-04-06,2015.0
11854183,2015,Action,226,76561198134165301,352760,Kaiju Panic,1,True,NaT,
11854184,2015,Indie,226,76561198134165301,352760,Kaiju Panic,1,True,NaT,


In [13]:
# The year has already been extracted, so the full review date is dropped.
df_data = df_data.drop(columns=['reviews_date'])

In [14]:
# Per-column summary from the project helper; the output lists the Python types
# found in each column and its null counts and percentages.
utils.types_data_df(df_data)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,release_anio,"[<class 'str'>, <class 'NoneType'>]",99.88,0.12,14563
1,genres,[<class 'str'>],100.0,0.0,0
2,playtime_forever,[<class 'int'>],100.0,0.0,0
3,user_id,[<class 'str'>],100.0,0.0,0
4,item_id,[<class 'int'>],100.0,0.0,0
5,item_name,[<class 'str'>],100.0,0.0,0
6,sentiment_analysis,[<class 'int'>],100.0,0.0,0
7,reviews_recommend,[<class 'bool'>],100.0,0.0,0
8,reviews_anio,[<class 'float'>],82.39,17.61,2087426


In [15]:
# Convert 'release_anio' to a number (invalid values become missing), then store it
# as nullable Int64 so missing years can coexist with integer values.
anio_lanzamiento = pd.to_numeric(df_data['release_anio'], errors='coerce')
df_data['release_anio'] = anio_lanzamiento.astype('Int64')

In [16]:
# Store the review year as nullable Int64 so rows without a valid date stay
# missing instead of forcing the column to float.
anio_resena = pd.to_numeric(df_data['reviews_anio'], errors='coerce')
df_data['reviews_anio'] = anio_resena.astype('Int64')


In [17]:
# Export the final dataset to parquet using pandas' default engine and compression.
df_data.to_parquet('../data/data_export_api.parquet')

In [20]:
# Save the same dataset as parquet compressed with Brotli.
file_path_brotli = '../data/data_export_api_brotli.parquet'
df_data.to_parquet(file_path_brotli, engine='pyarrow', compression='brotli')

In [21]:
file_path_gzip = '../data/data_export_api_gzip.parquet'

# Save the DataFrame as a parquet file compressed with Gzip.
df_data.to_parquet(file_path_gzip, engine='pyarrow', compression='gzip')