En este Notebook se presenta un conjunto de pasos para realizar el proceso de Extracción, Carga y Transformación sobre el conjunto de datos users_items, con el propósito de construir un MVP (Minimum Viable Product) o Producto Mínimo Viable.

Tratamiento de Datos (Transformación): Dataset Unificado

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the three source datasets from the shared Datasets folder.
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike; a backslash is only a path separator on Windows.
# The path is relative, so it depends on the kernel's working directory.
ruta = "../Datasets/"
datasets = ["steam_games.parquet", "users_items.parquet", "user_review.parquet"]

df_game = pd.read_parquet(ruta + datasets[0])
df_items = pd.read_parquet(ruta + datasets[1])
df_reviews = pd.read_parquet(ruta + datasets[2])

Dataset steam_games

In [3]:
# Cast the item_id column to int, then preview the first two rows.
df_game = df_game.astype({"item_id": int})
df_game.head(n=2)

Unnamed: 0,genres,title,year,price,item_id,developer
0,Action,Lost Summoner Kitty,2018,4.99,761140,Kotoshiro
1,Casual,Lost Summoner Kitty,2018,4.99,761140,Kotoshiro


Dataset users_items

In [4]:
# Preview the first two rows of the users_items frame.
df_items.head(n=2)

Unnamed: 0,user_id,items_count,playtime_forever,playtime_2weeks
0,76561197970982479,277,6.0,0.0
1,76561197970982479,277,0.0,0.0


In [5]:
# Cast the item_id column of the reviews frame to int, then preview the first two rows.
df_reviews = df_reviews.astype({"item_id": int})
df_reviews.head(n=2)

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
1,76561197970982479,22200,True,2


In [6]:
# Inner-join the steam_games and user_review frames on item_id
# (inner is pandas' default join type, spelled out here for the reader).
df_game_reviews = df_game.merge(df_reviews, on='item_id', how='inner')

In [7]:
# Inner-join the games+reviews frame with df_items on user_id
# (inner is pandas' default join type, spelled out here for the reader).
# NOTE(review): the df_items preview shows repeated user_id values and no item_id
# column, so this looks like a many-to-many join that pairs every review row of a
# user with every items row of that user -- confirm this is the intended result.
df_unificado = df_game_reviews.merge(df_items, on='user_id', how='inner')

In [8]:
# Count rows that are exact duplicates across every column (the default comparison)
# and report how many will be removed.
total = df_unificado.duplicated().sum()
print(f'Cantidad de filas duplicadas a eliminar: {total}')

# Drop the duplicates in place, keeping the last occurrence of each repeated row.
df_unificado.drop_duplicates(keep='last', inplace=True)

Cantidad de filas duplicadas a eliminar: 148206


In [9]:
# Report the size of the unified frame.
n_filas, n_variables = df_unificado.shape
print(f'Filas : {n_filas}, variables : {n_variables}')

# Build one (column name, number of missing values) pair per column;
# isna() is the same check as isnull().
nulos_por_variables = [
    (nombre, df_unificado[nombre].isna().sum())
    for nombre in df_unificado.columns
]
print("Cantidad de Nulos")
nulos_por_variables

Filas : 8196278, variables : 12
Cantidad de Nulos


[('genres', 0),
 ('title', 0),
 ('year', 0),
 ('price', 0),
 ('item_id', 0),
 ('developer', 0),
 ('user_id', 0),
 ('recommend', 0),
 ('sentiment_analysis', 0),
 ('items_count', 0),
 ('playtime_forever', 12923),
 ('playtime_2weeks', 12923)]

In [10]:
# Discard, in place, every row with a missing value in either playtime column.
columnas_playtime = ['playtime_forever', 'playtime_2weeks']
df_unificado.dropna(subset=columnas_playtime, inplace=True)

# Report the shape that remains after the removal.
n_filas, n_variables = df_unificado.shape
print(f'Filas : {n_filas}, variables : {n_variables}')

Filas : 8183355, variables : 12


In [11]:
# Show a few rows as an example.
# A fixed random_state makes the same rows appear on every Restart & Run All,
# so the displayed output is reproducible.
df_unificado.sample(3, random_state=42)

Unnamed: 0,genres,title,year,price,item_id,developer,user_id,recommend,sentiment_analysis,items_count,playtime_forever,playtime_2weeks
6545786,Free to Play,Warframe,2013,0.0,230410,Digital Extremes,Moghur,True,1,127,235.0,0.0
4566958,Action,Evolve Stage 2,2015,0.0,273350,Turtle Rock Studios,76561198009026821,True,2,245,95.0,0.0
7866885,Indie,Bad Rats: the Rats' Revenge,2009,0.99,34900,Invent4 Entertainment,76561198075871336,False,2,110,175.0,0.0


In [12]:
# Persist the unified frame as a Snappy-compressed Parquet file, without the row index.
# Forward slashes keep the relative path valid on every OS; a backslash is only a
# path separator on Windows.
dataset_unificado = '../Datasets/unificado.parquet'
df_unificado.to_parquet(dataset_unificado, compression='snappy', index=False)