# ---- PREPARATION OF CLEAN INFO FOR FASTAPI ----

This notebook applies the transformations needed to prepare the data that the API functions consume. The data sources are the files: items.csv, reviews.csv, games.csv, and reviews_sa.csv.

In [None]:
import pandas as pd                                         # ---> Libraries to be used
import ast

In [None]:
# ---> Load the CSV files produced by the earlier ETL stage (items, games, reviews with sentiment)
df_items, df_games, df_reviews = (
    pd.read_csv('./Datasets/processing/items.csv'),
    pd.read_csv('./Datasets/processing/games.csv'),
    pd.read_csv('./Datasets/processing/reviews_sa.csv'),
)

### 1. Data preparation to answer 'Endpoint1 = PlayTimeGenre'

In [None]:
# ---> Inner join of df_items with df_games on 'item_id'; only the 'released_year' and
#      'genres' columns of df_games are brought over
df_items_games = df_items.merge(
    df_games[['item_id', 'released_year', 'genres']],
    how='inner',
    on='item_id',
)
df_items_games.head()

In [None]:
# ---> Total hours played for every (release year, genre) pair, rounded to 2 decimal places
df_items_games = (
    df_items_games
    .groupby(['released_year', 'genres'])['hours_game']
    .sum()
    .round(2)
    .reset_index()
)
df_items_games.head()

In [None]:
# ---> Calculation to rank the genres with the most hours played per year
# NOTE(review): the grouping keys used here are the same ('released_year', 'genres') pair
# that the aggregation cell above already reduced to one row each, so idxmax() returns the
# index of every row and this selection keeps the whole table. If a single winner per year
# (or per genre) is intended, the groupby should use only that key - confirm against what
# the PlayTimeGenre endpoint expects.
df_genre_top_hoursgame = df_items_games.loc[df_items_games.groupby(['released_year', 'genres'])['hours_game'].idxmax()]

In [None]:
# ---> Display the selected rows for visual inspection
print(df_genre_top_hoursgame)

In [None]:
# ---> Sort 'df_genre_top_hoursgame' by genre (A-Z), then hours played (highest first),
#      then release year (oldest first).
# The sorted frame is assigned back to 'df_genre_top_hoursgame': the original stored it only
# in 'genre_top_hoursgame', so the frame printed here and exported afterwards was unsorted.
df_genre_top_hoursgame = df_genre_top_hoursgame.sort_values(
    by=['genres', 'hours_game', 'released_year'],
    ascending=[True, False, True],
)
genre_top_hoursgame = df_genre_top_hoursgame  # former variable name kept as an alias
print(df_genre_top_hoursgame)

In [None]:
# ---> Write the result to './Datasets/api/playtimegenre.csv' (UTF-8, index column omitted)
df_genre_top_hoursgame.to_csv('./Datasets/api/playtimegenre.csv', index=False, encoding='utf-8')

### 2. Data preparation to answer 'Endpoint2 = UserForGenre'

In [None]:
# ---> Right join of df_items with df_games on 'item_id': every row of df_games is kept,
#      including games that have no matching row in df_items
df_items_games2 = df_items.merge(df_games, how='right', on='item_id')
df_items_games2.head()

In [None]:
# ---> Column summary (non-null counts and dtypes) of the merged frame
df_items_games2.info()

In [None]:
# ---> (rows, columns) of the merged frame before the cleaning step below
df_items_games2.shape

In [None]:
# ---> Drop the 'item_id' join key and then every row that still has a missing value;
#      both operations modify df_items_games2 in place
df_items_games2.drop(columns=['item_id'], inplace=True)
df_items_games2.dropna(axis=0, how='any', inplace=True)
df_items_games2.shape

In [None]:
# ---> Cast 'hours_game' to int64 (float-to-integer conversion); the other columns are untouched
df_items_games2 = df_items_games2.astype({'hours_game': 'int64'})

In [None]:
# ---> Re-check the column dtypes after the integer conversion
df_items_games2.info()

In [None]:
# ---> For every (genre, release year) pair keep the single row with the highest 'hours_game'
df_topuser_genrehours = df_items_games2.loc[
    df_items_games2
    .groupby(by=['genres', 'released_year'])['hours_game']
    .idxmax()
]
df_topuser_genrehours.head()

In [None]:
# ---> Keep only the columns needed downstream, in this order
df_topuser_genrehours = df_topuser_genrehours.loc[
    :, ['genres', 'released_year', 'user_id', 'hours_game']
]
df_topuser_genrehours.head()

In [None]:
# ---> Sum 'hours_game' per (genre, release year, user); the grouping keys come back as
#      regular columns
df_topuser_genrehours2 = df_topuser_genrehours.groupby(
    ['genres', 'released_year', 'user_id'],
    as_index=False,
)['hours_game'].sum()
df_topuser_genrehours2.head()

In [None]:
# ---> Write the result to './Datasets/api/userforgenre.csv' (UTF-8, index column omitted)
df_topuser_genrehours2.to_csv('./Datasets/api/userforgenre.csv', index=False, encoding='utf-8')

### 3. Data preparation to answer 'Endpoint3 = UsersRecommend'

In [None]:
# ---> Left join of df_reviews with df_games on 'item_id': every review is kept, with the
#      game columns attached where a matching game exists
df_reviews_games = df_reviews.merge(df_games, how='left', on='item_id')
df_reviews_games.head()

In [None]:
# ---> Column summary (non-null counts and dtypes) of the reviews/games merge
df_reviews_games.info()

In [None]:
# ---> Keep the reviews where 'recommend' is True and 'sentiment' is 1 or 2 (the neutral and
#      positive scores), retaining only the 'released_year' and 'app_name' columns
df_conditions = df_reviews_games.loc[
    df_reviews_games['recommend'].eq(True)
    & df_reviews_games['sentiment'].isin([1, 2]),
    ['released_year', 'app_name'],
]
print(df_conditions)

In [None]:
# ---> Number of qualifying reviews per (release year, game), stored in a 'count' column
df_conditions1 = (
    df_conditions
    .groupby(by=['released_year', 'app_name'])
    .size()
    .reset_index(name='count')
)

In [None]:
# ---> Newest release year first and, within each year, the highest count first
df_conditions1 = df_conditions1.sort_values(
    by=['released_year', 'count'],
    ascending=False,
)
df_conditions1.head()

In [None]:
# ---> Inspect the column dtypes before converting 'released_year'
df_conditions1.info()

In [None]:
# ---> Cast 'released_year' to int64 (float-to-integer conversion); the other columns are untouched
df_conditions1 = df_conditions1.astype({'released_year': 'int64'})

In [None]:
# ---> Re-check the column dtypes after the 'released_year' conversion
df_conditions1.info()

In [None]:
# ---> First 3 rows of each release year; df_conditions1 is already sorted by count
#      (descending) within each year, so these are the 3 most recommended games per year
df_top_year = df_conditions1.groupby(by='released_year').head(n=3)
print(df_top_year)

In [None]:
# ---> Write the result to './Datasets/api/usersrecommend.csv' (UTF-8, index column omitted)
df_top_year.to_csv('./Datasets/api/usersrecommend.csv', index=False, encoding='utf-8')

##### - 4. UsersWorstDeveloper

In [None]:
# NOTE(review): disabled draft for the 'UsersWorstDeveloper' endpoint. The code is wrapped in
# a string literal, so none of it is executed. It refers to 'df_user_reviews' and
# 'df_steam_games', which are not defined in the visible cells of this notebook, and to the
# columns 'sentiment_analysis' / 'year_x' instead of the 'sentiment' / 'released_year' used
# in the cells above - it would need adapting before being enabled.
'''
# Unir los DataFrames
df_merged = pd.merge(df_user_reviews, df_steam_games, on='item_id', how='left')

# Filtrar las filas que cumplen con las condiciones
df_filtered = df_merged.loc[(df_merged['recommend'] == False) & (df_merged['sentiment_analysis'] == 0),
                           ['year_x', 'developer']]
 
# Renombrar la columna 'year_x' a 'year'
df_filtered = df_filtered.rename(columns={'year_x': 'year'})

# Contar las ocurrencias de cada desarrolladora por año
developer_counts = df_filtered.groupby(['year', 'developer']).size().reset_index(name='count')
    
# Ordenar por 'year' y 'count' en orden descendente
grouped_result = developer_counts.sort_values(by=['year', 'count'], ascending=[False, False])    

# Obtener el top 3 por año
result_df = grouped_result.groupby('year').head(3)    

# Guardar el resultado en un archivo CSV
result_df.to_csv('UsersWorstDeveloper.csv', index=False)
'''

##### - 5. sentiment_analysis

In [None]:
# NOTE(review): disabled draft for the 'sentiment_analysis' endpoint. The code is wrapped in
# a string literal, so none of it is executed. It refers to 'df_steam_games' and
# 'df_user_reviews', which are not defined in the visible cells of this notebook, and to a
# 'sentiment_analysis' column instead of the 'sentiment' column used in the cells above -
# it would need adapting before being enabled.
'''
# Merge de los DataFrames
merged_df = pd.merge(df_steam_games, df_user_reviews, on='item_id', how='inner')

# Agrupar por desarrolladora y sentimiento, y contar las ocurrencias
result_df = merged_df.groupby(['developer', 'sentiment_analysis']).size().reset_index(name='count')

# Pivoteando el DataFrame para tener sentimientos como columnas
result_df = result_df.pivot(index='developer', columns='sentiment_analysis', values='count').reset_index()

# Renombrar las columnas según el formato deseado
result_df.columns = ['developer', 'Negative', 'Neutral', 'Positive']

# Rellenar NaN con 0
result_df = result_df.fillna(0)

# Convertir las columnas a tipo int
result_df[['Negative', 'Neutral', 'Positive']] = result_df[['Negative', 'Neutral', 'Positive']].astype(int)

# Guardar el resultado en un archivo CSV
result_df.to_csv('sentiment_analysis.csv', index=False)
'''