# ---- PREPARATION OF CLEAN INFO FOR API APP ----

This notebook contains the necessary transformations on the information to be processed by the functions developed for the API. The data sources are the files: items.csv, reviews.csv, games.csv, and reviews_sa.csv

In [1]:
import pandas as pd                                 # ---> Libraries to be used
import ast

In [2]:
df_items = pd.read_csv('./Datasets/processing/items.csv')              # ---> Reading of csv files previously processed with ETL techniques
df_games = pd.read_csv('./Datasets/processing/games.csv')             
#df_user_reviews = pd.read_csv('user_reviews_cleaned.csv')


### 1. Data preparation to answer 'Endpoint = PlayTimeGenre'

In [3]:
# ---> An Inner Merge of the DataFrames: df_items, df_games, on the column 'item_id' is performed
df_items_games = pd.merge(df_items, df_games[['item_id', 'released_year', 'genres']], on='item_id', how='inner')
df_items_games.head()

Unnamed: 0,item_id,user_id,hours_game,released_year,genres
0,10,76561197970982479,0.1,2000,Action
1,10,doctr,1.55,2000,Action
2,10,corrupted_soul,1.8,2000,Action
3,10,WeiEDKrSat,5.47,2000,Action
4,10,death-hunter,104.58,2000,Action


In [4]:
# ---> It is grouped by genre and year of release, and the hours of play are added up (rounded to 2 decimal places)
df_items_games = df_items_games.groupby(['released_year', 'genres'])['hours_game'].sum().reset_index()
df_items_games['hours_game'] = round(df_items_games['hours_game'], 2)
df_items_games.head()

Unnamed: 0,released_year,genres,hours_game
0,1983,Action,57.92
1,1983,Adventure,57.92
2,1983,Casual,57.92
3,1984,Action,6.4
4,1984,Adventure,6.4


In [5]:
# ---> Calculation to rank the genres with the most hours played per year 
df_genre_top_hoursgame = df_items_games.loc[df_items_games.groupby(['released_year', 'genres'])['hours_game'].idxmax()]

In [6]:
print(df_genre_top_hoursgame)

     released_year        genres  hours_game
0             1983        Action       57.92
1             1983     Adventure       57.92
2             1983        Casual       57.92
3             1984        Action        6.40
4             1984     Adventure        6.40
..             ...           ...         ...
335           2018     Adventure        0.35
336           2018  Early Access      102.12
337           2018         Indie       38.78
338           2018           RPG       63.69
339           2018    Simulation       36.71

[340 rows x 3 columns]


In [7]:
# ---> Routine to order the DataFrame: 'df_genre_top_hoursgame'
genre_top_hoursgame = df_genre_top_hoursgame.sort_values(by=['genres', 'hours_game','released_year'], ascending=[True, False, True])
print(df_genre_top_hoursgame)

     released_year        genres  hours_game
0             1983        Action       57.92
1             1983     Adventure       57.92
2             1983        Casual       57.92
3             1984        Action        6.40
4             1984     Adventure        6.40
..             ...           ...         ...
335           2018     Adventure        0.35
336           2018  Early Access      102.12
337           2018         Indie       38.78
338           2018           RPG       63.69
339           2018    Simulation       36.71

[340 rows x 3 columns]


In [8]:
# ---> Upload results to a CSV file
df_genre_top_hoursgame.to_csv('./Datasets/api/playtimegenre.csv', encoding='utf-8', index=False)

##### - 2. UserForGenre

In [9]:
'''
df_game = df_steam_games.drop(['name', 'developer', 'price', 'tags','specs'],axis=1)

# Merge de los DataFrames
merged_data=pd.merge(df_user_items,df_game,left_on='item_id', right_on='item_id', how='right')

merged_data.drop(['item_id'], axis=1, inplace=True)
merged_data.dropna(inplace=True)

merged_data['hours_game'] = merged_data['hours_game'].astype(int)
merged_data['year'] = merged_data['year'].astype(int)

#Convertimos la columna genres a tipo lista
merged_data['genres'] = merged_data['genres'].apply(ast.literal_eval)

#Expandimos las listas en columnas
merged_data = merged_data.explode('genres').reset_index(drop=True)

#Encontrar el jugador con más horas jugadas por género y año
max_hours_df = merged_data.loc[merged_data.groupby(['genres', 'year'])['hours_game'].idxmax()]

#Filtrar las columnas necesarias
max_hours_df = max_hours_df[['genres', 'year', 'user_id', 'hours_game']]

#Segunda agrupación para obtener las horas jugadas por año para el usuario seleccionado
max_hours = max_hours_df.groupby(['genres', 'year', 'user_id'])['hours_game'].sum().reset_index()

max_hours.to_csv('UserForGenre.csv', index=False)
'''

"\ndf_game = df_steam_games.drop(['name', 'developer', 'price', 'tags','specs'],axis=1)\n\n# Merge de los DataFrames\nmerged_data=pd.merge(df_user_items,df_game,left_on='item_id', right_on='item_id', how='right')\n\nmerged_data.drop(['item_id'], axis=1, inplace=True)\nmerged_data.dropna(inplace=True)\n\nmerged_data['hours_game'] = merged_data['hours_game'].astype(int)\nmerged_data['year'] = merged_data['year'].astype(int)\n\n#Convertimos la columna genres a tipo lista\nmerged_data['genres'] = merged_data['genres'].apply(ast.literal_eval)\n\n#Expandimos las listas en columnas\nmerged_data = merged_data.explode('genres').reset_index(drop=True)\n\n#Encontrar el jugador con más horas jugadas por género y año\nmax_hours_df = merged_data.loc[merged_data.groupby(['genres', 'year'])['hours_game'].idxmax()]\n\n#Filtrar las columnas necesarias\nmax_hours_df = max_hours_df[['genres', 'year', 'user_id', 'hours_game']]\n\n#Segunda agrupación para obtener las horas jugadas por año para el usuario se

##### - 3. UsersRecommend

In [10]:
'''
# Merge de los DataFrames
df_merged = pd.merge(df_user_reviews, df_steam_games, on='item_id', how='left')

# Filtrar las filas que cumplen con las condiciones
result = df_merged.loc[(df_merged['recommend'] == True) & (df_merged['sentiment_analysis'].isin([1, 2])),
                           ['year_x', 'name']]

# Renombrar la columna 'year_x' a 'year'
result = result.rename(columns={'year_x': 'year'})

# Agrupar por 'year' y 'name' y contar las ocurrencias
grouped_result = result.groupby(['year', 'name']).size().reset_index(name='count')

# Ordenar por 'year' y 'count' en orden descendente
grouped_result = grouped_result.sort_values(by=['year', 'count'], ascending=[False, False])

# Obtener el top 3 por año
top3_by_year = grouped_result.groupby('year').head(3)

# Guardar el resultado en un archivo CSV
top3_by_year.to_csv('UsersRecommend.csv', index=False)
'''

"\n# Merge de los DataFrames\ndf_merged = pd.merge(df_user_reviews, df_steam_games, on='item_id', how='left')\n\n# Filtrar las filas que cumplen con las condiciones\nresult = df_merged.loc[(df_merged['recommend'] == True) & (df_merged['sentiment_analysis'].isin([1, 2])),\n                           ['year_x', 'name']]\n\n# Renombrar la columna 'year_x' a 'year'\nresult = result.rename(columns={'year_x': 'year'})\n\n# Agrupar por 'year' y 'name' y contar las ocurrencias\ngrouped_result = result.groupby(['year', 'name']).size().reset_index(name='count')\n\n# Ordenar por 'year' y 'count' en orden descendente\ngrouped_result = grouped_result.sort_values(by=['year', 'count'], ascending=[False, False])\n\n# Obtener el top 3 por año\ntop3_by_year = grouped_result.groupby('year').head(3)\n\n# Guardar el resultado en un archivo CSV\ntop3_by_year.to_csv('UsersRecommend.csv', index=False)\n"

##### - 4. UsersWorstDeveloper

In [11]:
'''
# Unir los DataFrames
df_merged = pd.merge(df_user_reviews, df_steam_games, on='item_id', how='left')

# Filtrar las filas que cumplen con las condiciones
df_filtered = df_merged.loc[(df_merged['recommend'] == False) & (df_merged['sentiment_analysis'] == 0),
                           ['year_x', 'developer']]
 
# Renombrar la columna 'year_x' a 'year'
df_filtered = df_filtered.rename(columns={'year_x': 'year'})

# Contar las ocurrencias de cada desarrolladora por año
developer_counts = df_filtered.groupby(['year', 'developer']).size().reset_index(name='count')
    
# Ordenar por 'year' y 'count' en orden descendente
grouped_result = developer_counts.sort_values(by=['year', 'count'], ascending=[False, False])    

# Obtener el top 3 por año
result_df = grouped_result.groupby('year').head(3)    

# Guardar el resultado en un archivo CSV
result_df.to_csv('UsersWorstDeveloper.csv', index=False)
'''

"\n# Unir los DataFrames\ndf_merged = pd.merge(df_user_reviews, df_steam_games, on='item_id', how='left')\n\n# Filtrar las filas que cumplen con las condiciones\ndf_filtered = df_merged.loc[(df_merged['recommend'] == False) & (df_merged['sentiment_analysis'] == 0),\n                           ['year_x', 'developer']]\n \n# Renombrar la columna 'year_x' a 'year'\ndf_filtered = df_filtered.rename(columns={'year_x': 'year'})\n\n# Contar las ocurrencias de cada desarrolladora por año\ndeveloper_counts = df_filtered.groupby(['year', 'developer']).size().reset_index(name='count')\n    \n# Ordenar por 'year' y 'count' en orden descendente\ngrouped_result = developer_counts.sort_values(by=['year', 'count'], ascending=[False, False])    \n\n# Obtener el top 3 por año\nresult_df = grouped_result.groupby('year').head(3)    \n\n# Guardar el resultado en un archivo CSV\nresult_df.to_csv('UsersWorstDeveloper.csv', index=False)\n"

##### - 5. sentiment_analysis

In [12]:
'''
# Merge de los DataFrames
merged_df = pd.merge(df_steam_games, df_user_reviews, on='item_id', how='inner')

# Agrupar por desarrolladora y sentimiento, y contar las ocurrencias
result_df = merged_df.groupby(['developer', 'sentiment_analysis']).size().reset_index(name='count')

# Pivoteando el DataFrame para tener sentimientos como columnas
result_df = result_df.pivot(index='developer', columns='sentiment_analysis', values='count').reset_index()

# Renombrar las columnas según el formato deseado
result_df.columns = ['developer', 'Negative', 'Neutral', 'Positive']

# Rellenar NaN con 0
result_df = result_df.fillna(0)

# Convertir las columnas a tipo int
result_df[['Negative', 'Neutral', 'Positive']] = result_df[['Negative', 'Neutral', 'Positive']].astype(int)

# Guardar el resultado en un archivo CSV
result_df.to_csv('sentiment_analysis.csv', index=False)
'''

"\n# Merge de los DataFrames\nmerged_df = pd.merge(df_steam_games, df_user_reviews, on='item_id', how='inner')\n\n# Agrupar por desarrolladora y sentimiento, y contar las ocurrencias\nresult_df = merged_df.groupby(['developer', 'sentiment_analysis']).size().reset_index(name='count')\n\n# Pivoteando el DataFrame para tener sentimientos como columnas\nresult_df = result_df.pivot(index='developer', columns='sentiment_analysis', values='count').reset_index()\n\n# Renombrar las columnas según el formato deseado\nresult_df.columns = ['developer', 'Negative', 'Neutral', 'Positive']\n\n# Rellenar NaN con 0\nresult_df = result_df.fillna(0)\n\n# Convertir las columnas a tipo int\nresult_df[['Negative', 'Neutral', 'Positive']] = result_df[['Negative', 'Neutral', 'Positive']].astype(int)\n\n# Guardar el resultado en un archivo CSV\nresult_df.to_csv('sentiment_analysis.csv', index=False)\n"