# ---- ETL FOR FILE OUTPUT_STEAM_GAMES ----

In [1]:
# ---> Libraries to be used
import ast
import json
import os
import warnings

import pandas as pd
# NOTE(review): the 'ignore' action silences every warning category for the rest of the session,
# including the ones pandas emits; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

path_games = './Datasets/output_steam_games.json'       # ---> Path to the JSON source file (relative to the working directory)

# I. EXTRACTION

### 1.1  Conversion of the file 'output_steam_games.json' to a python-recognizable structure

In [2]:
# ---> Parse the JSON-lines file into a list of Python objects, one per non-blank line.
#      The file handle is iterated directly so the raw text is streamed line by line
#      instead of being held in memory as a second full copy (as f.readlines() would do).
with open(path_games, 'r', encoding='utf-8') as f:      # ---> Opening the JSON file
    lst_games = [
        json.loads(line)                                # ---> Conversion of each line (string) to a JSON object
        for line in f
        if line.strip()                                 # ---> Blank lines are skipped; json.loads would raise on them
    ]

In [3]:
pd.options.display.max_colwidth = None                  # ---> Option to allow maximum content visualization (no column-width truncation)
df_games = pd.DataFrame(data=lst_games)                 # ---> Creation of DataFrame 'df_games' from the parsed records
df_games.head(n=5)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


### 1.2  DataFrame overview

In [4]:
# ---> (rows, columns) of the freshly loaded DataFrame, before any clean-up
df_games.shape

(120445, 13)

In [5]:
# ---> Per-column non-null counts and dtypes of the freshly loaded DataFrame
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


# II. TRANSFORMATION

### 2.1 Clean-up actions

In [6]:
# ---> Removal of complete rows without information (first round):
#      a row is kept when at least one of its fields is not NaN
df_games = df_games.loc[df_games.notna().any(axis=1)]
df_games.shape

(32135, 13)

In [7]:
# ---> Removal of complete columns without relevant information (first round)
df_games1 = df_games.drop(
    columns=['publisher', 'title', 'url', 'tags', 'price', 'early_access']
)
df_games1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32135 entries, 88310 to 120444
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28852 non-null  object
 1   app_name      32133 non-null  object
 2   release_date  30068 non-null  object
 3   reviews_url   32133 non-null  object
 4   specs         31465 non-null  object
 5   id            32133 non-null  object
 6   developer     28836 non-null  object
dtypes: object(7)
memory usage: 2.0+ MB


In [8]:
# ---> Preview of the first rows of 'df_games1' after the first column removal
df_games1.head()

Unnamed: 0,genres,app_name,release_date,reviews_url,specs,id,developer
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro
88311,"[Free to Play, Indie, RPG, Strategy]",Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL
88312,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,2017-07-24,http://steamcommunity.com/app/670290/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, In-App Purchases, Stats]",670290,Poolians.com
88313,"[Action, Adventure, Casual]",弹炸人2222,2017-12-07,http://steamcommunity.com/app/767400/reviews/?browsefilter=mostrecent&p=1,[Single-player],767400,彼岸领域
88314,,Log Challenge,,http://steamcommunity.com/app/773570/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Full controller support, HTC Vive, Oculus Rift, Tracked Motion Controllers, Room-Scale]",773570,


In [9]:
# ---> Creation of the DataFrame 'df_games_raw' to develop the recommendation model (item-item):
#      'df_games1' without the 'genres', 'release_date', 'reviews_url' and 'developer' columns
df_games_raw = df_games1.drop(
    labels=['genres', 'release_date', 'reviews_url', 'developer'],
    axis='columns',
)
df_games_raw.head(n=5)

Unnamed: 0,app_name,specs,id
88310,Lost Summoner Kitty,[Single-player],761140
88311,Ironbound,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980
88312,Real Pool 3D - Poolians,"[Single-player, Multi-player, Online Multi-Player, In-App Purchases, Stats]",670290
88313,弹炸人2222,[Single-player],767400
88314,Log Challenge,"[Single-player, Full controller support, HTC Vive, Oculus Rift, Tracked Motion Controllers, Room-Scale]",773570


In [10]:
# ---> Per-column non-null counts and dtypes of 'df_games_raw'
df_games_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32135 entries, 88310 to 120444
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  32133 non-null  object
 1   specs     31465 non-null  object
 2   id        32133 non-null  object
dtypes: object(3)
memory usage: 1004.2+ KB


### 2.2  Process for unnesting column 'genres' from python DataFrame 'df_games1'

In [11]:
# ---> Unnesting column 'genres' from 'df_games1': one output row per list element,
#      each keeping the index label of the row it came from
df_games2 = df_games1.explode(column='genres', ignore_index=False)

In [12]:
# ---> Preview of the exploded DataFrame (index labels repeat for rows that came from the same game)
df_games2.head()

Unnamed: 0,genres,app_name,release_date,reviews_url,specs,id,developer
88310,Action,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro
88310,Casual,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro
88310,Indie,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro
88310,Simulation,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro
88310,Strategy,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro


In [13]:
# ---> (rows, columns) of 'df_games2' after unnesting 'genres'
df_games2.shape

(74837, 7)

### 2.3  Formatting and type handling for data

In [14]:
# ---> Removal of complete rows without information (second round)
df_games2 = df_games2.dropna(axis='index', how='all')

In [15]:
# ---> Creation of 'released_year' column from 'release_date' column:
#      the first run of four digits found in each value (NaN when there is none)
df_games2['released_year'] = df_games2['release_date'].str.extract(r'(\d{4})', expand=False)

In [16]:
# ---> Preview of the first 10 rows, now including the 'released_year' column
df_games2.head(10)

Unnamed: 0,genres,app_name,release_date,reviews_url,specs,id,developer,released_year
88310,Action,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Casual,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Indie,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Simulation,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Strategy,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88311,Free to Play,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,Indie,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,RPG,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,Strategy,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88312,Casual,Real Pool 3D - Poolians,2017-07-24,http://steamcommunity.com/app/670290/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, In-App Purchases, Stats]",670290,Poolians.com,2017


In [17]:
# ---> Applying strip to every string cell to eliminate blank spaces (start, end);
#      cells that are not strings are returned unchanged
df_games2 = df_games2.map(lambda cell: cell if not isinstance(cell, str) else cell.strip())
df_games2.head(n=10)

Unnamed: 0,genres,app_name,release_date,reviews_url,specs,id,developer,released_year
88310,Action,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Casual,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Indie,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Simulation,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Strategy,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88311,Free to Play,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,Indie,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,RPG,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88311,Strategy,Ironbound,2018-01-04,http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievements, Steam Trading Cards, In-App Purchases]",643980,Secret Level SRL,2018
88312,Casual,Real Pool 3D - Poolians,2017-07-24,http://steamcommunity.com/app/670290/reviews/?browsefilter=mostrecent&p=1,"[Single-player, Multi-player, Online Multi-Player, In-App Purchases, Stats]",670290,Poolians.com,2017


In [18]:
# ---> Update the name of the column 'id' to 'item_id'
df_games2 = df_games2.rename(columns={'id': 'item_id'})
df_games2.head(n=3)

Unnamed: 0,genres,app_name,release_date,reviews_url,specs,item_id,developer,released_year
88310,Action,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Casual,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018
88310,Indie,Lost Summoner Kitty,2018-01-04,http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,[Single-player],761140,Kotoshiro,2018


### 2.4  Handling of 'NaN and Empty' values

In [19]:
# ---> Review and removal of complete columns without relevant information (second round)
df_games2 = df_games2.drop(columns=['reviews_url', 'release_date'])

In [20]:
# ---> Number of NaN fields in 'df_games2' by column
print(df_games2.isnull().sum())

genres           3282
app_name            2
specs             940
item_id             2
developer        3478
released_year    2386
dtype: int64


In [21]:
# ---> Number of empty-string fields in 'df_games2' by column
print(df_games2.eq('').sum())

genres           0
app_name         0
specs            0
item_id          0
developer        0
released_year    0
dtype: int64


In [22]:
# ---> Removal of NaN rows from columns: 'developer', 'released_year', 'genres' and 'item_id'.
#      The explicit .copy() makes 'df_games3' an independent DataFrame, so the column
#      assignments performed on it in later cells modify this frame directly and do not
#      depend on pandas' chained-assignment warning being silenced.
df_games3 = df_games2.dropna(subset=['developer', 'released_year', 'genres', 'item_id']).copy()
print(df_games3.isna().sum())                           # ---> Remaining NaN fields by column

genres             0
app_name           0
specs            557
item_id            0
developer          0
released_year      0
dtype: int64


In [23]:
# ---> Conversion of 'released_year' column to int64 type
df_games3 = df_games3.astype({'released_year': 'int64'})

In [24]:
# ---> Rows whose 'genres' value contains an ampersand, to inspect HTML-escaped names
amp_files = df_games3[df_games3['genres'].str.contains('&')]
print(amp_files.loc[:, 'genres'])

89303      Animation &amp; Modeling
89404      Animation &amp; Modeling
89692     Design &amp; Illustration
89701     Design &amp; Illustration
89709     Design &amp; Illustration
                    ...            
118761     Animation &amp; Modeling
118787    Design &amp; Illustration
120229     Animation &amp; Modeling
120229    Design &amp; Illustration
120385    Design &amp; Illustration
Name: genres, Length: 643, dtype: object


In [25]:
# ---> Decode the HTML entity '&amp;' back to a literal '&' (e.g. 'Animation &amp; Modeling'
#      becomes 'Animation & Modeling'). Replacing it with an empty string would drop the
#      ampersand and leave two consecutive spaces inside the genre name.
#      regex=False: the pattern is matched as plain text, not as a regular expression.
df_games3['genres'] = df_games3['genres'].str.replace('&amp;', '&', regex=False)

In [26]:
# ---> Preview of the cleaned DataFrame 'df_games3'
df_games3.head()

Unnamed: 0,genres,app_name,specs,item_id,developer,released_year
88310,Action,Lost Summoner Kitty,[Single-player],761140,Kotoshiro,2018
88310,Casual,Lost Summoner Kitty,[Single-player],761140,Kotoshiro,2018
88310,Indie,Lost Summoner Kitty,[Single-player],761140,Kotoshiro,2018
88310,Simulation,Lost Summoner Kitty,[Single-player],761140,Kotoshiro,2018
88310,Strategy,Lost Summoner Kitty,[Single-player],761140,Kotoshiro,2018


In [27]:
# ---> (rows, columns) of the cleaned DataFrame 'df_games3'
df_games3.shape

(70864, 6)

In [28]:
# ---> Per-column non-null counts and dtypes of 'df_games3' before export
df_games3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70864 entries, 88310 to 120443
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   genres         70864 non-null  object
 1   app_name       70864 non-null  object
 2   specs          70307 non-null  object
 3   item_id        70864 non-null  object
 4   developer      70864 non-null  object
 5   released_year  70864 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 3.8+ MB


# III. LOAD

In [29]:
# ---> Export of the processed DataFrames to CSV (UTF-8, without the index column).
#      The output folder is created first when it is missing, because to_csv does not
#      create intermediate directories and would fail on a fresh checkout.
os.makedirs('./Datasets/processing', exist_ok=True)
df_games3.to_csv('./Datasets/processing/games.csv', encoding='utf-8', index=False)
df_games_raw.to_csv('./Datasets/processing/games_raw.csv', encoding='utf-8', index=False)