# ---- ETL FOR FILE AUSTRALIAN_USER_ITEMS ----

In [1]:
import pandas as pd
import ast                                              # ---> Libraries to be used ('ast' parses each line of the input file as a Python literal)
import warnings
warnings.filterwarnings('ignore')                       # ---> Silences ALL warnings from here on (this also hides pandas deprecation / copy warnings)

path_items = './Datasets/australian_users_items.json'   # ---> Path to JSON file (relative to the notebook's working directory)

# I. EXTRACTION

### 1.1  Conversion of the file 'australian_users_items.json' to a python-recognizable structure

In [2]:
# Parse the input file into a list of Python dicts, one dict per non-empty line.
# The file is iterated lazily (no f.readlines()), so the raw text of the whole
# file is never held in memory next to the parsed records.
lst_items = []
with open(path_items, 'r', encoding='utf-8') as f:      # ---> Opening the JSON file
    for raw_line in f:                                  # ---> Line-by-line iteration straight from the file handle
        raw_line = raw_line.strip()
        if not raw_line:                                # ---> Blank lines would make ast.literal_eval raise SyntaxError, so skip them
            continue
        # ast.literal_eval only accepts Python literals (it does not execute code).
        # NOTE(review): presumably used because the lines are Python-style dicts
        # rather than strict JSON -- confirm against the file.
        lst_items.append(ast.literal_eval(raw_line))    # ---> Conversion of the line to a Python dictionary and load to list 'lst_items'

In [3]:
# ---> Creation of DataFrame 'df_items': one row per dict in 'lst_items', 'items' column still nested
df_items = pd.DataFrame(data=lst_items)
df_items.head(n=5)                                       # ---> Preview of the first five rows

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


### 1.2  DataFrame overview

In [4]:
df_items.shape                                           # ---> (number of rows, number of columns) of 'df_items'

(88310, 5)

In [5]:
df_items.info()                                          # ---> Per-column non-null counts and dtypes, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


# II. TRANSFORMATION

### 2.1  Process for unnesting column 'items' from python list 'lst_items'

In [6]:
# Flatten the nested 'items' lists: one output record per (user, item) pair.
# Every record is a NEW dict, so the item dicts stored in 'lst_items' (which the
# 'items' column of 'df_items' also references) are not mutated by this cell.
# Trade-off: the raw dicts are no longer reused, so peak memory is higher.
lst_items1 = []
for user in lst_items:
    user_fields = {
        'user_id': user['user_id'],
        'items_count': user['items_count'],
        'steam_id': user['steam_id'],
        'user_url': user['user_url'],
    }
    # Item keys first, user-level keys last -> same key/column order as before.
    # Users whose 'items' list is empty contribute no records.
    lst_items1.extend({**item, **user_fields} for item in user['items'])

In [7]:
# ---> Creation of DataFrame 'df_items1': one row per unnested record in 'lst_items1'
df_items1 = pd.DataFrame(data=lst_items1)
df_items1.head(n=5)                                     # ---> Preview of the first five rows

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...


### 2.2  Handling duplicate information

In [8]:
df_items1.duplicated().sum()                            # ---> Number of duplicate rows (rows identical in every column to an earlier row)

59104

In [9]:
dup_mask = df_items1.duplicated()                       # ---> True for every row that repeats an earlier row
df_items_duplicated = df_items1.loc[dup_mask]           # ---> Duplicate rows only (their first occurrences are not included)
df_items_duplicated.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
164294,20,Team Fortress Classic,5,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164295,50,Half-Life: Opposing Force,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164296,70,Half-Life,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164297,130,Half-Life: Blue Shift,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164298,220,Half-Life 2,198,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad


In [10]:
# ---> Removal of duplicate rows (first occurrence of each is kept); result assigned to DataFrame 'df_items2'
df_items2 = df_items1.drop_duplicates(subset=None, keep='first')
df_items2.shape

(5094105, 8)

### 2.3  Handling of non-relevant information

In [11]:
# ---> Removal of rows in which every column is missing (rows with at least one value are kept)
df_items2 = df_items2.dropna(axis=0, how='all')
df_items2.shape

(5094105, 8)

In [12]:
# ---> Removal of non-relevant columns: 'items_count', 'steam_id', and 'user_url' (first round)
# Reassignment is used instead of inplace=True so this step follows the same
# "new frame from old frame" pattern as the other cleaning cells in this notebook.
df_items2 = df_items2.drop(columns=['items_count', 'steam_id', 'user_url'])
df_items2.shape

(5094105, 5)

### 2.4  Formatting and type handling for data

In [13]:
# ---> New column 'hours_game': 'playtime_forever' divided by 60 (minutes to hours), rounded to 2 decimal places
hours_played = (df_items2['playtime_forever'] / 60).round(2)
df_items2 = df_items2.assign(hours_game=hours_played)
df_items2.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,hours_game
0,10,Counter-Strike,6,0,76561197970982479,0.1
1,20,Team Fortress Classic,0,0,76561197970982479,0.0
2,30,Day of Defeat,7,0,76561197970982479,0.12
3,40,Deathmatch Classic,0,0,76561197970982479,0.0
4,50,Half-Life: Opposing Force,0,0,76561197970982479,0.0


In [14]:

is_zero_hours = df_items2['hours_game'].eq(0)               # ---> True where 'hours_game' is exactly 0
empty_files_hg = is_zero_hours.sum()                        # ---> Number of rows with value '0' in column 'hours_game'
print(empty_files_hg)

1847730


In [15]:
df_items2 = df_items2.loc[df_items2['hours_game'].ne(0)]    # ---> Keep only the rows of 'df_items2' whose hours_game is not '0'

In [16]:
df_items2.shape                                             # ---> (rows, columns) left after removing the zero-hour rows

(3246375, 6)

In [17]:
def _strip_if_text(value):
    """Return 'value' without leading/trailing whitespace when it is a str; any other value is returned unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value

df_items2 = df_items2.map(_strip_if_text)  # ---> Applying the strip helper to every cell to eliminate blank spaces (start, end)
df_items2.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,hours_game
0,10,Counter-Strike,6,0,76561197970982479,0.1
2,30,Day of Defeat,7,0,76561197970982479,0.12
8,300,Day of Defeat: Source,4733,0,76561197970982479,78.88
9,240,Counter-Strike: Source,1853,0,76561197970982479,30.88
10,3830,Psychonauts,333,0,76561197970982479,5.55


### 2.5  Handling of 'NaN and Empty' values

In [18]:
# ---> Review and removal of non-relevant columns (second round): only 'item_id', 'user_id' and 'hours_game' remain
df_items2 = df_items2.drop(columns=['playtime_2weeks', 'item_name', 'playtime_forever'])
df_items2.head()

Unnamed: 0,item_id,user_id,hours_game
0,10,76561197970982479,0.1
2,30,76561197970982479,0.12
8,300,76561197970982479,78.88
9,240,76561197970982479,30.88
10,3830,76561197970982479,5.55


In [19]:
nan_per_column = df_items2.isna().sum()     # ---> Number of missing (NaN/None) values in each column
print(nan_per_column)

item_id       0
user_id       0
hours_game    0
dtype: int64


In [20]:
empty_per_column = df_items2.eq('').sum()   # ---> Number of cells equal to the empty string '' in each column
print(empty_per_column)

item_id       0
user_id       0
hours_game    0
dtype: int64


In [21]:
df_items2.shape                             # ---> (rows, columns) of the cleaned DataFrame

(3246375, 3)

In [22]:
df_items2.info()                            # ---> Final column dtypes and memory usage of 'df_items2'

<class 'pandas.core.frame.DataFrame'>
Index: 3246375 entries, 0 to 5153208
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   item_id     object 
 1   user_id     object 
 2   hours_game  float64
dtypes: float64(1), object(2)
memory usage: 99.1+ MB


# III. LOAD

In [23]:
# 'df_items3' is a second name for the same DataFrame object as 'df_items2' (no copy is made).
df_items3 = df_items2
# Write the cleaned table to CSV without the index column.
df_items3.to_csv(
    './Datasets/processing/items.csv',
    index=False,
    encoding='utf-8',
)