# ---- ETL FOR FILE AUSTRALIAN_USER_REVIEWS ----

In [1]:
import ast                                                # ---> Libraries to be used
import warnings
from pathlib import Path

import pandas as pd
# ---> Silences every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so any warning raised by the cells below is hidden as well;
#               consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

path_reviews = './Datasets/australian_user_reviews.json'  # ---> Path to JSON file

## I. EXTRACTION

### 1.1  Conversion of the file 'australian_user_reviews.json' to a python-recognizable structure

In [2]:
# ---> Parse the reviews file line by line into a list of Python dictionaries ('lst_reviews').
# ---> ast.literal_eval is used rather than json.loads; presumably each line is a Python-literal dict
#      (single quotes, True/False) and not strict JSON -- TODO confirm against the raw file.
with open(path_reviews, 'r', encoding='utf-8') as f:    # ---> Opening the JSON file
    lst_reviews = [
        ast.literal_eval(line)                          # ---> One dictionary per non-empty line
        for line in f                                   # ---> Stream the file instead of loading it whole with readlines()
        if line.strip()                                 # ---> Skip blank lines, which literal_eval cannot parse
    ]

In [3]:
# ---> Show full cell contents instead of truncating them (maximum content visualization)
pd.options.display.max_colwidth = None
# ---> DataFrame 'df_reviews' built from the parsed records in 'lst_reviews'
df_reviews = pd.DataFrame(data=lst_reviews)
df_reviews.iloc[:5]                                     # ---> Preview of the first five rows

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}, {'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'It's unique and worth a playthrough.'}, {'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]"
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpful': '15 of 20 people (75%) found this review helpful', 'recommend': True, 'review': 'I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'}, {'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': 'For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.'}, {'funny': '', 'posted': 'Posted November 29, 2013.', 'last_edited': '', 'item_id': '239030', 'helpful': '1 of 4 people (25%) found this review helpful', 'recommend': True, 'review': 'Very fun little game to play when your bored or as a time passer. Very gud. Do Recommend. pls buy'}]"
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.', 'last_edited': '', 'item_id': '248820', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'A suitably punishing roguelike platformer. Winning feels good. Progressive unlocks mean a good slog ending in failure doesn't feel like a waste.'}, {'funny': '', 'posted': 'Posted December 4, 2015.', 'last_edited': 'Last edited December 5, 2015.', 'item_id': '370360', 'helpful': 'No ratings yet', 'recommend': True, 'review': '""Run for fun? What the hell kind of fun is that?""'}, {'funny': '', 'posted': 'Posted November 3, 2014.', 'last_edited': '', 'item_id': '237930', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Elegant integration of gameplay, story, world development and aesthetic.'}, {'funny': '', 'posted': 'Posted October 15, 2014.', 'last_edited': '', 'item_id': '263360', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Random drops and random quests, with stat points. Animation style reminiscent of the era before the Voodoo card.'}, {'funny': '', 'posted': 'Posted October 15, 2014.', 'last_edited': '', 'item_id': '107200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Fun balance of tactics and strategy. Potential for very rewarding battles on smaller maps. Can become a bit of a grind on larger maps (>200 stars).'}, {'funny': '', 'posted': 'Posted October 15, 2014.', 'last_edited': '', 'item_id': '224500', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Fun world builder, with plenty of option of how you want challenge served to you. Gnome pathing sometimes frustrating if you expand very very quickly.'}]"
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2013.', 'last_edited': '', 'item_id': '250320', 'helpful': '2 of 2 people (100%) found this review helpful', 'recommend': True, 'review': 'This game... is so fun. The fight sequences have been improved from walking dead. It also includes more of a Sam and Max puzzle solving (some of it in the first episode) and walking dead. The game also gets even more better if you have read the Fables comic books, which are without a doubt, very good. The music is also superb and fit the scenarios very well.'}, {'funny': '', 'posted': 'Posted July 28, 2012.', 'last_edited': '', 'item_id': '20920', 'helpful': '1 of 1 people (100%) found this review helpful', 'recommend': True, 'review': 'Really Really Really Great Game, very good story, im in chapter 1 atm and i think its great. You get a really early link with characters. No need to play the first game, its nicely wrapped up for you in a five minute video. FYI beware of the sex scenes :P, nudity does happen in the game and it shows you actually having sex, so try not to play with your parents around if your near a point in the story. Also you will need a good rig/comp to play on high. Don't even try running the game on Ultra unless you have mulitiple GPU's (Graphics card) and ram and a good processor. All in all good game :D'}, {'funny': '', 'posted': 'Posted June 2, 2012.', 'last_edited': '', 'item_id': '204100', 'helpful': '1 of 1 people (100%) found this review helpful', 'recommend': True, 'review': 'Just buy it already. Great Story, Great Multiplayer and good fan service. Just awesome game. Just using shootdodge and bullet time makes you feel like a badass. 
Also, its better if you get the max payne story recapped or replay the first two but its not necessary.'}, {'funny': '', 'posted': 'Posted June 29, 2014.', 'last_edited': '', 'item_id': '224600', 'helpful': '1 of 2 people (50%) found this review helpful', 'recommend': True, 'review': 'It was a great game from what I played, right now I need to find the actual download.'}, {'funny': '', 'posted': 'Posted November 22, 2012.', 'last_edited': '', 'item_id': '207610', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'The ending to this game is.... ♥♥♥♥♥♥♥.... Just buy it, you'll be invested, im automatically preordering season two of the walking dead game.'}, {'funny': '', 'posted': 'Posted February 23, 2012.', 'last_edited': '', 'item_id': '108710', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Alan wake is a really good game, the light effects are pretty awesome and this game is so good, it paid back remedy their promotion and conversion fees in the first two days on steam. Its awesome :D. Remedy can still make video games, even if its not Max Payne. It is also very character driven.'}]"
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny', 'posted': 'Posted April 15, 2014.', 'last_edited': '', 'item_id': '211420', 'helpful': '35 of 43 people (81%) found this review helpful', 'recommend': True, 'review': 'Git gud'}, {'funny': '1 person found this review funny', 'posted': 'Posted December 23, 2013.', 'last_edited': '', 'item_id': '211820', 'helpful': '12 of 16 people (75%) found this review helpful', 'recommend': True, 'review': 'It's like Terraria, you play for 9 hours straight, get endgame armour then stop playing until the next update.'}, {'funny': '2 people found this review funny', 'posted': 'Posted March 14, 2014.', 'last_edited': '', 'item_id': '730', 'helpful': '5 of 5 people (100%) found this review helpful', 'recommend': True, 'review': 'Hold shift to win, Hold CTRL to lose.'}, {'funny': '', 'posted': 'Posted July 11, 2013.', 'last_edited': '', 'item_id': '204300', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'OH YES, THIS GAME IS THE BEST, THEY ADD STUFF LIKE NEW CHARACTERS, AND LIKE A NEW MAP ONCE A YEAR, IT'S SO AWESOME, OH YES, IT'S SO AWESOMENAUTS, YES, YES, I'M GOOD AT THIS, YES, YES, GOOD, I'M GOOD, YES, GOOD, YOU ARE BAD, IM GOOD, YES, TOO GOOD, YES, IM NOT BAD, YES, GOOD.'}]"


### 1.2  DataFrame overview

In [4]:
# ---> (rows, columns) of the raw DataFrame 'df_reviews'
df_reviews.shape

(25799, 3)

In [5]:
# ---> Column names, non-null counts and dtypes of 'df_reviews'
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


## II. TRANSFORMATION

### 2.1  Process for unnesting column 'reviews' from python list 'lst_reviews'

In [6]:
# ---> Unnest 'reviews': one flat record per review, tagged with the 'user_id' and 'user_url' of its author.
# ---> Every record is a NEW dictionary, so the nested dictionaries held by 'lst_reviews' are not modified
#      (writing the user keys into them in place would also alter what 'df_reviews' references).
lst_reviews1 = [
    {**review, 'user_id': user['user_id'], 'user_url': user['user_url']}  # ---> Review fields first, then the user keys
    for user in lst_reviews                             # ---> Iteration on each element of 'lst_reviews'
    for review in user['reviews']                       # ---> Iteration on each item of that user's reviews list
]

In [7]:
# ---> DataFrame 'df_reviews1': one row per unnested review record
df_reviews1 = pd.DataFrame(data=lst_reviews1)
df_reviews1.iloc[:5]                                    # ---> First five rows

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.",76561197970982479,http://steamcommunity.com/profiles/76561197970982479
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970982479
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!,76561197970982479,http://steamcommunity.com/profiles/76561197970982479
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8",js41637,http://steamcommunity.com/id/js41637
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.",js41637,http://steamcommunity.com/id/js41637


In [8]:
# ---> (rows, columns) after unnesting: one row per review
df_reviews1.shape

(59305, 9)

### 2.2  Handling duplicate information

In [9]:
# ---> duplicated() marks each row that repeats an earlier identical row (the first occurrence is not marked)
df_reviews1.duplicated().sum()                          # ---> Number of duplicate rows

874

In [10]:
is_repeat = df_reviews1.duplicated()                    # ---> True for every row that repeats an earlier one
df_reviews_duplicated = df_reviews1.loc[is_repeat]      # ---> List of duplicate rows
df_reviews_duplicated.iloc[:5]

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
1112,,"Posted September 24, 2015.",,346110,1 of 1 people (100%) found this review helpful,True,yep,bokkkbokkk,http://steamcommunity.com/id/bokkkbokkk
2891,,"Posted January 10, 2014.",,218620,1 of 3 people (33%) found this review helpful,True,"Good graphics, fun heists! A bit laggy",ImSeriouss,http://steamcommunity.com/id/ImSeriouss
2892,,"Posted January 10, 2014.",,105600,0 of 2 people (0%) found this review helpful,True,So fun! DEFINITELY NOT RIP OFF OF MINECRAFT! endless fun!,ImSeriouss,http://steamcommunity.com/id/ImSeriouss
2893,,"Posted December 17, 2014.",,570,No ratings yet,True,bobo pinoy,ImSeriouss,http://steamcommunity.com/id/ImSeriouss
2894,,"Posted January 13, 2014.",,211820,No ratings yet,True,If you want to play this game.. expect glithes! It's in development! BUT it is an awesome game!!! ~,ImSeriouss,http://steamcommunity.com/id/ImSeriouss


In [11]:
first_seen = ~df_reviews1.duplicated(keep='first')      # ---> True for the first occurrence of each distinct row
df_reviews2 = df_reviews1.loc[first_seen]               # ---> Rows without duplicates are assigned to DataFrame 'df_reviews2'
df_reviews2.shape

(58431, 9)

### 2.3  Handling of non-relevant information

In [12]:
# ---> Discard rows in which every column is missing (first round)
df_reviews2 = df_reviews2.dropna(axis='index', how='all')
df_reviews2.shape

(58431, 9)

In [13]:
# ---> Removal of the columns that are not used in the rest of the notebook.
# ---> Reassigning (instead of inplace=True) together with errors='ignore' lets this cell be re-run
#      without raising KeyError once the columns have already been removed.
df_reviews2 = df_reviews2.drop(columns=['funny', 'last_edited', 'helpful', 'user_url'], errors='ignore')
df_reviews2.shape

(58431, 5)

### 2.4  Formatting and type handling for data

In [14]:
# ---> First five raw values of 'posted' before any parsing
df_reviews2['posted'].iloc[:5]

0     Posted November 5, 2011.
1        Posted July 15, 2011.
2       Posted April 21, 2011.
3        Posted June 24, 2014.
4    Posted September 8, 2013.
Name: posted, dtype: object

In [15]:
# ---> Parse 'posted' straight into datetimes with an explicit format, so the result does not depend on
#      pandas inferring the layout from the data (the literal 'Posted ' prefix and the trailing '.' are
#      part of the format, which replaces the separate 'Posted' removal step).
# ---> Values that do not match the format -- e.g. entries without a year such as 'Posted February 3.' --
#      become NaT because of errors='coerce'.
df_reviews2['posted'] = pd.to_datetime(df_reviews2['posted'], format='Posted %B %d, %Y.', errors='coerce')
df_reviews2['posted'][0:5]

0   2011-11-05
1   2011-07-15
2   2011-04-21
3   2014-06-24
4   2013-09-08
Name: posted, dtype: datetime64[ns]

In [16]:
posted_dates = df_reviews2['posted']                    # ---> Parsed review dates
df_reviews2['posted_year'] = posted_dates.dt.year       # ---> Creation of 'posted_year' column from the year of 'posted'
df_reviews2.iloc[:5]

Unnamed: 0,posted,item_id,recommend,review,user_id,posted_year
0,2011-11-05,1250,True,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.",76561197970982479,2011.0
1,2011-07-15,22200,True,It's unique and worth a playthrough.,76561197970982479,2011.0
2,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!,76561197970982479,2011.0
3,2014-06-24,251610,True,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8",js41637,2014.0
4,2013-09-08,227300,True,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.",js41637,2013.0


In [17]:
def _strip_if_text(value):
    """Return 'value' without leading/trailing whitespace when it is a string; any other value is returned unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value

df_reviews2 = df_reviews2.map(_strip_if_text)           # ---> Applying the strip function to every cell (start, end)
df_reviews2.head(5)

Unnamed: 0,posted,item_id,recommend,review,user_id,posted_year
0,2011-11-05,1250,True,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.",76561197970982479,2011.0
1,2011-07-15,22200,True,It's unique and worth a playthrough.,76561197970982479,2011.0
2,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!,76561197970982479,2011.0
3,2014-06-24,251610,True,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8",js41637,2014.0
4,2013-09-08,227300,True,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.",js41637,2013.0


### 2.5  Handling of 'NaN and Empty' values

In [18]:
# ---> Removal of the 'posted' column (second round); its year is kept in 'posted_year'.
# ---> Reassigning (instead of inplace=True) together with errors='ignore' lets this cell be re-run
#      without raising KeyError once the column has already been removed.
df_reviews2 = df_reviews2.drop(columns=['posted'], errors='ignore')

In [19]:
# ---> Per-column count of missing (NaN) values in 'df_reviews2'
print(df_reviews2.isnull().sum(axis=0))

item_id           0
recommend         0
review            0
user_id           0
posted_year    9933
dtype: int64


In [20]:
# ---> Per-column count of cells equal to the empty string in 'df_reviews2'
print(df_reviews2.eq('').sum())

item_id         0
recommend       0
review         38
user_id         0
posted_year     0
dtype: int64


In [21]:
has_year = df_reviews2['posted_year'].notna()           # ---> True where 'posted_year' holds a value
df_reviews3 = df_reviews2.loc[has_year]                 # ---> Rows with NaN in 'posted_year' are left out
print(df_reviews3.isnull().sum())

item_id        0
recommend      0
review         0
user_id        0
posted_year    0
dtype: int64


In [22]:
# ---> Conversion of 'posted_year' column to int64 type.
# ---> astype with a column mapping returns a new DataFrame, so nothing is assigned into a column of a
#      frame that was derived by filtering 'df_reviews2' (a pattern pandas may flag as SettingWithCopyWarning).
df_reviews3 = df_reviews3.astype({'posted_year': 'int64'})

In [23]:
# ---> (rows, columns) of the final DataFrame 'df_reviews3'
df_reviews3.shape

(48498, 5)

In [24]:
# ---> Column names, non-null counts and dtypes of 'df_reviews3' before it is written out
df_reviews3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48498 entries, 0 to 59276
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   item_id      48498 non-null  object
 1   recommend    48498 non-null  bool  
 2   review       48498 non-null  object
 3   user_id      48498 non-null  object
 4   posted_year  48498 non-null  int64 
dtypes: bool(1), int64(1), object(3)
memory usage: 1.9+ MB


## III. LOAD

In [25]:
# ---> Write the cleaned reviews to CSV (UTF-8, without the index column).
# ---> The target folder is created first when it is missing, because to_csv fails on a non-existent directory.
reviews_csv_path = Path('./Datasets/processing/reviews.csv')
reviews_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_reviews3.to_csv(reviews_csv_path, encoding='utf-8', index=False)