In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json
import os
import requests

In [46]:
# Download the Steam application catalogue and index it by "appid" so that
# each app can be looked up by its id later in the notebook.
response = requests.get(
    "https://api.steampowered.com/ISteamApps/GetAppList/v2/",
    timeout=30,  # without a timeout a stalled connection blocks the kernel indefinitely
)
response.raise_for_status()  # surface HTTP errors here rather than as a confusing JSON/KeyError below
appList = pd.DataFrame(response.json()['applist']['apps'])
appList = appList.set_index('appid')

Load the JSON review files. Each file is a nested dictionary whose outer level has a single key: `reviews`.

In [47]:
# Review files are read from a "data" folder under the notebook's working directory.
current_dir = os.getcwd()
data_dir = f"{current_dir}/data/"  # trailing slash kept: file names are appended to this string directly

In [82]:
def preprocess_pipeline(df, appid, name):
    """Append the cleaned reviews of one app to ``df`` and return the result.

    Reads ``<data_dir>review_<appid>.json``, expands the nested ``author``
    record into ordinary columns, drops a few unused columns, converts the
    playtime columns from minutes to hours and the creation timestamp from
    Unix seconds to datetimes, and tags every row with the app's id and name.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews accumulated so far. It is not modified in place.
    appid : int or str
        Application id; used to build the review file name and stored in
        the ``appid`` column.
    name : str
        Application name stored in the ``name`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the file contains no reviews, otherwise a new
        frame made of ``df`` followed by the processed reviews.

    Raises
    ------
    FileNotFoundError
        If there is no review file for ``appid`` in ``data_dir``.
    KeyError
        If the file lacks the ``reviews`` key or the reviews lack one of the
        columns this function drops or transforms.
    """
    filename = data_dir + "review_" + str(appid) + ".json"  # one file per appid

    # Use a context manager so the handle is closed immediately (this function
    # runs once per app), and read as UTF-8 -- the encoding JSON is specified
    # in -- instead of relying on the platform default.
    with open(filename, encoding="utf-8") as handle:
        reviews = json.load(handle)['reviews']  # "reviews" is the top-level key

    if not reviews:
        # Nothing to add for this app; hand the accumulated frame back untouched.
        return df

    # One row per review, indexed by the keys of the "reviews" dictionary.
    reviews_df = pd.DataFrame.from_dict(reviews, orient='index')

    # The "author" column holds a nested dictionary; explode it into its own frame.
    authors_df = reviews_df['author'].apply(pd.Series)

    # Swap the nested column for its expanded form and drop the not-so-useful columns.
    unused_columns = ['author', 'hidden_in_steam_china', 'language',
                      'timestamp_updated', 'steam_china_location']
    reviews_df = pd.concat([reviews_df.drop(unused_columns, axis=1), authors_df], axis=1)

    # Playtimes are logged in minutes. Convert to hours.
    playtime_columns = ["playtime_forever", "playtime_last_two_weeks"]
    reviews_df[playtime_columns] = reviews_df[playtime_columns] / 60.0

    # "voted_up" = True means the review is a positive recommendation.
    reviews_df = reviews_df.rename(columns={"voted_up": "is_recommended"})

    # "timestamp_created" is a Unix timestamp in seconds; make it human readable.
    reviews_df['timestamp_created'] = pd.to_datetime(reviews_df['timestamp_created'], unit='s')

    # Record which product these reviews were written for.
    reviews_df["appid"] = appid
    reviews_df["name"] = name

    return pd.concat([df, reviews_df])

In [85]:
# Process every named app and gather the results into a single frame.
# Each app's reviews are collected in a list and concatenated once: growing
# `df` with pd.concat on every iteration re-copies all earlier rows each time.
frames = []
df = pd.DataFrame()
try:
    # Iterating the "name" column yields (appid, name) pairs directly, which
    # avoids a label lookup per app and stays scalar even if an appid repeats.
    for appid, name in appList["name"].items():
        if name != "":
            print(f"Processing reviews for {name}.")
            app_reviews = preprocess_pipeline(pd.DataFrame(), appid, name)
            if not app_reviews.empty:
                frames.append(app_reviews)
finally:
    # Also runs when the cell is interrupted, so a partial run still leaves
    # the reviews processed so far in `df`.
    if frames:
        df = pd.concat(frames)

Processing reviews for Pieterw test app76 ( 216938 ).
Processing reviews for test2.
Processing reviews for test3.
Processing reviews for Anime Artist 2: Cutest Girls Pack.
Processing reviews for Override 2: Super Mech League.
Processing reviews for We Surround You.
Processing reviews for Nexagon.
Processing reviews for Ms. Holmes: Five Orange Pips Collector's Edition.
Processing reviews for The battle of Visby.
Processing reviews for Fantasy Grounds - Jans Tokenpack 14 - Goblins.
Processing reviews for Wild West Dynasty.
Processing reviews for Fantasy Grounds - Jans Tokenpack 15 - Heroes 4.
Processing reviews for Fantasy Grounds - Jans Tokenpack 16 - Ancient Greek Heroes.
Processing reviews for My Silly Life.
Processing reviews for RetroBound.
Processing reviews for UnHolY ToRturEr Update patch "Escape from hell".
Processing reviews for The Fantastic Adventure of Monsieur Grape!.
Processing reviews for Cupcake: an Apartment Adventure Demo.
Processing reviews for The Wishing Stone.
Proc

KeyboardInterrupt: 

In [88]:
# (rows, columns) of the review frame built by the loop above.
df.shape

(159360, 22)