In [1]:
import json
from collections import defaultdict

In [2]:
# Read the reviews file as bytes and parse the JSON payload in one step.
filename = 'data/steam_reviews.json'
with open(filename, mode='rb') as file:
    reviews = json.loads(file.read())

In [3]:
# Build valid_reviews from the raw reviews. A review is kept only when it
#   * contains every key listed in reqd_keys,
#   * has no 'compensation' key, and
#   * has a falsy 'early_access' value.
valid_reviews = []
reqd_keys = {
    'early_access',
    'product_id',
    'text',
    'user_id',
    'username'
}
# Records are popped off the end of `reviews`, so the raw list shrinks as it
# is processed (presumably to limit peak memory) and valid_reviews ends up in
# reverse input order.
while reviews:
    review = reviews.pop()
    # Dict key views support set comparison: True when no required key is missing.
    if not (reqd_keys <= review.keys()):
        continue
    if 'compensation' in review or review['early_access']:
        continue
    valid_reviews.append(review)

# The raw list is empty at this point; drop the name so it cannot be reused.
del reviews

In [4]:
# Remove the fields below from every kept review, in place. Keys that a
# particular review does not have are skipped silently.
drop_keys = {
    'compensation',
    'early_access',
    'date',
    'found_funny',
    'hours',
    'page',
    'page_order',
    'products',
    'user_id',
    'username'
}
for review in valid_reviews:
    for key in drop_keys:
        # pop() with a default is a no-op when the key is absent.
        review.pop(key, None)

In [11]:
# Only keep reviews for products that have 15+ valid reviews.
# Step 1: count how many valid reviews each product_id has.
counter = defaultdict(int)
for review in valid_reviews:
    counter[review['product_id']] += 1

# Step 2: collect the product ids that reach the threshold.
valid_product_ids = set(
    filter(lambda product: counter[product] >= 15, counter)
)

# Step 3: discard reviews of every other product.
valid_reviews = list(
    filter(lambda review: review['product_id'] in valid_product_ids, valid_reviews)
)

In [13]:
# Read the games file as bytes and parse the JSON payload in one step.
filename = 'data/steam_games.json'
with open(filename, mode='rb') as file:
    games = json.loads(file.read())

In [14]:
# Build valid_games from the raw games. A game is kept only when it
#   * contains every key listed in reqd_keys,
#   * has 'Positive' somewhere in its 'sentiment' value,
#   * has a falsy 'early_access' value, and
#   * is the first qualifying record seen for its 'id'.
reqd_keys = {
    'release_date', 
    'publisher', 
    'specs', 
    'url', 
    'early_access', 
    'developer', 
    'app_name', 
    'genres', 
    'title', 
    'sentiment', 
    'reviews_url', 
    'id', 
    'tags'}
valid_games = []
game_ids = set()
for game in games:
    # Dict key views support set comparison: True when no required key is missing.
    if not (reqd_keys <= game.keys()):
        continue
    if 'Positive' not in game['sentiment']:
        continue
    if game['early_access']:
        continue
    if game['id'] in game_ids:
        # Duplicate id: an earlier qualifying record was already kept.
        continue
    game_ids.add(game['id'])
    valid_games.append(game)

# The raw list is no longer needed once the kept records are in valid_games.
del games

In [15]:
# Remove the fields below from every kept game, in place. Keys that a
# particular game does not have are skipped silently.
drop_keys = {
    'price',
    'discount_price', 
    'metascore', 
    'early_access',
}
for game in valid_games:
    for key in drop_keys:
        # pop() with a default is a no-op when the key is absent.
        game.pop(key, None)

In [16]:
# Ensure the games and reviews datasets cover the same set of products.
# Only ids present on both sides survive; everything else is dropped from
# both lists.
game_ids = set(game['id'] for game in valid_games)
reviewed_game_ids = set(review['product_id'] for review in valid_reviews)
valid_game_ids = game_ids.intersection(reviewed_game_ids)

valid_games = list(
    filter(lambda game: game['id'] in valid_game_ids, valid_games)
)
valid_reviews = list(
    filter(lambda review: review['product_id'] in valid_game_ids, valid_reviews)
)

In [17]:
# Serialize the filtered reviews to disk as JSON.
filename = 'data/processed_reviews.json'
with open(filename, mode='w') as file:
    json.dump(valid_reviews, fp=file)

In [18]:
# Serialize the filtered games to disk as JSON.
filename = 'data/processed_games.json'
with open(filename, mode='w') as file:
    json.dump(valid_games, fp=file)