In [243]:
# Some of this code is based on https://github.com/nik-davis/steam-data-science-project
# which is licensed under MIT

# This code queries the Humble Bundle for Ukraine page as well as
# the Steam API and SteamSpy API to rank the games that are part of
# this bundle. The most representative ranking uses a Bayesian average:
# each game's positive-rating percentage is weighted by the number of
# ratings it received, and blended with the bundle-wide average
# percentage, which is weighted by the bundle-wide average number of
# ratings. Games with few ratings are therefore pulled toward the
# bundle-wide average.

# The Bayesian average formula was sourced from here:
# https://www.codementor.io/@arpitbhayani/solving-an-age-old-problem-using-bayesian-average-15fy4ww08p

# Note that this is one of my first longer Python scripts so it is almost certainly not optimized

In [None]:
import csv
import datetime as dt
import json
import os
import statistics
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [72]:
def get_request(url, parameters=None, max_retries=None):
    """Return the JSON-decoded body of a GET request, retrying on failure.

    A request is retried when it raises an SSL error (after a 5 second
    countdown) or when the response is falsy (after 10 seconds). A
    ``requests.Response`` is falsy when its HTTP status is an error
    (4xx/5xx), which for these APIs usually means too many requests.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request
    max_retries : int, optional
        maximum number of retries after the first failed attempt.
        ``None`` (the default) retries indefinitely.

    Returns
    -------
    json_data
        json-formatted response (dict-like)

    Raises
    ------
    RuntimeError
        if ``max_retries`` is given and every attempt failed.
    """
    failed_attempts = 0

    # Retry in a loop rather than recursively so that a long outage cannot
    # exhaust the interpreter's recursion limit.
    while True:
        ssl_failed = False
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            print('SSL Error:', s)
            ssl_failed = True
        else:
            if response:
                return response.json()

        failed_attempts += 1
        if max_retries is not None and failed_attempts > max_retries:
            raise RuntimeError(
                'Giving up on {} after {} failed attempt(s)'.format(url, failed_attempts)
            )

        if ssl_failed:
            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
        else:
            # A falsy response usually means too many requests. Wait and try again
            print('No response, waiting 10 seconds...')
            time.sleep(10)
            print('Retrying.')

In [124]:
def item_generator(json_input, lookup_key):
    """Yield every value stored under ``lookup_key`` in a nested JSON structure.

    Dicts and lists are walked depth-first in their own iteration order.
    When a dict key equals ``lookup_key`` its value is yielded as-is and is
    not searched any further; all other values are searched recursively.
    Anything that is neither a dict nor a list yields nothing.

    Parameters
    ----------
    json_input : dict, list or any other decoded JSON value
    lookup_key : key to search for

    Yields
    ------
    The value of each matching key.
    """
    if isinstance(json_input, list):
        for element in json_input:
            yield from item_generator(element, lookup_key)
        return

    if not isinstance(json_input, dict):
        return

    for key, value in json_input.items():
        if key != lookup_key:
            yield from item_generator(value, lookup_key)
            continue
        yield value

In [125]:
# Query games in bundle, with cache
# (not perfect, gets some useless data but good enough, we'll filter it later)
humble_url = "https://www.humblebundle.com/stand-with-ukraine-bundle"

if os.path.exists("humble.txt"):
    with open("humble.txt", "r", encoding="utf-8") as humble_file:
        humble_text = humble_file.read()
else:
    humble_html = requests.get(humble_url)
    # Fail loudly instead of caching an HTTP error page
    humble_html.raise_for_status()
    # Keep the downloaded page in memory as well as on disk, so the parsing
    # below also works on the very first (uncached) run
    humble_text = humble_html.text
    with open("humble.txt", "w", encoding="utf-8") as humble_file:
        humble_file.write(humble_text)

# The bundle contents are embedded in the page as JSON inside the element
# with id "webpack-bundle-page-data"
humble_soup = BeautifulSoup(humble_text, 'html.parser')
humble_json_el = humble_soup.find(id="webpack-bundle-page-data")
if humble_json_el is None:
    raise ValueError(
        "Could not find the 'webpack-bundle-page-data' element in the bundle page; "
        "delete humble.txt to force a fresh download"
    )
humble_json = json.loads(humble_json_el.text)

In [126]:
# Query mapping of name to appid from steam API, with cache
steamapi_url = "https://api.steampowered.com/ISteamApps/GetAppList/v2"

if os.path.exists("steam_appids.txt"):
    with open("steam_appids.txt", "r", encoding="utf-8") as appids_file:
        appids_text = appids_file.read()
else:
    appids_html = requests.get(steamapi_url)
    # Fail loudly instead of caching an HTTP error body
    appids_html.raise_for_status()
    # Keep the downloaded body in memory as well as on disk, so the parsing
    # below also works on the very first (uncached) run
    appids_text = appids_html.text
    with open("steam_appids.txt", "w", encoding="utf-8") as appids_file:
        appids_file.write(appids_text)

# The code below reads the list of apps from ["applist"]["apps"] and later
# cells use its "appid" and "name" columns
appids_json = json.loads(appids_text)
apps_df = pd.DataFrame(appids_json["applist"]["apps"])

In [242]:
# Filter down to just the games in the bundle
# (Note: This misses a few that are DLC or not named right)
bundle_items = pd.DataFrame(columns=["appid", "name"])

# Build the name -> appid lookup once instead of scanning the whole app list
# for every bundle item. Several apps can share a name; the first one listed
# is used.
appid_by_name = (
    apps_df.drop_duplicates(subset="name", keep="first")
    .set_index("name")["appid"]
    .to_dict()
)

# Maps each bundle item's name to its Steam appid; items whose name has no
# exact match in the Steam app list are left out
bundle_filtered_dict = {}
for human_name in item_generator(humble_json, "human_name"):
    # Only string names can match an app name
    if not isinstance(human_name, str):
        continue
    if human_name in appid_by_name:
        bundle_filtered_dict[human_name] = int(appid_by_name[human_name])

In [225]:
# Get data from SteamSpy for each game
# Rows written to cache.csv by a previous run are re-used, so only games that
# are not cached yet trigger a network request.
cache = pd.DataFrame()
if os.path.exists("cache.csv"):
    cache = pd.read_csv('cache.csv', index_col=0)

url = "https://steamspy.com/api.php"

df_columns = ['AppId',
              'Name',
              'Positive',
              'Negative',
              'TotalRatings',
              'RatingsRatio',
              'UserScore',
              'AvgForever',
              'Avg2Weeks',
              'MedForever',
              'Med2Weeks']

# One dict per game; turned into a DataFrame in a single step afterwards
records = []

for name, appid in bundle_filtered_dict.items():

    if not cache.empty:
        cache_rows = cache.loc[cache['Name'] == name]
        if not cache_rows.empty:
            # Use the first matching row so that a duplicated name in the
            # cache cannot break the scalar conversions below
            cached = cache_rows.iloc[0]
            records.append({
                'AppId': appid,
                'Name': name,
                'Positive': int(cached["Positive"]),
                'Negative': int(cached["Negative"]),
                'TotalRatings': int(cached["TotalRatings"]),
                'RatingsRatio': float(cached["RatingsRatio"]),
                'UserScore': int(cached["UserScore"]),
                'AvgForever': int(cached["AvgForever"]),
                'Avg2Weeks': int(cached["Avg2Weeks"]),
                'MedForever': int(cached["MedForever"]),
                'Med2Weeks': int(cached["Med2Weeks"]),
            })
            continue

    # Not cached: ask SteamSpy (exactly one request and one record per game)
    parameters = {"request": "appdetails", "appid": appid}
    json_data = get_request(url, parameters=parameters)

    positive = json_data["positive"]
    negative = json_data["negative"]
    total_ratings = positive + negative
    # Percentage of positive ratings; 0 when the game has no ratings at all
    if total_ratings > 0:
        positive_percent = (positive / total_ratings) * 100
    else:
        positive_percent = 0

    print(name)
    records.append({
        'AppId': appid,
        'Name': name,
        'Positive': positive,
        'Negative': negative,
        'TotalRatings': total_ratings,
        'RatingsRatio': positive_percent,
        'UserScore': json_data["userscore"],
        'AvgForever': json_data["average_forever"],
        'Avg2Weeks': json_data["average_2weeks"],
        'MedForever': json_data["median_forever"],
        'Med2Weeks': json_data["median_2weeks"],
    })

    # Per documentation, don't make more than 1 request per second
    time.sleep(2)

df = pd.DataFrame(records, columns=df_columns).set_index('AppId')
df.to_csv('cache.csv')

Cache exists


In [239]:
# Add a Bayesian average to better rank the games
def BayesianAvg(item_num_ratings, item_ratio_ratings,
                system_avg_num_ratings, system_ratio_ratings):
    """Blend an item's rating ratio with the system-wide ratio.

    The result is a weighted mean of ``item_ratio_ratings`` and
    ``system_ratio_ratings``. The item's weight is its share of
    ``item_num_ratings + system_avg_num_ratings``; the system's weight is
    the remaining share.

    Parameters
    ----------
    item_num_ratings : number of ratings the item received
    item_ratio_ratings : the item's own rating ratio
    system_avg_num_ratings : average number of ratings per item in the system
    system_ratio_ratings : average rating ratio across the system

    Returns
    -------
    The Bayesian average, on the same scale as the two ratios.
    """
    combined_num_ratings = item_num_ratings + system_avg_num_ratings
    item_weight = item_num_ratings / combined_num_ratings
    system_weight = system_avg_num_ratings / combined_num_ratings
    return item_weight * item_ratio_ratings + system_weight * system_ratio_ratings

# Calculate an overall average for the system
system_ratings_avg = df["RatingsRatio"].mean()
system_num_ratings_avg = df["TotalRatings"].mean()

# Calculate Bayesian average
# BayesianAvg only uses arithmetic operators, so it can be applied to whole
# columns at once instead of row by row
b_averages = BayesianAvg(df["TotalRatings"], df["RatingsRatio"],
                         system_num_ratings_avg, system_ratings_avg)

# Add the new averages to the data frame
# (assigned by position, exactly like the rows they were computed from)
df['Bayesian'] = b_averages.to_numpy()

82.79573430228537
13687.857142857143


In [244]:
# Rank the games from highest to lowest Bayesian average
# (any extra filtering or re-arranging can be added here)
df = df.sort_values(by="Bayesian", ascending=False)

# Save the ranked table to a CSV file for easy access
df.to_csv("bayesian.csv")

df

Unnamed: 0_level_0,Name,Positive,Negative,TotalRatings,RatingsRatio,UserScore,AvgForever,Avg2Weeks,MedForever,Med2Weeks,Bayesian
AppId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
646570,Slay the Spire,104975,2230,107205,97.919873,0,5548,257,2287,225,96.207472
526870,Satisfactory,86697,2475,89172,97.224465,0,2734,776,1313,1249,95.304392
220200,Kerbal Space Program,95617,5259,100876,94.786669,0,4226,246,871,394,93.354016
211820,Starbound,144417,11082,155499,92.873266,0,3411,429,1174,429,92.057956
282070,This War of Mine,78970,5926,84896,93.019695,0,599,320,236,320,91.600151
...,...,...,...,...,...,...,...,...,...,...,...
999220,Amnesia: Rebirth,4634,1396,6030,76.849088,0,283,0,283,0,80.977166
523650,Lust for Darkness,2354,1050,3404,69.153937,0,202,0,235,0,80.078845
327070,Gloria Victis,4946,1958,6904,71.639629,0,639,0,1122,0,79.055336
239200,Amnesia: A Machine for Pigs,6349,2864,9213,68.913492,0,99,0,24,0,77.210917
