## Querying and Organizing Data

In [107]:
import requests
import time
import os
import logging
import sys

import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import requests

In [108]:
# Set up logging. Jupyter sets up its own logging, so we attach our own file
# handler to the root logger instead of relying on logging.basicConfig().
log = logging.getLogger()
# Only CRITICAL records pass the logger; lower this level to see more detail.
log.setLevel(logging.CRITICAL)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Records are appended to mylog.log (mode 'a'), so earlier content is kept.
fhandler = logging.FileHandler('mylog.log', 'a')
fhandler.setFormatter(formatter)
log.addHandler(fhandler)

In [109]:
# Function for performing a GET request using the requests library, with retries.
# Sets a 5 second timeout by default.
def get_request(url, parameters=None, timeout=5, max_retries=5):
    """Perform a GET request, retrying on SSL errors and HTTP error statuses.

    Args:
        url: Address to request.
        parameters: Optional mapping sent as the query string.
        timeout: Timeout in seconds handed to ``requests.get`` on every attempt
            (retries reuse the caller's value).
        max_retries: How many times to retry after the first attempt before
            giving up.

    Returns:
        The ``requests.Response`` of the first attempt whose status is not an
        HTTP error.

    Raises:
        requests.exceptions.SSLError: If the last allowed attempt still fails
            with an SSL error.
        requests.exceptions.HTTPError: If the last allowed attempt still
            returns an error status.
    """
    retries_left = max_retries

    while True:
        try:
            response = requests.get(url=url, params=parameters, timeout=timeout)
        except requests.exceptions.SSLError as s:
            log.error('SSL Error: %s', s)
            if retries_left <= 0:
                raise

            # Visible countdown so an interactive user can see we are waiting.
            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            # Overwrite the countdown line on the console.
            print('\rRetrying.' + ' ' * 10)
            log.warning('Retrying.')
        else:
            # A Response is truthy only when its status is not a 4xx/5xx error.
            if response:
                log.debug('Got response {0}'.format(response.status_code))
                return response

            # An error status usually means too many requests. Wait and try again.
            log.warning('No usable response (HTTP %s), waiting 10 seconds...',
                        response.status_code)
            if retries_left <= 0:
                # Out of retries: surface the HTTP error instead of looping forever.
                # raise_for_status() raises for every falsy response; the return
                # below is only a safeguard.
                response.raise_for_status()
                return response
            time.sleep(10)
            log.warning('Retrying.')

        retries_left -= 1

In [110]:
# Queries the games for a given username. To find your username, check your profile under General -> Custom URL
def get_steam_xml(username):
    """Return the Steam games-list XML for ``username`` as a string.

    The XML is cached in ``steam_games.xml`` in the working directory. When
    that file exists it is returned as-is, without checking which username it
    was downloaded for; otherwise the list is downloaded through
    ``get_request`` and written to the cache before being returned.
    """
    cache_path = "steam_games.xml"

    if not os.path.exists(cache_path):
        log.info('Steam XML needs query')
        response = get_request(
            'http://steamcommunity.com/id/{0}/games?tab=all&xml=1'.format(username),
            timeout=5)
        xml_text = response.text
        with open(cache_path, "w", encoding="utf-8") as cache_out:
            cache_out.write(xml_text)
        return xml_text

    log.info('Steam XML is cached')
    with open(cache_path, "r", encoding="utf-8") as cache_in:
        return cache_in.read()

In [111]:
# Reads the games XML returned from get_steam_xml() and outputs a pandas dataframe
def get_game_info(username):
    """Build a DataFrame describing the games listed in the user's Steam XML.

    Args:
        username: Steam custom-URL name, passed through to ``get_steam_xml``.

    Returns:
        A DataFrame indexed by ``AppId`` (the ``appID`` text of each ``<game>``
        element) with the columns ``Name``, ``LogoLink``, ``StoreLink``,
        ``HoursLast2Weeks``, ``HoursOnRecord``, ``StatsLink`` and
        ``GlobalStatsLink``. Optional elements that are missing or empty are
        stored as NaN; the two hour columns are floats.

    Raises:
        SystemExit: If the XML contains an ``<error>`` element (its text is
            logged first).
    """
    root = ET.fromstring(get_steam_xml(username))

    error = root.find('error')
    if error is not None:
        log.error(error.text)
        sys.exit(0)

    def text_or_nan(game, tag):
        # Optional child element: NaN when it is absent or has no text.
        node = game.find(tag)
        if node is None or node.text is None:
            return np.nan
        return node.text

    def hours_or_nan(game, tag):
        # Hour counts arrive as text; drop thousands separators ("1,234")
        # so float() accepts them.
        raw = text_or_nan(game, tag)
        if isinstance(raw, str):
            raw = raw.replace(',', '')
        return float(raw)

    game_infos = [
        (
            # appID and name are treated as mandatory, as in the XML's <game> entries.
            game.find('appID').text,
            game.find('name').text,
            # Rest of these are optional
            text_or_nan(game, 'logo'),
            text_or_nan(game, 'storeLink'),
            hours_or_nan(game, 'hoursLast2Weeks'),
            hours_or_nan(game, 'hoursOnRecord'),
            text_or_nan(game, 'statsLink'),
            text_or_nan(game, 'globalStatsLink'),
        )
        for game in root.iter('game')
    ]

    df = pd.DataFrame.from_records(game_infos,
                                   columns=['AppId', 'Name', 'LogoLink', 'StoreLink', 'HoursLast2Weeks',
                                            'HoursOnRecord', 'StatsLink', 'GlobalStatsLink'])
    # set_index returns a new frame; the result must be kept for the index to change.
    df = df.set_index('AppId')

    return df

In [112]:
# Get data from SteamSpy for each game
# game_infos must have a 'Name' column and carry the Steam app id either in an
# 'AppId' column or as its index.
# Setting 'in_place' to true will modify the input game_infos preserving any other existing columns.
# Otherwise, they are discarded.
# pull_first_n allows limiting the number of queries to the first N found.

# Output column -> conversion applied when the value is read back from the CSV cache.
_STEAMSPY_STAT_CASTS = {
    'Positive': int,
    'Negative': int,
    'TotalRatings': int,
    'RatingsRatio': float,
    'UserScore': int,
    'AvgForever': int,
    'Avg2Weeks': int,
    'MedForever': int,
    'Med2Weeks': int,
}


def _steamspy_stats_from_cache(cache, name):
    """Return the cached stats dict for the game called ``name``, or None on a cache miss."""
    if cache.empty or 'Name' not in cache.columns:
        return None

    matches = cache.loc[cache['Name'] == name]
    if matches.empty:
        return None

    # Use the first match explicitly; converting a whole Series with int() is
    # deprecated in pandas and fails when a name occurs more than once.
    row = matches.iloc[0]
    try:
        return {column: cast(row[column]) for column, cast in _STEAMSPY_STAT_CASTS.items()}
    except (KeyError, TypeError, ValueError):
        # Missing column or blank/NaN cell: treat as a miss so the data is re-queried.
        return None


def _steamspy_stats_from_api(appid):
    """Query SteamSpy's appdetails endpoint for ``appid`` and return its stats dict."""
    url = "http://steamspy.com/api.php"
    parameters = {"request": "appdetails", "appid": appid}
    json_data = get_request(url, parameters=parameters).json()

    positive = json_data["positive"]
    negative = json_data["negative"]
    total_ratings = positive + negative
    # Percentage of positive ratings; 0 when the game has no ratings at all.
    positive_percent = (positive / total_ratings) * 100 if total_ratings > 0 else 0

    return {
        'Positive': positive,
        'Negative': negative,
        'TotalRatings': total_ratings,
        'RatingsRatio': positive_percent,
        'UserScore': json_data["userscore"],
        'AvgForever': json_data["average_forever"],
        'Avg2Weeks': json_data["average_2weeks"],
        'MedForever': json_data["median_forever"],
        'Med2Weeks': json_data["median_2weeks"],
    }


def get_steamspy_data(cache_file, game_infos, pull_first_n = None, in_place = False):
    """Collect SteamSpy rating and playtime statistics for each game in ``game_infos``.

    Args:
        cache_file: Path of a CSV cache. If it exists, games whose ``Name`` is
            found in it are served from the cache instead of being requested.
            The file is overwritten with the result before returning.
        game_infos: DataFrame with a ``Name`` column. The app id is taken from
            an ``AppId`` column when present, otherwise from the index.
        pull_first_n: If not None, only the first ``pull_first_n`` rows are
            processed.
        in_place: When False, return a new DataFrame with the columns
            ``AppId``, ``Name`` and the statistics. When True, add the
            statistic columns to ``game_infos`` and return it; rows skipped
            because of ``pull_first_n`` get NaN.

    Returns:
        The new DataFrame, or ``game_infos`` itself when ``in_place`` is True.
    """
    cache = pd.DataFrame()
    if os.path.exists(cache_file):
        cache = pd.read_csv(cache_file, index_col = False)

    stat_columns = list(_STEAMSPY_STAT_CASTS)
    has_appid_column = 'AppId' in game_infos.columns
    collected = []  # one dict per processed game, in game_infos order

    for i, r in game_infos.iterrows():
        if pull_first_n is not None and len(collected) >= pull_first_n:
            break

        name = r['Name']
        appid = r['AppId'] if has_appid_column else i

        stats = _steamspy_stats_from_cache(cache, name)
        if stats is not None:
            log.info("Found {0} in cache".format(name))
        else:
            log.info("Request {0} from SteamSpy".format(name))
            stats = _steamspy_stats_from_api(appid)
            log.debug("Finished request for {0}".format(name))

            # Per documentation, don't make more than 1 request per second
            time.sleep(2)

        record = {'AppId': appid, 'Name': name}
        record.update(stats)
        collected.append(record)

        if pull_first_n is not None:
            log.debug("Pulled {0} of {1}".format(len(collected), pull_first_n))

    if not in_place:
        log.info("Returning new data frame")
        df = pd.DataFrame(collected, columns = ['AppId', 'Name'] + stat_columns)
        df.to_csv(cache_file)
        return df

    log.info("Appending columns to existing data frame")
    # Processed games are always the leading rows, so pad the tail with NaN to
    # keep every new column as long as game_infos when pull_first_n cut the loop short.
    padding = [np.nan] * (len(game_infos) - len(collected))
    for column in stat_columns:
        game_infos[column] = [record[column] for record in collected] + padding

    game_infos.to_csv(cache_file)
    return game_infos

In [113]:
# Add a Bayesian average to better rank the games
def p_calculate_bayesian_average(item_num_ratings, item_ratio_ratings,
                system_avg_num_ratings, system_ratio_ratings):
    """Blend an item's rating with the system-wide rating, weighted by rating counts.

    The item's own ratio is weighted by ``item_num_ratings`` and the system
    ratio by ``system_avg_num_ratings``, both relative to their sum, so items
    with few ratings are pulled towards the system-wide value.

    Works on plain numbers as well as on pandas Series (element-wise). With
    plain Python numbers a zero combined count raises ``ZeroDivisionError``.
    """
    combined_count = item_num_ratings + system_avg_num_ratings
    item_weight = item_num_ratings / combined_count
    system_weight = system_avg_num_ratings / combined_count
    return (item_weight * item_ratio_ratings) + (system_weight * system_ratio_ratings)


def add_bayesian_average_to_gamespy_dataframe(steamspy_df):
    """Add a ``Bayesian`` column to ``steamspy_df`` in place.

    The column holds ``p_calculate_bayesian_average`` of each row's
    ``TotalRatings`` and ``RatingsRatio`` against the column means of the whole
    frame. The frame is modified in place and nothing is returned. An empty
    frame simply gains an empty ``Bayesian`` column.
    """
    # Calculate an overall average for the system
    system_ratings_avg = steamspy_df["RatingsRatio"].mean()
    system_num_ratings_avg = steamspy_df["TotalRatings"].mean()

    # Calculate the Bayesian average for every row at once; the column-wise
    # form also copes with a frame that has no rows.
    steamspy_df['Bayesian'] = p_calculate_bayesian_average(
        steamspy_df["TotalRatings"], steamspy_df["RatingsRatio"],
        system_num_ratings_avg, system_ratings_avg)

In [114]:
# Note, your 'Game details' must be set to 'Public' for this to work.
# This is done in your profile -> Edit Profile -> Privacy Settings -> Game details
# To find your username, check your profile under General -> Custom URL
username = ''
if username == '':
    # No name typed in above: fall back to the one stored in steam_id.dat.
    if os.path.exists("steam_id.dat"):
        log.info('Reading steam ID from file')
        with open("steam_id.dat", "r", encoding="utf-8") as id_file:
            # Strip the trailing newline (or other whitespace) a text file usually carries.
            username = id_file.read().strip()
    else:
        log.critical('Need steam user ID')

# Load the library whichever way the username was obtained.
if username != '':
    # Note: get_game_info does not check if username matches in case the file is already cached.
    game_infos = get_game_info(username)

In [115]:
# Decorate our steam library info with ranking info from SteamSpy.
# in_place=True adds the SteamSpy columns to game_infos itself and returns it.
steamspy_data = get_steamspy_data(
    cache_file="steam_spy_cache.csv",
    game_infos=game_infos,
    in_place=True,
)
# Adds the 'Bayesian' column to steamspy_data in place.
add_bayesian_average_to_gamespy_dataframe(steamspy_df=steamspy_data)

In [116]:
# Do any filtering or re-arranging you want to here.
# Best Bayesian rating first.
steamspy_data = steamspy_data.sort_values('Bayesian', ascending=False)

# Write to file for easy access
steamspy_data.to_csv("bayesian.csv")

## Plotting

In [117]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import Turbo256 as palette
from bokeh.transform import linear_cmap

In [118]:
# Plot most played games (ignore non played games)

colName = "HoursOnRecord"

# Work on a filtered copy: rows without a value in colName are dropped, the
# rest are ordered from most to least hours.
most_played_games = (
    steamspy_data
    .dropna(axis=0, subset=[colName])
    .sort_values(by=[colName], ascending=False)
)

# One bar per game name, in the sorted order.
p = figure(x_range=most_played_games["Name"], width=2000, title="Most played games")
p.vbar(x=most_played_games["Name"], top=most_played_games[colName], width=0.5)
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = "vertical"
show(p)

In [119]:
# Plot most played games in last 2 weeks

colName = "HoursLast2Weeks"

# Work on a filtered copy: rows without a value in colName are dropped, the
# rest are ordered from most to least hours.
most_played_games_2w = (
    steamspy_data
    .dropna(axis=0, subset=[colName])
    .sort_values(by=[colName], ascending=False)
)

# One bar per game name, in the sorted order.
p = figure(x_range=most_played_games_2w["Name"], width=2000, title="Most played games in last 2 weeks")
p.vbar(x=most_played_games_2w["Name"], top=most_played_games_2w[colName], width=0.5)
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = "vertical"
show(p)

In [120]:
# Plot most played games versus their rating (ignore non played games)

colName = "HoursOnRecord"

# Filtered, sorted copy: games without recorded hours are left out.
most_played_games = steamspy_data.dropna(axis=0, subset=[colName])
most_played_games = most_played_games.sort_values(by=[colName], ascending=False)

tooltips = [
    ('Game', '@Name'),
    ('Hours Played', '@HoursOnRecord'),
    ('Rating', '@Bayesian')
]

# Colour each point by hours played, spanning the palette over the observed range.
# Series.min()/max() are used so an empty selection does not raise like the builtins do.
color_mapper = linear_cmap(field_name = "HoursOnRecord",
                           palette=palette,
                           low=most_played_games["HoursOnRecord"].min(),
                           high=most_played_games["HoursOnRecord"].max())

select_tools = ['box_select', 'lasso_select', 'poly_select', 'tap', 'reset']

data_source = ColumnDataSource(most_played_games)
# height/width match the keyword used by the bar charts in the other plotting cells;
# the older plot_height/plot_width spellings were removed in Bokeh 3.
p = figure(height = 1000,
           width = 2000,
           title = "Most played vs ranking",
           tools = select_tools)
p.circle(x = "HoursOnRecord", y = "Bayesian",
         color = color_mapper,
         source = data_source,
         selection_color = 'deepskyblue',
         nonselection_color = 'lightgray', radius = 1)
p.add_tools(HoverTool(tooltips=tooltips))
show(p)