In [1]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show all columns
# Use the fully qualified option name: the short pattern "max_columns" is
# matched against every registered option and becomes ambiguous (OptionError)
# on pandas versions that also define "styler.render.max_columns".
pd.set_option("display.max_columns", 100)

In [2]:
def query(url,parameter,interval,maxtrial,verbose=True):
    '''
    return json of requesting url with parameters, otherwise retry with an interval
    -----------------
    Parameters:
    -- url: string, API
    -- parameter: {'parameters' : 'value'}, parameters attached
    -- interval: int, time of sleep (seconds) between two attempts
    -- maxtrial: int, maximum number of attempt; values <= 0 mean "do not try"
    -- verbose: bool, optional (default True); when False the
                'request successful!' message is not printed
    -----------------
    Returns:
    -- json (decoded body of the first response with status 200),
       or None when every attempt failed
    '''
    for attempt in range(maxtrial):
        try:
            response = requests.get(url=url,params=parameter)
        except requests.RequestException as error:
            # connection problems count as a failed attempt instead of
            # aborting the whole retry loop
            print('request raised {}'.format(type(error).__name__))
            response = None

        if response is not None and response.status_code == 200:
            if verbose:
                print('request successful!')
            return response.json()

        # only wait when another attempt is actually going to happen
        if attempt < maxtrial - 1:
            print('An error occurs! Retry after {} seconds'.format(interval))
            time.sleep(interval)

    print("request failed, stop trying!")
    return None
        
        

In [3]:
# smoke test: request page 0 of the SteamSpy "all" listing and count the entries
test = query(
    url='https://steamspy.com/api.php',
    parameter={"request": "all", "page": "0"},
    interval=3,
    maxtrial=3,
)
print(len(test))

request successful!
1000


In [4]:
# preview only the first few records instead of dumping the whole page into the output
dict(list(test.items())[:3])

{'1627140': {'appid': 1627140,
  'name': 'Sabre Team',
  'developer': 'Krisalis Software',
  'publisher': 'Ziggurat',
  'score_rank': '',
  'positive': 0,
  'negative': 2,
  'userscore': 0,
  'owners': '1,000,000 .. 2,000,000',
  'average_forever': 0,
  'average_2weeks': 0,
  'median_forever': 0,
  'median_2weeks': 0,
  'price': '699',
  'initialprice': '699',
  'discount': '0',
  'ccu': 0},
 '1353950': {'appid': 1353950,
  'name': 'Ring of Fire: Prologue',
  'developer': 'de Fault, Chard',
  'publisher': 'Far Few Giants',
  'score_rank': '',
  'positive': 9,
  'negative': 2,
  'userscore': 0,
  'owners': '1,000,000 .. 2,000,000',
  'average_forever': 0,
  'average_2weeks': 0,
  'median_forever': 0,
  'median_2weeks': 0,
  'price': '0',
  'initialprice': '0',
  'discount': '0',
  'ccu': 0},
 '1619450': {'appid': 1619450,
  'name': 'Heart of a Warrior',
  'developer': 'Techworld Communication',
  'publisher': 'Techworld Communication',
  'score_rank': '',
  'positive': 0,
  'negative': 

In [5]:
class appidExtractor:
    '''
    download the paginated "all" listing of the SteamSpy API into a csv file

    Progress is stored in a trace file that holds the index of the next page
    to request, so an interrupted download continues where it stopped when
    run() is called again.
    -----------------
    Parameters:
    -- name: string, data will be stored in name.csv and the progress in
             name_trace.txt (both in the current working directory)
    -- interval: int, time of sleep if request fail
    '''
    API_URL = "https://steamspy.com/api.php"
    PAGE_SIZE = 1000  # a page with fewer entries than this is treated as the last one
    PAGE_DELAY = 10   # seconds to wait between two page requests
    MAX_TRIALS = 3    # attempts per page before giving up

    def __init__(self, name, interval):
        '''
        -- csvpath: path of csv file
        -- tracepath: path of txt that stores pathidx
        -- trace: next page index to request. !!It is a string!!
        -- fieldnames: names of the columns to be written into .csv file
        '''
        self.interval = interval
        self.trace = "0"
        self.fieldnames = []
        self.csvpath = os.path.join(os.getcwd(), name + ".csv")
        self.tracepath = os.path.join(os.getcwd(), name + "_trace.txt")

    def writeheader(self):
        '''
        append a header row built from self.fieldnames to the csv file
        '''
        with open(self.csvpath, mode='a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames, extrasaction='ignore')
            writer.writeheader()

    def initialize(self):
        '''
        prepare csv file and trace file for a run
        -- both files exist: resume; self.trace is loaded from the trace file
           and the csv is left untouched (a header is only written if the csv
           is still empty, so resuming never inserts a second header row)
        -- otherwise: reset the progress; the csv is (re)created containing
           only the header and the trace is set to page 0
        -----------------
        Raises:
        -- ValueError if the existing trace file does not contain an integer
        '''
        if os.path.isfile(self.csvpath) and os.path.isfile(self.tracepath):
            self.get_trace()
            try:
                int(self.trace)
            except ValueError:
                raise ValueError(
                    "trace file {} does not contain a page index: {!r}".format(
                        self.tracepath, self.trace)) from None
            if os.path.getsize(self.csvpath) == 0:
                self.writeheader()
            return

        # only one (or none) of the two files exists -> start from scratch
        with open(self.csvpath, 'w', newline='', encoding='utf-8'):
            pass
        self.set_trace(0)  # keeps self.trace and the trace file in sync
        self.writeheader()

    def deleteAll(self):
        '''delete both csvfile and tracefile'''
        for path in (self.csvpath, self.tracepath):
            if os.path.exists(path):
                os.remove(path)

    def get_trace(self):
        '''
        read page idx from tracefile and set self.trace (as a string)
        '''
        with open(self.tracepath, 'r') as tracefile:
            self.trace = tracefile.read().strip()

    def set_trace(self, val):
        '''
        set page idx in both self.trace and tracefile
        -- val: int
        '''
        self.trace = str(val)
        with open(self.tracepath, 'w') as tracefile:
            tracefile.write(self.trace)

    def set_fieldnames(self, fieldnames):
        '''
        set the names of the columns to be written into .csv file
        !!! must be set before run() !!!
        -- fieldnames: list of string
        '''
        self.fieldnames = fieldnames

    def process_query(self):
        '''
        request page after page, starting at the index stored in the trace
        file, and append every entry to the csv file

        After a page has been written the trace is advanced, so a later call
        continues with the next page. The loop ends with the first page that
        holds fewer than PAGE_SIZE entries.
        -----------------
        Raises:
        -- RuntimeError if a page could not be retrieved; the trace still
           points at that page
        '''
        while True:
            self.get_trace()
            para = {"request": "all", "page": self.trace}
            data = query(self.API_URL, para, self.interval, self.MAX_TRIALS)  # request data
            if data is None:
                raise RuntimeError("request for page {} failed".format(self.trace))

            # write dictionary data into csv, using the same columns as the header
            with open(self.csvpath, mode='a', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames, extrasaction='ignore')
                writer.writerows(data.values())
            print("successfully write {number} entries, page {trace}".format(number=len(data), trace=self.trace))

            # update trace locally
            self.set_trace(int(self.trace) + 1)
            if len(data) < self.PAGE_SIZE:  # check if it is the last page
                print("request finished")
                return
            time.sleep(self.PAGE_DELAY)

    def run(self):
        '''
        request data from API
        -----------------
        Raises:
        -- ValueError if set_fieldnames() has not been called with a
           non-empty list
        '''
        if len(self.fieldnames) == 0:
            raise ValueError("fieldnames cannot be None")  # if the fieldnames are not set, raise
        self.initialize()
        self.process_query()
        
        

In [25]:
# collector for the (appid, name) pairs; it works on appid.csv / appid_trace.txt
extractor = appidExtractor(name="appid", interval=10)
extractor.set_fieldnames(fieldnames=['appid', 'name'])

In [26]:
# remove appid.csv and appid_trace.txt (if present) so the next run starts from page 0
extractor.deleteAll()

In [27]:
# download the listing page by page into appid.csv, pausing 10 s between page requests
extractor.run()

request successful!
successfully write 1000 entries, page 0
request successful!
successfully write 1000 entries, page 1
request successful!
successfully write 1000 entries, page 2
request successful!
successfully write 1000 entries, page 3
request successful!
successfully write 1000 entries, page 4
request successful!
successfully write 1000 entries, page 5
request successful!
successfully write 1000 entries, page 6
request successful!
successfully write 1000 entries, page 7
request successful!
successfully write 1000 entries, page 8
request successful!
successfully write 1000 entries, page 9
request successful!
successfully write 1000 entries, page 10
request successful!
successfully write 1000 entries, page 11
request successful!
successfully write 1000 entries, page 12
request successful!
successfully write 1000 entries, page 13
request successful!
successfully write 1000 entries, page 14
request successful!
successfully write 1000 entries, page 15
request successful!
successfully w

In [6]:
# load the collected ids; reset_index() keeps the row number as an explicit "index" column
appid_df = pd.read_csv("appid.csv").reset_index().sort_index()
appid_df

Unnamed: 0,index,appid,name
0,0,570,Dota 2
1,1,730,Counter-Strike: Global Offensive
2,2,578080,PUBG: BATTLEGROUNDS
3,3,1063730,New World
4,4,440,Team Fortress 2
...,...,...,...
56970,56970,1506770,Crawler
56971,56971,1868280,Coins Collector Simulator
56972,56972,1466190,You Arrive in a Town
56973,56973,1853990,Hidden Treasures in the Forest of Dreams


In [None]:
class detailExtractor(appidExtractor):
    '''
    request the details of every app listed in appid.csv, batch by batch,
    and append them to name.csv

    The trace file stores the row index (into appid.csv) of the next app to
    request, so an interrupted run continues with the first unfinished batch.
    -----------------
    Parameters:
    -- name: string, data will be stored in name.csv
    -- interval: int, time of sleep if request fail
    -- parseID: int, 0 selects the Steam store API, 1 selects SteamSpy
                (the SteamSpy parser/writer are still empty placeholders)
    -- batchsize: int, number of apps requested before the csv and the trace
                  are updated
    '''
    def __init__(self, name, interval, parseID, batchsize):
        super().__init__(name, interval)
        appidpath = os.path.join(os.getcwd(), "appid.csv")
        self.appid_df = pd.read_csv(appidpath)
        parselist = [self.parseSteam, self.parseSteamSpy]
        writerlist = [self.writeSteam, self.writeSteamSpy]
        self.parse = parselist[parseID]
        self.write = writerlist[parseID]
        self.batchsize = batchsize
        self.Null_response = {}

    def parseSteamSpy(self, appid):
        '''
        placeholder, not implemented yet
        -- return: None
        '''
        return None

    def parseSteam(self, appid):
        '''
        parse Steam API to request app detail
        -- appid: string
        -- return: dict (decoded json), or None if the request failed
        '''
        url = "http://store.steampowered.com/api/appdetails/"
        parameters = {"appids": appid}
        # query() is declared with four parameters: url, parameter, interval, maxtrial
        return query(url, parameters, self.interval, 3)

    def writeSteam(self, data_batch):
        '''
        append one csv row per response of the batch
        -- data_batch: list of dict; the first value of every dict must hold
                       the row under its 'data' key
        '''
        with open(self.csvpath, mode='a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(
                csvfile, fieldnames=self.fieldnames, extrasaction='ignore')
            for data in data_batch:
                detail_dict = list(data.values())[0]['data']
                writer.writerow(detail_dict)

    def writeSteamSpy(self, data_batch=None):
        '''
        placeholder, not implemented yet; accepts the batch so that it can be
        called like writeSteam
        '''
        return None

    def process_batch(self, start, end):
        '''
        request the details of the apps in rows start..end (both inclusive)
        of appid.csv
        -- return: list of dict, one response per app; a response whose first
                   value has a false 'success' entry is replaced by
                   self.Null_response
        -----------------
        Raises:
        -- RuntimeError if the parser returned nothing for an app (e.g. the
           request failed after all retries); nothing of the batch has been
           written at that point, so the batch is repeated on the next run
        '''
        appid_batch = self.appid_df.loc[start:end, 'appid'].tolist()
        data_batch = []
        for appid in appid_batch:
            response = self.parse(appid)
            if not response:
                raise RuntimeError(
                    "no response for appid {}".format(appid))
            if not list(response.values())[0]['success']:
                response = self.Null_response
            data_batch.append(response)
            time.sleep(1)
        print("successfully request detail {} to {}".format(start, end))
        return data_batch

    def get_Steam_Null_response(self):
        '''
        build the substitute response used for apps without details: it has
        the same nesting that writeSteam reads and "NaN" for every field name
        '''
        Null_data = dict.fromkeys(self.fieldnames, "NaN")
        self.Null_response = {'NaN': {'success': True, 'data': Null_data}}

    def run(self):
        '''
        request data from API
        -----------------
        Raises:
        -- ValueError if the fieldnames are not set or batchsize is < 1
        '''
        if len(self.fieldnames) == 0:
            # if the fieldnames are not set, raise
            raise ValueError("fieldnames cannot be None")
        if self.batchsize < 1:
            # a non-positive batch size would never advance the trace
            raise ValueError("batchsize must be at least 1")
        self.get_Steam_Null_response()
        self.initialize()
        # take the start row from the trace file rather than from whatever
        # self.trace held before initialize()
        self.get_trace()
        length = self.appid_df.shape[0]
        start = int(self.trace)
        while start < length:
            end = min(start + self.batchsize, length) - 1
            data_batch = self.process_batch(start, end)
            self.write(data_batch)
            self.set_trace(end + 1)
            start = end + 1