# STEAMSPY - API 

#### Here, we are going to pull data from "steamspy.com/api.php?request=all&page= "
#### When the project was started, there were 66 pages, numbered 0 to 65. More content has probably been added since then.
e.g. URL of the 2nd page: steamspy.com/api.php?request=all&page=1

In [2]:
# Importing libraries
import requests
import time
import datetime
import csv
import pandas as pd
import numpy as np
import json
import os
import sys
print(sys.version)

3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]


In [3]:
# Browser-like HTTP request headers passed to requests.get for the page downloads
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

In [None]:
# Iterate through the pages with a for loop and merge every page's JSON object
# into one dict (turned into a single DataFrame in the next step).
# NOTE(review): the page count (66) is hard-coded; confirm it still matches the API.
# NOTE(review): SteamSpy may enforce a stricter rate limit for "request=all" --
# confirm against the API docs whether a delay between pages is required.
total_results = {}
for page_num in range(0, 66):
    URL = "https://steamspy.com/api.php?request=all&page=" + str(page_num)
    print("Downloading", URL)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=URL, headers=headers, timeout=60)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    # Merge in place; `total_results | data` rebuilt the full dict on every page.
    total_results.update(data)

In [6]:
# Build a DataFrame from the downloaded dict. The outer keys of total_results
# become columns, so transpose to get one row per entry, then replace the
# resulting index with a default 0..n-1 index.
# (The former `pd.json_normalize(total_results["570"])` line was removed: its
# result was never used and it raised KeyError whenever key "570" was absent.)
df = pd.DataFrame(data=total_results).transpose().reset_index(drop=True)
df

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
0,570,Dota 2,Valve,Valve,,1688350,364904,0,"200,000,000 .. 500,000,000",31189,1568,2476,1343,0,0,0,576198
1,730,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,,6686707,840120,0,"100,000,000 .. 200,000,000",35229,596,4531,415,0,0,0,1164766
2,1172470,Apex Legends,Respawn Entertainment,Electronic Arts,,575033,142955,0,"50,000,000 .. 100,000,000",9972,625,2355,522,0,0,0,357478
3,578080,PUBG: BATTLEGROUNDS,"KRAFTON, Inc.","KRAFTON, Inc.",,1275045,946006,0,"50,000,000 .. 100,000,000",22181,104,8186,123,0,0,0,353924
4,1063730,New World,Amazon Games,Amazon Games,,182925,77355,0,"50,000,000 .. 100,000,000",7441,3,10298,3,3999,3999,0,20515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63839,557850,Magnificent Ships: Volume 1,VROOM,VROOM,,14,1,0,"0 .. 20,000",0,0,0,0,999,999,0,0
63840,601890,PD Howler 11,"Daniel Ritchie - PDHowler.com, Philip Staiger ...",KPL,,14,3,0,"0 .. 20,000",0,0,0,0,799,799,0,0
63841,746010,Rig or Skill: PC Brawl,Don't Bite Devs,No Gravity Games,,14,11,0,"0 .. 20,000",0,0,0,0,0,0,0,0
63842,996160,Quest for the Golden Duck,Bigosaur,Bigosaur,,14,3,0,"0 .. 20,000",0,0,0,0,999,999,0,0


In [None]:
# Write the combined DataFrame to disk as CSV, leaving out the row index
df.to_csv(r"..\SteamSpy\steamspy_data.csv", index=False)

#### Now that we have our data in a CSV file, we will read the file and proceed to the next step,
#### where we will use the https://steamspy.com/api.php URL with the parameters "request" and "appid" to get the details for each app.

In [None]:
# Load the previously saved page data back from the CSV file
steamspy_all = pd.read_csv(r"..\SteamSpy\steamspy_data.csv")

In [None]:
# Cast the appid column to int and collect its values into a plain Python list
steamspy_all["appid"] = steamspy_all["appid"].astype(int)
appid_list = steamspy_all["appid"].tolist()

#### Defining the function that downloads from the API, handles errors and saves the results to a file

In [None]:
# Retry policy for a single app-details request
max_retries = 10  # attempts per app ID before giving up on it
retry_delay = 1  # seconds to wait between attempts


def get_details(appids_chunk):
    """Download SteamSpy app details for each ID and append them to the details CSV.

    For every app ID in ``appids_chunk`` a GET request is sent to
    ``https://steamspy.com/api.php`` with ``request=appdetails``. A request
    that raises a ``requests`` exception or returns invalid JSON is retried up
    to ``max_retries`` times, sleeping ``retry_delay`` seconds between attempts.
    An app whose request never succeeds is skipped, so nothing is written for
    it and a later resume pass can pick it up again.

    Each successful response is normalized and appended to
    ``..\\SteamSpy\\details_data.csv``; the header row is written only when the
    file does not exist yet.

    Args:
        appids_chunk: iterable of app IDs to download.

    Returns:
        None. As a side effect the module-level ``det_df`` is rebound to the
        DataFrame of the last app written (or to an empty DataFrame when
        nothing was written).
    """
    global det_df
    det_df = pd.DataFrame()
    URL = "https://steamspy.com/api.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    details_path = r"..\SteamSpy\details_data.csv"

    for appid in appids_chunk:
        params = {"request": "appdetails", "appid": str(appid)}
        # Reset per app so a failed download can never reuse the previous app's data.
        data = None
        # Requests occasionally fail, so each one is retried a limited number of times.
        for retry in range(1, max_retries + 1):
            try:
                # The timeout turns a stalled connection into a retryable error.
                response = requests.get(
                    url=URL, params=params, headers=headers, timeout=30
                )
                response.raise_for_status()
                data = json.loads(response.text)
                break
            except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
                print("Error:", e)
                if retry < max_retries:
                    print("Retrying in {} seconds...".format(retry_delay))
                    time.sleep(retry_delay)
                else:
                    print("Max retries exceeded. Request failed.")

        if data is None:
            # Every attempt failed: skip this app instead of crashing on an
            # unbound name or writing a duplicate of the previous app's row.
            continue

        # max_level=0 because tags is a nested dict; otherwise it would be
        # expanded into columns such as tags.MOBA.
        det_df = pd.json_normalize(data, max_level=0)

        # Append mode creates the file when missing; emit the header only then.
        write_header = not os.path.isfile(details_path)
        det_df.to_csv(details_path, mode="a", header=write_header)

#### After defining the function, we can proceed to downloading.
#### Downloading the data in chunks makes sure that we are not sending too many requests at once. In case of an unexpected error or crash, we build a check list of the app IDs that have already been saved, so that downloading and saving can continue from where it left off.

In [None]:
# Download the app details in chunks, skipping every app ID that is already
# present in the details CSV so an interrupted run resumes where it stopped.
chunk_size = 100
details_path = r"..\SteamSpy\details_data.csv"
for i in range(0, len(appid_list), chunk_size):
    appids_chunk = appid_list[i : i + chunk_size]

    # Build the check list once per chunk (not once per app ID) and keep it
    # in a set so each membership test is O(1).
    if os.path.isfile(details_path):
        check_data = pd.read_csv(details_path, usecols=["appid"])
        existing_appids = set(check_data["appid"].tolist())  # the check list
    else:
        # Nothing has been saved yet, so the whole chunk still has to be fetched.
        existing_appids = set()

    appids_chunk_fixed = [
        appid for appid in appids_chunk if appid not in existing_appids
    ]
    if not appids_chunk_fixed:
        # Chunk already fully downloaded: no request is sent, so no need to wait.
        continue

    print("waiting 1 sec")
    time.sleep(1)
    get_details(appids_chunk_fixed)
    print("completed successfully")

#### Our data has been downloaded and saved to details_data.csv successfully. Proceeding to data cleaning.

### Useful information on variables:
#### total_results
 - data downloaded with all pages
#### df
 - normalized and transposed DataFrame of total_results
#### steamspy_data.csv
 - df saved into csv file, contains data from "steamspy.com/api.php?request=all&page= " with all pages
#### steamspy_all = df
 - reading steamspy_data.csv file
#### appid_list
 - list of app IDs, created from steamspy_all DataFrame
#### existing_appids
 - a check list created from details_data.csv to check whether an appid has already been downloaded. App IDs that already exist in the file are skipped, so downloading continues from where it left off.
#### det_df
 - DataFrame for details API downloaded from "https://steamspy.com/api.php"
#### details_data.csv
 - csv file saved from det_df