In [None]:
from bs4 import BeautifulSoup
import requests, os, time, json
import pandas as pd

# Scraped from
# https://steamcharts.com/top

# Create directory if it doesn't exist
spath = os.path.join('steam_data')
os.makedirs(spath, exist_ok=True)  # no-op when the directory is already there


def load_games_dict(path):
    """Load a previously saved app-id -> game-name mapping.

    Parameters
    ----------
    path : str
        Location of the JSON file written by an earlier run.

    Returns
    -------
    dict
        Mapping with int keys. Empty when ``path`` does not exist.

    Notes
    -----
    JSON object keys are always strings, so they are converted back to int
    here. The scraping loop inserts int keys; without the conversion the
    dict would hold both ``"730"`` and ``730`` for the same game.
    """
    if not os.path.exists(path):
        return {}
    # `with` guarantees the handle is closed even if the JSON is malformed.
    with open(path) as f:
        return {int(app_id): name for app_id, name in json.load(f).items()}


# Set up games_dict
games_dict = load_games_dict(os.path.join(spath, 'games_dict.json'))

In [None]:
# Gets the dataframe for one game
def getGameCounts(appid, timeout=30):
    """Fetch the first HTML table from one game's steamcharts.com page.

    Parameters
    ----------
    appid : str
        URL path of the game as it appears in the listing's link ``href``;
        it is appended directly to ``https://steamcharts.com`` and its last
        ``/``-separated segment must be the numeric app id.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the page, plus an integer ``App ID``
        column holding the id taken from ``appid``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    ValueError
        If the last path segment is not an integer (pandas also raises
        ``ValueError`` when the page contains no table).
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell does not bring in `io`.
    from io import StringIO

    r = requests.get('https://steamcharts.com{}'.format(appid), timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Passing raw HTML text to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    pc_df = pd.read_html(StringIO(str(soup)))[0]
    pc_df['App ID'] = int(appid.split('/')[-1])
    return pc_df

In [None]:
end_num = 40 # 25 games per page
page_delay = 60  # seconds to pause between listing pages
request_timeout = 30  # seconds before a stalled listing request is abandoned
frames = []  # one DataFrame per game; concatenated once at the end

url = 'https://steamcharts.com/top/p.{}'

try:
    # for each page
    for i in range(1, end_num+1):
        r = requests.get(url.format(i), timeout=request_timeout)
        r.raise_for_status()
        msoup = BeautifulSoup(r.text, "html.parser")

        # for each game on the page
        for game in msoup.find_all('td', attrs={'class': 'game-name left'}):
            link = game.find('a')
            if link is None or not link.get('href'):
                # No link in this cell means there is nothing to look up.
                continue
            appid = link.get('href')
            games_dict[int(appid.split('/')[-1])] = game.text.strip()
            try:
                frames.append(getGameCounts(appid))
            except Exception as exc:
                # Best effort: skip this game but report the real cause.
                # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                print('failed to load', appid, '-', repr(exc))

        # to slow it down a bit
        if i != end_num:
            time.sleep(page_delay)
finally:
    # Build the combined frame even if the loop was aborted part-way, so
    # whatever was collected so far is still available to the next cell.
    # DataFrame.append no longer exists in pandas 2.x; a single concat is
    # also linear rather than quadratic in the number of games.
    all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Save the files
# Keys may be str (entries read back from JSON) or int (entries added while
# scraping). Normalise to int so the lookup below matches the integer
# 'App ID' column and the dump cannot contain the same id twice.
games_dict = {int(app_id): name for app_id, name in games_dict.items()}

all_df['Game'] = all_df['App ID'].astype(int).map(games_dict)
all_df.to_csv(os.path.join(spath, 'steam_charts.csv'), index=False)

# `with` closes the file even if serialisation fails part-way.
with open(os.path.join(spath, 'games_dict.json'), 'w') as f:
    json.dump(games_dict, f)