# Steam Charts scraping

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer
import requests
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt
from fake_useragent import UserAgent
import time
import csv
import seaborn as sns

In [2]:
#load the cleaned Steam dataset and keep only the two columns needed
#to build SteamCharts urls and label the results: appid and name
game_ids = pd.read_csv("../data/steam_clean.csv")[["appid", "name"]]

SteamCharts was scraped for each game's average and peak player numbers in May, June and July 2019, as well as its all-time peak player count.

Fortunately, SteamCharts URLs are organised by Steam's appids, so cycling through them was very straightforward.

In [8]:
#initial write: create the csv (overwriting any previous file), write the header,
#and scrape the first 11 games
ua=UserAgent()
mask = SoupStrainer(["tr", "td", "div"])

#months to collect, listed in the same order as the csv columns below
months = ["July 2019", "June 2019", "May 2019"]

#start csv writer
with open("../data/steam_player_nos_v2.csv", "w", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["appid", "name", "all_time_peak", 
                     "jul_19_av", "jul_19_peak", 
                     "jun_19_av", "jun_19_peak",
                     "may_19_av", "may_19_peak"])
    
    for i in tqdm_notebook(range(0,11)):
        appid = game_ids["appid"][i]
        name = game_ids["name"][i]

        #cycle through ids and add that into the url
        #(timeout so a stalled connection raises instead of hanging the whole scrape)
        url = "https://steamcharts.com/app/{}".format(appid)
        r = requests.get(url, headers={"User-Agent" : ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)
        
        #find the all time peak players stat; stays nan if the page has none
        all_time_peak = np.nan
        for item in soup.find_all("div", class_="app-stat"):
            if "all-time" in item.text and item.span is not None:
                all_time_peak = item.span.text
                break

        #collect (average, peak) per wanted month, keyed by month name so the
        #values land in the right columns whatever order the rows appear in
        monthly = {}
        for item in soup.find_all("tr"):
            month_cell = item.find("td", class_="month-cell left")
            if month_cell is None:
                continue
            month = month_cell.text.strip()
            if month not in months:
                continue
            av_cell = item.find("td", class_="right num-f")
            peak_cell = item.find("td", class_="right num")
            monthly[month] = (av_cell.text if av_cell is not None else np.nan,
                              peak_cell.text if peak_cell is not None else np.nan)

        #always write exactly one full-width row per game: appid and name are
        #always present, anything that was not found is written as nan
        row = [appid, name, all_time_peak]
        for month in months:
            row.extend(monthly.get(month, (np.nan, np.nan)))
        writer.writerow(row)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [13]:
#subsequent write: resume the scrape by appending to the existing csv,
#starting from the first game that does not have a row yet
ua=UserAgent()
mask = SoupStrainer(["tr", "td", "div"])

#months to collect, listed in the same order as the csv columns
#(jul_19_*, jun_19_*, may_19_*)
months = ["July 2019", "June 2019", "May 2019"]

#number of games already written = number of data rows in the csv;
#read it before reopening the file for appending
start = len(pd.read_csv("../data/steam_player_nos_v2.csv"))

#start csv writer
with open("../data/steam_player_nos_v2.csv", "a", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    
    for i in tqdm_notebook(range(start, len(game_ids))):
        appid = game_ids["appid"][i]
        name = game_ids["name"][i]

        #timeout so a stalled connection raises instead of hanging the scrape;
        #after an error this cell can simply be re-run to carry on
        url = "https://steamcharts.com/app/{}".format(appid)
        r = requests.get(url, headers={"User-Agent" : ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)
        
        #find the all time peak players stat; stays nan if the page has none
        all_time_peak = np.nan
        for item in soup.find_all("div", class_="app-stat"):
            if "all-time" in item.text and item.span is not None:
                all_time_peak = item.span.text
                break

        #collect (average, peak) per wanted month, keyed by month name so the
        #values land in the right columns whatever order the rows appear in
        monthly = {}
        for item in soup.find_all("tr"):
            month_cell = item.find("td", class_="month-cell left")
            if month_cell is None:
                continue
            month = month_cell.text.strip()
            if month not in months:
                continue
            av_cell = item.find("td", class_="right num-f")
            peak_cell = item.find("td", class_="right num")
            monthly[month] = (av_cell.text if av_cell is not None else np.nan,
                              peak_cell.text if peak_cell is not None else np.nan)

        #always write exactly one full-width row per game (this also keeps the
        #resume count above in step with game_ids); missing values are nan
        row = [appid, name, all_time_peak]
        for month in months:
            row.extend(monthly.get(month, (np.nan, np.nan)))
        writer.writerow(row)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
#sanity check: show the last five rows written to the scraped csv
pd.read_csv("../data/steam_player_nos_v2.csv").tail(5)

Unnamed: 0,appid,name,all_time_peak,jul_19_av,jul_19_peak,jun_19_av,jun_19_peak,may_19_av,may_19_peak
22574,998890,The Colony,1.0,,,,,,
22575,999750,BLASTER LiLO,1.0,,,,,,
22576,1001490,Tower Behind the Moon,18.0,0.15,2.0,0.22,2.0,0.36,3.0
22577,1001880,aMAZE Valentine,2.0,0.01,1.0,0.02,1.0,0.04,1.0
22578,1002490,Roulette Simulator 2,2.0,0.02,1.0,0.02,1.0,0.02,1.0
