In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer
import requests
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt
from fake_useragent import UserAgent
import time
import csv

In [2]:
# Load the cleaned Steam data, keeping only the identifier and title columns
game_ids = pd.read_csv("../data/steam_clean.csv")[["appid", "name"]]

# Cleaning names

Unfortunately, ITAD (IsThereAnyDeal) doesn't use Steam appids to identify games (as it is not a Steam-only website), so the game names had to be modified to match its URLs as closely as possible.

In [3]:
# Build a URL-style slug for every game name, for use in the scraping URLs.

# digit -> Roman-numeral substitutions; "0" is deliberately absent, so zeros are kept
digits = [("1", "i"),
          ("2", "ii"),
          ("3", "iii"),
          ("4", "iv"),
          ("5", "v"),
          ("6", "vi"),
          ("7", "vii"),
          ("8", "viii"),
          ("9", "ix")]

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the names unchanged.
name_clean = (
    game_ids["name"]
    .str.replace(r"([\W]+)", "", regex=True)   # drop every run of non-word characters
    .str.lower()
    .str.replace(r"(^the)", "", regex=True)    # drop a leading "the"
)

# each digit is replaced independently as a literal string (e.g. "10" becomes "i0")
for dig, rom in digits:
    name_clean = name_clean.str.replace(dig, rom, regex=False)

game_ids["name_clean"] = name_clean

In [24]:
# Preview the first rows, including the new name_clean column
game_ids.head()

Unnamed: 0,appid,name,name_clean
0,10,Counter-Strike,counterstrike
1,20,Team Fortress Classic,teamfortressclassic
2,30,Day of Defeat,dayofdefeat
3,40,Deathmatch Classic,deathmatchclassic
4,50,Half-Life: Opposing Force,halflifeopposingforce


# First Scrape

In [51]:
# Initial write: scrape the first ten games and create the output CSV.
# Mode "w" truncates any existing file, and a fresh header row is written.
delays = [0, 1, 2]                          # candidate pauses between requests, in seconds
sale_dates = ["2019-06-26", "2019-06-25"]   # date strings looked for in each Steam entry
ua = UserAgent()
mask = SoupStrainer(["div", "span"])        # only parse the tags inspected below

#start csv writer
with open("../data/summer_sale_19.csv", "w", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["appid", "name", "discount"])

    #loop through game ids and go to URL
    for i in tqdm_notebook(range(0, 10)):

        # throttle requests; the drawn delay used to be computed but never applied
        time.sleep(np.random.choice(delays))

        url = "https://isthereanydeal.com/game/{}/history/".format(game_ids["name_clean"][i])
        # timeout so a stalled connection raises instead of blocking the scrape forever
        r = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)

        if soup.find("div", attrs="widget__nodata"):
            #if no page found, record nan
            discount = np.nan
        else:
            #if page found, record the discount (or 0 if none)
            discount = 0
            for item in soup.find_all("div", attrs={"class": "lg2 game",
                                                    "data-shop": "steam"}):
                if any(day in item.span.text for day in sale_dates):
                    # NOTE(review): assumes the second "lg2__cut" span holds the
                    # discount text - confirm against the page markup
                    discount = item.find_all("span", "lg2__cut")[1].text
                    break

        #add info to csv (one row per game, in game_ids order)
        writer.writerow([game_ids["appid"][i], game_ids["name"][i], discount])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [66]:
# Show the last rows written to the scrape CSV so they can be compared with
# the rows at the same positions in game_ids
pd.read_csv("../data/summer_sale_19.csv").tail()

Unnamed: 0,appid,name,discount
9209,503080,Echoes of the Past: Kingdom of Despair Collect...,
9210,503090,Fairy Maids,50%
9211,503100,Black Hat Cooperative,0
9212,503130,Red is Dead - The Complex Fun Random Level Fas...,
9213,503140,Dungeon Punks,0


In [68]:
# Row at position 9213 of game_ids, for comparison with the last scraped row
game_ids.iloc[9213]

appid                503140
name          Dungeon Punks
name_clean     dungeonpunks
Name: 9213, dtype: object

In [69]:
# Subsequent appends: continue the first scrape from wherever the CSV stops.
ua = UserAgent()
mask = SoupStrainer(["div", "span"])        # only parse the tags inspected below
sale_dates = ["2019-06-26", "2019-06-25"]   # date strings looked for in each Steam entry
output_path = "../data/summer_sale_19.csv"

# Resume point = number of data rows already saved. Rows are written in
# game_ids order, so this is also the position of the next game to fetch.
# (Previously a hard-coded 9214, which is only right for one state of the file.)
start = len(pd.read_csv(output_path))

#start csv writer
with open(output_path, "a", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)

    #loop through game ids and go to URL
    for i in tqdm_notebook(range(start, len(game_ids))):

        # random pause between requests (0 is three times as likely as each other value)
        time.sleep(np.random.choice([0,0,0,1,2,3,4]))

        url = "https://isthereanydeal.com/game/{}/history/".format(game_ids["name_clean"][i])
        # timeout so a stalled connection raises instead of blocking the scrape forever
        r = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)

        if soup.find("div", attrs="widget__nodata"):
            #if no page found, record nan
            discount = np.nan
        else:
            #if page found, record the discount (or 0 if none)
            discount = 0
            for item in soup.find_all("div", attrs={"class": "lg2 game",
                                                    "data-shop": "steam"}):
                if any(day in item.span.text for day in sale_dates):
                    # NOTE(review): assumes the second "lg2__cut" span holds the
                    # discount text - confirm against the page markup
                    discount = item.find_all("span", "lg2__cut")[1].text
                    break

        #add info to csv
        writer.writerow([game_ids["appid"][i], game_ids["name"][i], discount])

HBox(children=(IntProgress(value=0, max=13365), HTML(value='')))

In [4]:
# Load the results of the first scrape from the CSV written above
sale = pd.read_csv("../data/summer_sale_19.csv")

Initial scrape:

- 11907 - on sale
- 7269 - not on sale
- 3403 - page not found


# Second Scrape

Looking at the games that were missed, a further change was made to the names to better match the URLs: removing every "the".

In [19]:
# Preview the games whose page was not found in the first scrape
sale.loc[sale["discount"].isna()].head()

Unnamed: 0,appid,name,discount
29,1630,Disciples II: Rise of the Elves,
36,2100,Dark Messiah of Might & Magic,
45,2330,QUAKE II Mission Pack: The Reckoning,
48,2360,HeXen: Beyond Heretic,
49,2370,HeXen: Deathkings of the Dark Citadel,


In [31]:
# Rows whose first lookup found no page are the ones to retry
missing_page = sale["discount"].isnull()

# Drop the empty discount column and re-attach the cleaned name for each appid
sale_v2 = (
    sale[missing_page]
    .drop(columns="discount")
    .merge(game_ids[["appid", "name_clean"]], on="appid", how="left")
)

# Strip every remaining "the" from the cleaned names
sale_v2["name_clean"] = sale_v2["name_clean"].str.replace("the","")

The same scraper as before is used, this time reading the names from `sale_v2`.

In [46]:
# Initial write: retry the first ten games of sale_v2 and create the extras CSV.
# Mode "w" truncates any existing file, and a fresh header row is written.
ua = UserAgent()
mask = SoupStrainer(["div", "span"])        # only parse the tags inspected below
sale_dates = ["2019-06-26", "2019-06-25"]   # date strings looked for in each Steam entry

#start csv writer
with open("../data/summer_sale_19_extras.csv", "w", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["appid", "name", "discount"])

    #loop through game ids and go to URL
    for i in tqdm_notebook(range(0, 10)):

        # random pause between requests (0 is three times as likely as each other value)
        time.sleep(np.random.choice([0,0,0,1,2,3,4]))

        url = "https://isthereanydeal.com/game/{}/history/".format(sale_v2["name_clean"][i])
        # timeout so a stalled connection raises instead of blocking the scrape forever
        r = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)

        if soup.find("div", attrs="widget__nodata"):
            #if no page found, record nan
            discount = np.nan
        else:
            #if page found, record the discount (or 0 if none)
            discount = 0
            for item in soup.find_all("div", attrs={"class": "lg2 game",
                                                    "data-shop": "steam"}):
                if any(day in item.span.text for day in sale_dates):
                    # NOTE(review): assumes the second "lg2__cut" span holds the
                    # discount text - confirm against the page markup
                    discount = item.find_all("span", "lg2__cut")[1].text
                    break

        #add info to csv (one row per game, in sale_v2 order)
        writer.writerow([sale_v2["appid"][i], sale_v2["name"][i], discount])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [49]:
# Subsequent appends: continue the retry scrape from wherever the extras CSV stops.
ua = UserAgent()
mask = SoupStrainer(["div", "span"])        # only parse the tags inspected below
sale_dates = ["2019-06-26", "2019-06-25"]   # date strings looked for in each Steam entry
output_path = "../data/summer_sale_19_extras.csv"

# Resume point = number of data rows already saved. Rows are written in
# sale_v2 order, so this is also the position of the next game to fetch.
start = len(pd.read_csv(output_path))

#start csv writer
with open(output_path, "a", newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)

    #loop through game ids and go to URL
    for i in tqdm_notebook(range(start, len(sale_v2))):

        # random pause between requests (0 is three times as likely as each other value)
        time.sleep(np.random.choice([0,0,0,1,2,3,4]))

        url = "https://isthereanydeal.com/game/{}/history/".format(sale_v2["name_clean"][i])
        # timeout so a stalled connection raises instead of blocking the scrape forever
        r = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser', parse_only=mask)

        if soup.find("div", attrs="widget__nodata"):
            #if no page found, record nan
            discount = np.nan
        else:
            #if page found, record the discount (or 0 if none)
            discount = 0
            # "data-shop" was mistyped as "data-rokshop" here, unlike the other
            # scraper cells, so the Steam filter could not match any entry
            for item in soup.find_all("div", attrs={"class": "lg2 game",
                                                    "data-shop": "steam"}):
                if any(day in item.span.text for day in sale_dates):
                    # NOTE(review): assumes the second "lg2__cut" span holds the
                    # discount text - confirm against the page markup
                    discount = item.find_all("span", "lg2__cut")[1].text
                    break

        #add info to csv
        writer.writerow([sale_v2["appid"][i], sale_v2["name"][i], discount])

HBox(children=(IntProgress(value=0, max=3393), HTML(value='')))

# Combining both scrapes

In [5]:
# Load the results of the second (retry) scrape
sale_extras = pd.read_csv("../data/summer_sale_19_extras.csv")

In [11]:
# Rows of the first scrape that still have no discount value
sale.loc[sale["discount"].isna()].head()

Unnamed: 0,appid,name,discount
29,1630,Disciples II: Rise of the Elves,
36,2100,Dark Messiah of Might & Magic,
45,2330,QUAKE II Mission Pack: The Reckoning,
48,2360,HeXen: Beyond Heretic,
49,2370,HeXen: Deathkings of the Dark Citadel,


In [9]:
# First rows of the retry scrape, for comparison with the missing rows of `sale`
sale_extras.head()

Unnamed: 0,appid,name,discount
0,1630,Disciples II: Rise of the Elves,80%
1,2100,Dark Messiah of Might & Magic,0
2,2330,QUAKE II Mission Pack: The Reckoning,33%
3,2360,HeXen: Beyond Heretic,
4,2370,HeXen: Deathkings of the Dark Citadel,50%


In [35]:
# Merge the two scrapes: attach the retry result to every row of the first scrape.
# The retry column gets a "_retry" suffix; the original "discount" keeps its name.
sale_merged = sale.merge(
    sale_extras[["appid", "discount"]],
    on="appid",
    how="left",
    suffixes=("", "_retry"),
)

# Fill the initial NaNs with the retry data. Plain assignment is used instead of
# a chained fillna(..., inplace=True), which warns on pandas 2.x and does nothing
# under Copy-on-Write.
sale_merged["discount"] = sale_merged["discount"].fillna(sale_merged["discount_retry"])

# Drop the now-redundant helper column
sale_merged = sale_merged.drop(columns="discount_retry")

Updated scrape:

- 13135 - on sale
- 7917 - not on sale
- 1527 - page not found

In [48]:
# Save the combined results; index=False keeps the row index out of the CSV
sale_merged.to_csv("../data/summer_sale_19_v2.csv", index=False)