In [1]:
import pandas as pd

from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

import nltk

from datetime import datetime

# #new: use this to get updated chromeDriver to match newest chrome update
# #new: super useful youtube video: https://www.youtube.com/watch?v=mAwL_0N1W9E&ab_channel=JonathanSoma
from webdriver_manager.chrome import ChromeDriverManager

Record date of when website was accessed.

In [2]:
date = datetime.today().strftime('%Y.%m.%d')
print(date)

2023.06.29


Retrieve dynamic page via Selenium.

In [3]:
# set up driver via chromedriver

# #old: chromedriver was out dated, see new approach below
# PATH = 'C:\Program Files (x86)\chromedriver.exe'
# driver = webdriver.Chrome(PATH)

# #new: get latest chromedriver
driver = webdriver.Chrome(ChromeDriverManager().install())

# get patchnotes page
driver.get('https://hearthstone.blizzard.com/en-gb/news/patchnotes/')

# find "Load more" button
button_loadmore = driver.find_element(By.XPATH, '//*[@id="NewsHome"]/div/div[2]/div[3]/button')

# click/return button until it no longer exists (load all patchnotes links)
max_click = 1000
ct_click = 0
flag = 0
while flag == 0 and ct_click <= max_click:
    ct_click += 1
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="NewsHome"]/div/div[2]/div[3]/button'))
        )
        time.sleep(2)
    finally:
        try:
            button_loadmore.send_keys(Keys.RETURN)
        except:
            flag = 1

# find element containing patchnotes
notes = driver.find_element(By.ID, 'NewsHome')

# extract html from element
notes_html = notes.get_attribute('innerHTML')

# close driver
driver.close()

  driver = webdriver.Chrome(ChromeDriverManager().install())


Parse HTML content via BeautifulSoup

In [4]:
soup = bs(notes_html, 'html.parser')

# view contents -- uncomment below to view
print(soup.prettify())

<div class="NewsHomeApp__NewsListContainer-sc-173hfhu-0 fWrzqA">
 <div class="NewsHomeApp__ParallaxDragon-sc-173hfhu-6 agMgS" style="transform: translate3d(0px, -313.583px, 0px);">
  <img alt="" src="https://d2q63o9r0h0ohi.cloudfront.net/images/blog/parallax_dragon-e213f2b4b61006a0e6c31da4e805162baaa5af39ca25da8aed450cdbaeb4bab6bd6c63268d794f859fb409a4cee9119097f1a6b93c3ee57e6eb234bf46726b87.png"/>
 </div>
 <div class="ContentSection">
  <h3 class="NewsHomeApp__NewsHeader-sc-173hfhu-1 bbgbaf">
   Recent Articles
  </h3>
  <div class="FilterTabGroup__FilterGrouping-sc-1yoek8d-0 iPfMBf">
   <div class="FilterTabGroup__FilterContainer-sc-1yoek8d-1 dUkioI">
    <div class="FilterTab__Filter-sc-50ebwn-0 ekzEyv NewsHomeApp__LocalFilterTab-sc-173hfhu-5 gfnQeR">
     <button class="FilterTab__FilterButton-sc-50ebwn-1 cuACvU">
      All
     </button>
     <div class="FilterTab__SelectedIndicator-sc-50ebwn-2 haWGXd SelectedIndicator">
      <div class="FilterTab__SelectedIndicatorFill-sc-50ebwn

In [5]:
# extract urls
url_list = []
for element in soup.find_all('a'):
    url_list.append('https://hearthstone.blizzard.com'+element['href'])

For each patchnotes page, extract card update content.

In [6]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [7]:
card_update_id = {'card','standard','wild','balance'}
NON_update_id = {'battlegrounds','mercenaries','duels','arena','tavern','brawl'}
change_specifier_ids = {'following','cards'}
buff_id = {'up','increased','raised','buff','buffs','buffed'}
nerf_id = {'down','lowered','reduced','dropped','drop','decreased','nerf','nerfs','nerfed'}

In [8]:
df_changes = pd.DataFrame(columns=['patch_num','date_pub','card_name','change_text','change_type'])

Get page source from website.

In [9]:
def get_content(link):
    # load webpage
    r = requests.get(link)
    # convert to beautiful soup object
    soup = bs(r.content)
    return soup

Get metadata from patch notes.

In [10]:
def get_metadata(soup):
    # extract metadata
    date_pub = soup.body.find(class_='publish-date').text.strip()
    patch_num = soup.body.find('h2').text.strip()[:-12] # remove " Patch Notes" from string
    return patch_num, date_pub

Extract specific part of page source containing updates.

In [11]:
def extract_update_content(soup):
    # extract element containing patch note information
    blog = soup.body.find(class_ = 'detail blog-detail')
    # extract tags with card information
    updates = blog.find_all(['h4','h5','p','ul'],recursive=False)
    return updates

Check if patch notes contain any card updates and if so, remove any information stated before the card updates.

In [12]:
def check_card_update(updates):
    # CHECK for card updates in patch and REMOVE anything listed before the card updates
    card_update_exists = False # = True if card update exists
    idx = 0
    max_idx = len(updates)
    while not card_update_exists and idx < max_idx:
        element = updates[idx]
        if element.name in {'h4','h5'}:
            text_set = set(tokenizer.tokenize(element.text.strip().lower()))
            id_bool = text_set.intersection(card_update_id) and not text_set.intersection(NON_update_id)
            if id_bool and 'updates' in text_set:
                del updates[:idx]
                card_update_exists = True
        idx += 1
    return card_update_exists

Remove any information stated after the card updates.

In [13]:
def remove_other_updates(updates):
    # remove any content after "Card/Standard Balance Updates" section
    for idx, element in enumerate(updates):
        if element.name in {'h4','h5'}:
            text_set = set(tokenizer.tokenize(element.text.strip().lower()))
            id_bool = not text_set.intersection(card_update_id) or text_set.intersection(NON_update_id)
            if id_bool:
                del updates[idx:]

Filter out any other information (by tags or content) that are not to be recorded.

In [14]:
def filter_update_content(updates):
    # extract only needed info
    # loop in reverse to avoid reindexing elements before deleting them
    for idx, element in reversed(list(enumerate(updates))):
        if element.name == 'p':
            text_set = set(tokenizer.tokenize(element.text.strip().lower()))
            style_check = element.find_all(['span','img','em'])
            # check for incorrect styling then check for text content
            if element.text == '' or style_check:
                del updates[idx]
            elif not change_specifier_ids.issubset(text_set):
                del updates[idx]
        elif element.name == 'ul':
            style_check = element.find_all(['span','img','em'])
            # check for incorrect styling
            if style_check:
                del updates[idx]

Check if the updates are labeled as buffs or nerfs.

In [15]:
def check_change_indicator(updates):
    # find buff/nerf subsections, if exists
    buff_idx = None
    nerf_idx = None
    spec_unknown = 0 # check if buff/nerf specified, but not in buff_id/nerf_id
    for idx, element in enumerate(updates):
        if element.name == 'p':
            text_set = set(tokenizer.tokenize(element.text.strip().lower()))
            if change_specifier_ids.issubset(text_set):
                if text_set.intersection(buff_id):
                    buff_idx = idx
                elif text_set.intersection(nerf_id):
                    nerf_idx = idx
                else:
                    spec_unknown = 1
    return buff_idx, nerf_idx, spec_unknown

Separate buff changes and nerf changes.

In [16]:
def extract_buffnerf_sections(updates,buff_idx,nerf_idx):
    # extract buff/nerf subsections
    if buff_idx and nerf_idx:
        if buff_idx < nerf_idx:
            buff_changes = updates[buff_idx+1:nerf_idx]
            nerf_changes = updates[nerf_idx+1:]
        else:
            nerf_changes = updates[nerf_idx+1:buff_idx]
            buff_changes = updates[buff_idx+1:]
    elif buff_idx:
        buff_changes = updates[buff_idx+1:]
        nerf_changes = []
    else:
        nerf_changes = updates[nerf_idx+1:]
        buff_changes = []
    return buff_changes, nerf_changes

Record card changes in dictionary.

In [17]:
def record_changes(changes,spec,df_changes,patch_num,date_pub):
    # changes = list of card changes
    # spec = str indicating type of change
    for change in changes:
        # find card name
        sib = change.previous_sibling
        card_name = sib.text.strip()
        ct = 0
        max_ct = 100
        while card_name == '' and ct < max_ct:
            sib = sib.previous_sibling
            card_name = sib.text.strip()
            ct += 1
        # record info
        dict_temp = {'patch_num':[patch_num],'date_pub':[date_pub], 'card_name':[card_name],
                     'change_text':[change.text.strip()], 'change_type':[spec]}
        df_changes = pd.concat([df_changes,pd.DataFrame(dict_temp)])
    return df_changes

Put it all together!

In [18]:
for link in url_list:
    '''Get patch notes page.'''
    soup = get_content(link)
    
    '''Get metadata.'''
    patch_num, date_pub = get_metadata(soup)
    print('Patch Notes '+ patch_num)
    
    '''Get card update information.'''
    updates = extract_update_content(soup)
        
    # remove any content before "Card/Balance Updates" header
    card_update_exists = check_card_update(updates)

    # IF patch notes contain card updates, extract the changes
    if card_update_exists:
        # remove any content after "Card/Standard Balance Updates" section
        remove_other_updates(updates)

        # extract only needed info
        filter_update_content(updates)

        '''Filter buffs and nerfs.'''
        # find buff/nerf subsections
        buff_idx, nerf_idx, spec_unknown = check_change_indicator(updates)

        # extract buffs/nerfs
        # use if statement to check if buff/nerf is even specified--if not, label "unknown"
        if buff_idx or nerf_idx:
            # extract buff/nerf subsections
            buff_changes, nerf_changes = extract_buffnerf_sections(updates,buff_idx,nerf_idx)

            # record buffs
            df_changes = record_changes(buff_changes,'buff',df_changes,patch_num,date_pub)
            
            # record nerfs
            df_changes = record_changes(nerf_changes,'nerf',df_changes,patch_num,date_pub)

        else:
            changes = updates[1+spec_unknown:]
            df_changes = record_changes(changes,None,df_changes,patch_num,date_pub)

Patch Notes 26.6
Patch Notes 26.4.3
Patch Notes 26.4
Patch Notes 26.2.2
Patch Notes 26.2
Patch Notes 26.0.4
Patch Notes 26.0.2
Patch Notes 26.0
Patch Notes 25.6.2
Patch Notes 25.6
Patch Notes 25.4.3
Patch Notes 25.4
Patch Notes 25.2.2
Patch Notes 25.2
Patch Notes 25.0.4
Patch Notes 25.0.3
Patch Notes 25.0
Patch Notes 24.6.2
Patch Notes 24.6
Patch Notes 24.4.3
Patch Notes 24.4
Patch Notes 24.2.2
Patch Notes 24.2
Patch Notes 24.0.3
Patch Notes 24.0
Patch Notes 23.6
Patch Notes 23.4.3
Patch Notes 23.4
Patch Notes 23.2.2
Patch Notes 23.2
Patch Notes 23.0
Patch Notes 22.6
Patch Notes 22.4
Patch Notes 22.2.2
Patch Notes 22.2.1
Patch Notes 22.2
Patch Notes 22.0.2
Patch Notes 22.0
Patch Notes 21.8
Patch Notes 21.6
Patch Notes 21.4.4
Patch Notes 21.4
Patch Notes 21.3
Patch Notes 21.2
Patch Notes 21.0.3
Patch Notes 21.0
Patch Notes 20.8.2
Patch Notes 20.8
Patch Notes 20.4.2
Patch Notes 20.4
Patch Notes 20.2.2
Patch Notes 20.2
Patch Notes 20.0.2
Patch Notes 20.0
Patch Notes 19.6
Patch Notes 19.4.

In [19]:
df_changes

Unnamed: 0,patch_num,date_pub,card_name,change_text,change_type
0,26.4.3,06/15/2023,Card and Treasure Pool Updates,Kingsbane has been banned from deckbuilding an...,
0,26.0.4,04/27/2023,Rowdy Fan and Twig of the World Tree will be e...,The following cards have been adjusted to be m...,
0,26.0.4,04/27/2023,Rock Master Voone,Old: [4 Mana]\nNew: [3 Mana],
0,26.0.4,04/27/2023,Verse Riff,Old: [2 Mana]\nNew: [1 Mana],
0,26.0.4,04/27/2023,Bridge Riff,Old: [6 Mana]\nNew: [5 Mana],
...,...,...,...,...,...
0,17.6,07/14/2020,Dragoncaster,Old: [Cost 6] → New: [Cost 7].,
0,17.6,07/14/2020,Fungal Fortunes,Old: [Cost 2] → New: [Cost 3].,
0,17.6,07/14/2020,"Galakrond, the Nightmare",Old: Battlecry: Draw 1 card. It costs (0). → N...,
0,17.6,07/14/2020,"Galakrond, the Apocalypse (Rogue)",Old: Battlecry: Draw 2 cards. It costs (0). → ...,


In [20]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_changes)

  patch_num    date_pub                                          card_name  \
0    26.4.3  06/15/2023                     Card and Treasure Pool Updates   
0    26.0.4  04/27/2023  Rowdy Fan and Twig of the World Tree will be e...   
0    26.0.4  04/27/2023                                  Rock Master Voone   
0    26.0.4  04/27/2023                                         Verse Riff   
0    26.0.4  04/27/2023                                        Bridge Riff   
0    26.0.4  04/27/2023                                       Power Slider   
0    26.0.4  04/27/2023                                       Shield Block   
0    26.0.4  04/27/2023                                  Frightened Flunky   
0    26.0.4  04/27/2023                                       Thori’belore   
0    26.0.4  04/27/2023                            Infinitize the Maxitude   
0    26.0.4  04/27/2023                                      Audiosplitter   
0    26.0.4  04/27/2023                                    Spite

In [None]:
# df_changes.to_csv(f'card_data_changes_{date}.tsv',sep='\t', index=False) # tab sep since card text includes commas