In [10]:
import os
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [11]:
# Location of the chromedriver executable handed to webdriver.Chrome below.
# Set the CHROMEDRIVER_PATH environment variable to run this notebook on
# another machine; the original local path is kept as the default.
CHROME_PATH = os.environ.get("CHROMEDRIVER_PATH", "/Users/ik/bin/chromedriver")

In [12]:
# Build the Chrome options with the 'eager' page load strategy, then start
# the web driver using the chromedriver executable at CHROME_PATH.
chrome_options = Options()
chrome_options.page_load_strategy = 'eager'
driver = webdriver.Chrome(executable_path=CHROME_PATH, options=chrome_options)

In [13]:
# Open the main locations index page in the browser.
driver.get("https://locations.blinkfitness.com/index.html")

## Crawl the main index site for a list of links to each state/region branch directory

In [4]:
# Collect the link to every state/region directory page from the main index.
driver.get('https://locations.blinkfitness.com/index.html')
links = WebDriverWait(driver, timeout=30).until(lambda d: d.find_elements_by_tag_name('a'))

location_urls = []  # will contain final urls, in page order, without duplicates
skip = ['index', 'search']

for link in links:
    url_string = link.get_attribute('href')

    # get_attribute returns None for anchors that have no href; skip those
    # instead of crashing on .startswith below
    if not url_string:
        continue

    # keep branch state/region sublinks only (drop the index and search pages)
    if not url_string.startswith('https://locations.blinkfitness.com/'):
        continue
    if any(x in url_string for x in skip):
        continue

    # the same link can appear more than once on the page; keep a single copy
    # so that no region directory is crawled twice
    if url_string not in location_urls:
        location_urls.append(url_string)

# will handle special case separately; guard the removal so the cell does not
# fail with ValueError when the link is not present on the index page
va_branch_url = 'https://locations.blinkfitness.com/va/virginia-beach/4239-holland-road'
if va_branch_url in location_urls:
    location_urls.remove(va_branch_url)

## Helper function: get status of each gym
Possible statuses, based on preliminary inspection:
* Open Now
* Closed (for the day)
* Coming Soon
* Temporarily Closed (due to covid)

All of these are contained within the `Teaser-info` div, but in different elements depending on whether the branch is in service (first two choices) or currently not in service (last two choices).

I'll first check for the element that should contain the status of an in-service branch (`Hours-statusText`); if it does not exist, I'll fall back to the element that states the branch is not in service (`Teaser-text`).

In [5]:
status_dict = {
    0: ['Temporarily Closed', 'Closed'], # branch not in service due to covid
    1: ['Coming Soon'], # branch not in service yet
    2: ['Open Now'], # branch in service and currently open <- target status
    3: ['Closed - Opens at'] # branch in service but currently closed
}

# function that finds status of individual directory listing
def find_status(branch):
    """Return the integer status code of a single directory listing.

    Parameters
    ----------
    branch : element exposing ``find_element_by_class_name``
        The listing to inspect. The status text is read from its
        ``Hours-statusText`` child, falling back to ``Teaser-text`` when
        that lookup fails.

    Returns
    -------
    int or None
        The key of ``status_dict`` whose labels match the status text, or
        ``None`` when no label matches.

    Raises
    ------
    Exception
        Whatever the fallback ``Teaser-text`` lookup raises when neither
        element can be found.
    """
    # if branch is open, status is contained in Hours-statusText element
    try:
        status = branch.find_element_by_class_name('Hours-statusText').text
    # else look for Teaser-text; catch Exception rather than using a bare
    # except so KeyboardInterrupt/SystemExit are not swallowed
    except Exception:
        status = branch.find_element_by_class_name('Teaser-text').text
    status = status.strip()

    # exact label match first: identical to the original lookup
    for k, v in status_dict.items():
        if status in v:
            return k

    # Fallback: labels such as 'Closed - Opens at' are only the beginning of
    # the displayed text (an opening time follows), so an exact comparison can
    # never match them. Pick the longest label the text starts with, so that
    # 'Closed - Opens at ...' resolves to 3 rather than to the shorter 'Closed'.
    prefix_hits = [
        (len(label), k)
        for k, v in status_dict.items()
        for label in v
        if status.startswith(label)
    ]
    if prefix_hits:
        return max(prefix_hits, key=lambda hit: hit[0])[1]
    return None

## Parse individual branches from each region directory page

In [6]:
location_info = []

for region_url in location_urls:
    driver.get(region_url)

    # wait (up to 30s) until the region page exposes its city containers
    city_containers = WebDriverWait(driver, timeout=30).until(
        lambda d: d.find_elements_by_class_name('Directory-cityContainer')
    )

    for container in city_containers:
        city_name = container.find_element_by_class_name('Directory-cityName').text

        # every listing teaser inside the container is one branch of that city
        for teaser in container.find_elements_by_class_name('Directory-listTeaser'):
            # one record per branch: url, city, title, address, phone and status code
            location_info.append({
                'branch_url': teaser.find_element_by_class_name('Teaser-titleLink').get_attribute('href'),
                'branch_city': city_name,
                'branch_title': teaser.find_element_by_class_name('Teaser-title').text,
                'branch_address': teaser.find_element_by_class_name('Teaser-address').text,
                'branch_phone': teaser.find_element_by_class_name('Teaser-phone').text,
                'branch_status': find_status(teaser),
            })

## Handle parsing of special case: Virginia Beach branch
Virginia only has one location right now, so it doesn't have a separate directory page like the other states do.

In [7]:
# Build the record for the single Virginia branch, which has no region
# directory page and is therefore parsed straight from its own branch page.
temp = {}
va_url = 'https://locations.blinkfitness.com/va/virginia-beach/4239-holland-road'

driver.get(va_url)
va_info = WebDriverWait(driver, timeout=30).until(lambda d: d.find_element_by_class_name('Core'))

# Use the same 'branch_*' keys as the records built from the region directory
# pages, so every entry of location_info shares one schema.
temp['branch_url'] = va_url
temp['branch_state'] = 'Virginia'
temp['branch_city'] = 'Virginia Beach'
temp['branch_title'] = va_info.find_element_by_class_name('LocationName-geo').text
temp['branch_phone'] = va_info.find_element_by_id('phone-main').text
temp['branch_address'] = va_info.find_element_by_class_name('c-address').text

# NOTE(review): find_status was written for directory listings; whether this
# branch page exposes the same status elements is unverified, so fall back to
# None (the value find_status itself uses for an unknown status).
try:
    temp['branch_status'] = find_status(va_info)
except Exception:
    temp['branch_status'] = None

# Re-running this cell must not append the same branch a second time.
if all(record.get('branch_url') != va_url for record in location_info):
    location_info.append(temp)

In [8]:
# Pretty-print every collected branch record (pprint is imported in the
# notebook's top import cell).
for record in location_info:
    pprint(record)

{'address': '2251 West Ball Road',
 'city': 'Anaheim',
 'phone': '(714) 790-3555',
 'status': 0,
 'title': 'Anaheim',
 'url': 'https://locations.blinkfitness.com/ca/anaheim/2251-west-ball-road'}
{'address': '16121 Bellflower Blvd.',
 'city': 'Bellflower',
 'phone': '(562) 725-7133',
 'status': 0,
 'title': 'Bellflower',
 'url': 'https://locations.blinkfitness.com/ca/bellflower/16121-bellflower-blvd.'}
{'address': '1060 West Alameda Avenue',
 'city': 'Burbank',
 'phone': '(818) 686-5930',
 'status': 0,
 'title': 'Burbank',
 'url': 'https://locations.blinkfitness.com/ca/burbank/1060-west-alameda-avenue'}
{'address': '15519 Normandie Avenue',
 'city': 'Gardena',
 'phone': '(424) 292-8150',
 'status': 0,
 'title': 'Gardena',
 'url': 'https://locations.blinkfitness.com/ca/gardena/15519-normandie-avenue'}
{'address': '6714 Pacific Boulevard',
 'city': 'Huntington Park',
 'phone': '(323) 538-8705',
 'status': 0,
 'title': 'Huntington Park',
 'url': 'https://locations.blinkfitness.com/ca/hunti