In [1]:
import os
import time
from datetime import datetime

from flask import Flask, render_template, jsonify
from flask_sqlalchemy import SQLAlchemy
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [16]:
# Exploratory cell: opens a Chrome window and loads the Blink location directory page.
# NOTE(review): this driver is never quit in the code visible here, and the chromedriver
# path is machine-specific. BlinkParser.parse() below creates its own driver, so confirm
# whether this cell is still needed.
driver = webdriver.Chrome("/Users/ik/bin/chromedriver")
driver.get('https://locations.blinkfitness.com/index.html')

In [2]:
class BlinkParser():
    """Selenium crawler for the Blink Fitness location directory.

    ``parse()`` collects branch metadata into ``branch_info``;
    ``parse_capacity()`` then visits every collected branch page and returns
    one capacity reading per branch.
    """

    # chromedriver location; can be overridden with the CHROMEDRIVER_PATH environment variable
    CHROME_PATH = os.environ.get("CHROMEDRIVER_PATH", "/Users/ik/bin/chromedriver")
    DIRECTORY_URL = 'https://locations.blinkfitness.com/index.html'

    # number of times parse() re-reads the directory links (one second apart) before giving up
    DIRECTORY_PARSE_ATTEMPTS = 10

    # status code -> status texts that encode to that code (see encode_gym_status)
    branch_status_dict = {
        0: ['Open Now - Closes at'],
        1: ['Opening Soon'],
        2: ['Temporarily Closed', 'Closed'],
        3: ['Closed - Opens at']}

    def __init__(self):
        self.driver = None
        # URLs of the per-state directory pages found by parse()
        self.branch_directory_urls = []
        # one dict per branch: state, city, street, title, phone, url
        self.branch_info = []

    def load_chromedriver(self, path):
        """Create a Chrome driver from the chromedriver at ``path`` using the 'eager' page load strategy."""
        chrome_options = Options()
        chrome_options.page_load_strategy = 'eager'
        return webdriver.Chrome(path, options=chrome_options)

    def parse(self):
        """Crawl the gym directory and populate ``branch_directory_urls`` and ``branch_info``.

        Raises:
            RuntimeError: if no state directory link could be extracted after
                ``DIRECTORY_PARSE_ATTEMPTS`` attempts.
        """
        self.driver = self.load_chromedriver(BlinkParser.CHROME_PATH)

        try:
            self.driver.get(BlinkParser.DIRECTORY_URL)
            wait = WebDriverWait(self.driver, 10)

            # start from a clean slate so calling parse() again does not duplicate branches
            self.branch_directory_urls = []
            self.branch_info = []

            # extract links only to each state's homepage and store as instance attribute
            # VA beach has non-standard homepage; ignoring for now
            #
            # The links are re-read on every attempt: sleeping without re-reading
            # could never change the result and would loop forever.
            for _ in range(self.DIRECTORY_PARSE_ATTEMPTS):
                branch_links = wait.until(lambda d: d.find_elements_by_tag_name('a'))
                self.branch_directory_urls = self.find_hrefs_with_conditions(
                    list_a_tags=branch_links,
                    url_starts_with='https://locations.blinkfitness.com/',
                    url_does_not_include=['index', 'search', 'virginia-beach'])

                if self.branch_directory_urls:
                    break
                time.sleep(1)
            else:
                raise RuntimeError(
                    f"No state directory links found at {BlinkParser.DIRECTORY_URL} "
                    f"after {self.DIRECTORY_PARSE_ATTEMPTS} attempts")

            self.parse_branch_info()
        finally:
            # always release the browser, even when the crawl fails part-way
            self.driver.quit()

    def find_hrefs_with_conditions(self, list_a_tags, url_starts_with, url_does_not_include):
        """Filter anchor elements down to the hrefs that satisfy the given conditions.

        Args:
            list_a_tags: iterable of web elements ('a' tags) exposing ``get_attribute('href')``.
            url_starts_with: prefix (str) every returned URL must start with.
            url_does_not_include: substrings (list of str) none of which may occur in a returned URL.

        Returns:
            List of matching URLs (str), in the order the elements were given.
        """
        list_urls = []

        for link in list_a_tags:
            url_string = link.get_attribute('href')

            # anchors without an href have no URL to test; skip them instead of failing on None
            if not url_string:
                continue

            if url_string.startswith(url_starts_with):
                if not any(x in url_string for x in url_does_not_include):
                    list_urls.append(url_string)
        return list_urls

    def parse_branch_info(self):
        """Visit every state directory page and append one metadata dict per branch to ``branch_info``.

        Each dict has the keys state, city, street, title, phone and url. The state
        is taken from the last two characters of the directory URL, upper-cased.
        Requires ``self.driver`` to be a live driver (``parse()`` sets it up).
        """
        for url in self.branch_directory_urls:
            self.driver.get(url)

            # parse elements containing individual branch information
            wait = WebDriverWait(self.driver, 10)
            cities = wait.until(lambda d: d.find_elements_by_class_name('Directory-cityContainer'))

            for city in cities:
                branches = city.find_elements_by_class_name('Directory-listTeaser')

                for branch in branches:
                    temp_branch = {
                        'state': url[-2:].upper(),
                        'city': city.find_element_by_class_name('Directory-cityName').text,
                        'street': branch.find_element_by_class_name('Teaser-address').text,
                        'title': branch.find_element_by_class_name('Teaser-title').text,
                        'phone': branch.find_element_by_class_name('Teaser-phone').text,
                        'url': branch.find_element_by_class_name('Teaser-titleLink').get_attribute('href')}

                    self.branch_info.append(temp_branch)

    def get_urls(self):
        """Return the homepage URL of every branch stored in ``branch_info``."""
        return [branch['url'] for branch in self.branch_info]

    @staticmethod
    def encode_gym_status(status):
        """Map a status text to its code in ``branch_status_dict``; return None when no text matches exactly."""
        for status_code, status_text_list in BlinkParser.branch_status_dict.items():
            if status in status_text_list:
                return status_code
        return None

    def get_status_code(self, url):
        """Load a gym homepage and return its current status as an encoded value.

        See ``branch_status_dict`` for the mapping. Returns None when the status
        text found on the page is not one of the known texts.
        """
        self.driver.get(url)
        wait = WebDriverWait(self.driver, 3)

        try:
            status = wait.until(lambda d: d.find_element_by_class_name('Hours-statusText')).text
        except TimeoutException:
            # for branches that have no current status (usually branches that have not been opened yet)
            status = wait.until(lambda d: d.find_element_by_class_name('Core-openingDate')).text

        return BlinkParser.encode_gym_status(status)

    def parse_capacity(self):
        """Visit every branch homepage and read its current capacity (if available).

        Returns:
            List of dicts, one per branch, with the keys title, timestamp,
            status_code and capacity (capacity is None when it was not read).
        """
        # load new driver in case connection refused from too many requests from initial metadata parse
        self.driver = self.load_chromedriver(BlinkParser.CHROME_PATH)

        capacities = []

        try:
            for url in self.get_urls():
                status_code = self.get_status_code(url)

                # TODO: parsing status_code may be unnecessary; initial thought process was to take advantage of lazy evaluation (parse only open branches)
                # find_elements used with walrus operator to avoid error for certain webpages where Core-capacityStatus shows up for unopened branches (it shouldn't)
                # NOTE(review): `not status_code` is true for 0 (open) and also for None
                # (unrecognised status text), so both cases attempt a capacity read.
                if not status_code and len(cap_element := self.driver.find_elements_by_class_name('Core-capacityStatus')) > 0:
                    capacity = cap_element[0].text
                else:
                    capacity = None

                capacities.append({
                    'title': self.driver.find_element_by_class_name('LocationName-geo').text,
                    'timestamp': datetime.now(),
                    'status_code': status_code,
                    'capacity': capacity
                })
        finally:
            # always release the browser, even when a page fails to parse
            self.driver.quit()

        return capacities

In [8]:
# Flask application object used by the routes defined in this cell
app = Flask(__name__)

# SECRET_KEY is not configured
app.config.update(
    ENV='development',
    SQLALCHEMY_DATABASE_URI='sqlite:///blink.db',
    SQLALCHEMY_TRACK_MODIFICATIONS=False,
)

# SQLAlchemy handle bound to the app; the models below derive from db.Model
db = SQLAlchemy(app)

class Branch(db.Model):
    """ORM model for one gym branch: title, phone, homepage url and a single linked Address."""

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(30), unique=True, nullable=False)
    phone = db.Column(db.String(12), unique=True, nullable=False)
    url = db.Column(db.String(100), nullable=False)
    # uselist=False: `address` is a single Address object, not a list;
    # the backref gives every Address a `.branch` attribute
    address = db.relationship('Address', backref='branch', lazy=True, uselist=False)

    def __repr__(self):
        return "Branch: " + self.title.title()
    
class Address(db.Model):
    """ORM model for the postal address (state, city, street) belonging to one Branch."""

    id = db.Column(db.Integer, primary_key=True)
    branch_id = db.Column(db.Integer, db.ForeignKey('branch.id'), nullable=False)
    state = db.Column(db.String(2), nullable=False) # TODO: make this limited to state abbreviations
    city = db.Column(db.String(30), nullable=False)
    street = db.Column(db.String(100), unique=True, nullable=False)

    def __repr__(self):
        return "Address: {}, {}, {}".format(self.street, self.city, self.state)
    
class Capacity(db.Model):
    """ORM model for one capacity reading taken for a branch."""
    id = db.Column(db.Integer,primary_key=True)
    # branch the reading belongs to
    branch_id = db.Column(db.Integer, db.ForeignKey('branch.id'), nullable=False)
    # capacity text as stored by capacity(); nullable
    capacity = db.Column(db.String(100))
    # encoded branch status; the mapping lives in BlinkParser.branch_status_dict
    status_code = db.Column(db.Integer)
    # when the reading was taken
    timestamp = db.Column(db.DateTime)

def get_branch_data():
    """Return every branch joined with its address as a list of plain dicts.

    Each dict has the keys title, phone, state, region (the address city),
    address (the street) and url, ready for the jinja templates or jsonify.
    """
    rows = db.session.query(Branch, Address).join(Address).all()

    return [
        {
            'title': row.Branch.title,
            'phone': row.Branch.phone,
            'state': row.Address.state,
            'region': row.Address.city,
            'address': row.Address.street,
            'url': row.Branch.url
        }
        for row in rows
    ]

def capacity_to_percent(capacity_str):
    """Convert a capacity description parsed from a branch homepage to a percentage.

    Args:
        capacity_str: capacity text such as "ABOUT 50% FULL"; may be "" or None.

    Returns:
        25, 50, 75 or 100 for the known descriptions; None for "",
        "CAPACITY DATA UNAVAILABLE", None, or any text that is not recognised.
    """
    status_to_value = {
        "LESS THAN 25% FULL": 25,
        "ABOUT 50% FULL": 50,
        "ABOUT 75% FULL": 75,
        "AT CAPACITY": 100,
    }

    # tolerate stray whitespace / different casing in the scraped text
    if isinstance(capacity_str, str):
        capacity_str = capacity_str.strip().upper()

    # .get() keeps a single unexpected label from raising KeyError for the whole data set
    return status_to_value.get(capacity_str)

def get_capacity_data():
    """Group all non-NULL capacity readings by branch.

    Returns a dictionary of the form:
    {
      "<state> - <branch title 1>": [(timestamp_1, capacity_1), ... , (timestamp_n, capacity_n)],
      ...
      "<state> - <branch title n>": [(timestamp_1, capacity_1), ... , (timestamp_n, capacity_n)]
    }
    where each capacity has been converted with capacity_to_percent.
    """
    rows = db.session.query(Capacity, Branch).join(Branch).filter(Capacity.capacity != None).all()
    readings = {}

    for row in rows:
        label = f"{row.Branch.address.state} - {row.Branch.title}"
        point = (row.Capacity.timestamp, capacity_to_percent(row.Capacity.capacity))

        # creates the list on first sight of a branch, then appends to it
        readings.setdefault(label, []).append(point)

    return readings

@app.route('/')
def home():
    """Render the landing page, passing it the full list of branch dicts."""
    branch_rows = get_branch_data()
    return render_template("home.html", branches=branch_rows)

@app.route('/view_branches')
def view_branches():
    """Render the table of all gyms with their metadata (address, phone, url)."""
    branch_rows = get_branch_data()
    return render_template("table.html", branches=branch_rows)

@app.route('/view_capacities')
def view_capacities():
    """Render the table of parsed capacity readings per branch."""
    # time is currently hardcoded for sample presentation (will replace with timestamps once parsing is automated)
    time_headers = [
        "3:30 PM", "3:45 PM",
        "4:00 PM", "4:15 PM", "4:30 PM", "4:45 PM",
        "5:00 PM", "5:15 PM", "5:30 PM", "5:45 PM",
    ]
    readings_by_branch = get_capacity_data()
    return render_template("capacity.html", data=readings_by_branch, time_headers=time_headers)

@app.route('/api_branches', methods=['GET'])
def get_branches():
    """JSON endpoint returning the list of branch dicts from get_branch_data."""
    payload = get_branch_data()
    return jsonify(payload)

@app.route('/api_capacities', methods=['GET'])
def get_capacities():
    """JSON endpoint returning the per-branch capacity readings from get_capacity_data."""
    payload = get_capacity_data()
    return jsonify(payload)

# if __name__ == "__main__":
#     app.run(debug=True)

### Helper functions: database setup, commits, and data integration

In [9]:
# Clear tables if starting from scratch
def refresh_tables():
    """Drop every table known to `db` and recreate them empty.

    Destructive: all stored rows are lost.
    """
    db.drop_all()
    db.create_all()

# Adds and commits python objects mapped using ORM to SQLAlchemy database
def add_and_commit_to_db(db_object):
    """Add one ORM object to the session and commit it.

    Args:
        db_object: instance of a db.Model subclass to persist.

    Raises:
        Whatever the commit raises (for example on a constraint violation);
        the session is rolled back before the exception propagates.
    """
    db.session.add(db_object)
    try:
        db.session.commit()
    except Exception:
        # a failed commit leaves the shared session unusable until it is rolled back;
        # roll back here so later cells can keep using the session, then re-raise
        db.session.rollback()
        raise
    
# store branch info dictionaries into db
# parser.branch_info should have been populated prior using parser.parse()
def main(parser):
    """Persist every entry of parser.branch_info as a Branch row with its Address.

    Each branch is committed individually through add_and_commit_to_db.
    """
    for info in parser.branch_info:
        # Address data stored using separate db model (tied to Branch) for abstraction
        address_row = Address(state=info['state'], city=info['city'], street=info['street'])

        branch_row = Branch(
            title=info['title'],
            address=address_row,
            phone=info['phone'],
            url=info['url'],
        )

        add_and_commit_to_db(branch_row)
        
# scrape capacity data from individual branch pages and store into database
def capacity(parser):
    """Scrape the current capacity of every branch and store each reading in the Capacity table.

    Args:
        parser: BlinkParser whose branch_info was populated by parser.parse().

    Raises:
        NameError: if a reading's title matches no row in the Branch table.
    """
    capacities = parser.parse_capacity()

    for cap in capacities:
        # look up the branch whose title matches the title parsed with the reading
        matching_branch = Branch.query.filter(Branch.title == cap['title']).first()

        # .first() returns None when nothing matches, so check the row itself
        # before reading its id (otherwise an AttributeError hides the real problem)
        if matching_branch is None or not matching_branch.id:
            raise NameError('No valid blink branch id for capacity reading')

        # store capacity data as SQLAlchemy object
        new_capacity = Capacity(
            branch_id = matching_branch.id,
            status_code = cap['status_code'],
            capacity = cap['capacity'],
            timestamp = cap['timestamp']
        )

        add_and_commit_to_db(new_capacity)


### Initialize BlinkParser

In [17]:
# Fresh parser with no driver and empty branch lists; the cells below call parse() / parse_capacity() on it
parser = BlinkParser()

### Initial parsing setup
Only run this cell if you want to drop and recreate all tables (deleting old branch metadata), parse fresh data, and store into Branch / Address tables.

In [18]:
# WARNING: destructive. Drops and recreates all tables before re-crawling the directory.
refresh_tables()
# crawl the directory; fills parser.branch_info
parser.parse()
# store the crawled branches (and their addresses) in the database
main(parser)

### Parse capacity data (goal is to automate this portion)
Crawls each individual branch homepage for capacity (n/a, 25%, 50%, 75%, 100%) and stores into Capacity table.

In [12]:
# Takes one capacity reading per branch in parser.branch_info and commits each to the Capacity table
capacity(parser)

### Print number of all capacity readings and all non-NA readings

In [54]:
# Load every capacity reading, and separately only those whose capacity is not NULL
all_readings = Capacity.query.all()
all_valid_readings = Capacity.query.filter(Capacity.capacity != None).all()

print('Number of capacity readings: ', len(all_readings))
# NOTE(review): the label below contains a typo ("Nnumber")
print('Nnumber of non-NA capacity readings: ', len(all_valid_readings))

Number of capacity readings:  1090
Nnumber of non-NA capacity readings:  960


### Query: join Capacity and Branch tables on branch_id filtering out readings with capacity n/a

In [55]:
# (Capacity, Branch) pairs for every reading whose capacity is not NULL; reused by the cells below
caps = db.session.query(Capacity, Branch).join(Branch).filter(Capacity.capacity != None).all()

In [56]:
# Collect every raw capacity string into `lis`, and print the readings whose
# capacity is an empty string (id, timestamp, branch title and state)
lis = []
for cap in caps:
    if cap.Capacity.capacity == '':
        print(f"{cap.Capacity.id}:  {cap.Capacity.timestamp} {cap.Branch.title} ({cap.Branch.address.state})")
    #print(f"{cap.Capacity.timestamp}  {cap.Branch.title} ({cap.Branch.address.state}): {cap_str[-8:-5] if (cap_str:=cap.Capacity.capacity) else None}")
    lis.append(cap.Capacity.capacity)
    

176:  2020-12-15 15:50:42.448339 Brentwood (NY)
185:  2020-12-15 15:50:51.884516 Murray Hill (NY)
235:  2020-12-15 16:07:37.971232 Beverly (MA)
245:  2020-12-15 16:07:47.244091 Linden (NJ)
308:  2020-12-15 16:08:51.487169 Corona (NY)
336:  2020-12-15 16:20:16.564254 Jacksonville (FL)
387:  2020-12-15 16:21:01.840026 Williamsburg (NY)
503:  2020-12-15 16:36:51.517334 Brentwood (NY)
504:  2020-12-15 16:36:52.996822 Baldwin (NY)
554:  2020-12-15 17:00:23.103382 Jacksonville (FL)
599:  2020-12-15 17:01:08.011839 Bed-Stuy (NY)
628:  2020-12-15 17:01:43.062310 Grand Central (NY)
663:  2020-12-15 17:16:20.975666 Jacksonville (FL)
708:  2020-12-15 17:17:02.301945 Bed-Stuy (NY)
721:  2020-12-15 17:17:13.328295 Brentwood (NY)
772:  2020-12-15 17:32:38.456926 Jacksonville (FL)
790:  2020-12-15 17:33:15.371778 Linden (NJ)
881:  2020-12-15 17:48:19.427133 Jacksonville (FL)


### Verifying number of branches parsed into db

In [26]:
# Count the (Branch, Address) pairs currently stored in the database
branches = db.session.query(Branch, Address).join(Address).all()
len(branches)

109

### Get all unique values parsed for capacity

In [57]:
# Distinct raw capacity strings; `lis` is filled by the earlier cell that loops over `caps`
print(set(lis))

{'', 'LESS THAN 25% FULL', 'ABOUT 50% FULL', 'CAPACITY DATA UNAVAILABLE', 'ABOUT 75% FULL', 'AT CAPACITY'}


### Function to convert capacities (parsed as strings) to rounded percentage values
Blink doesn't have detailed capacity data available on their website so I just used the number included in the description (see above for all types).


In [60]:
def capacity_to_percent(capacity_str):
    """Convert a capacity description parsed from a branch homepage to a percentage.

    Args:
        capacity_str: capacity text such as "ABOUT 50% FULL"; may be "" or None.

    Returns:
        25, 50, 75 or 100 for the known descriptions; None for "",
        "CAPACITY DATA UNAVAILABLE", None, or any text that is not recognised.
    """
    status_to_value = {
        "LESS THAN 25% FULL": 25,
        "ABOUT 50% FULL": 50,
        "ABOUT 75% FULL": 75,
        "AT CAPACITY": 100,
    }

    # tolerate stray whitespace / different casing in the scraped text
    if isinstance(capacity_str, str):
        capacity_str = capacity_str.strip().upper()

    # .get() keeps a single unexpected label from raising KeyError for the whole data set
    return status_to_value.get(capacity_str)

### Map list of capacity readings into dictionary of lists (key is branch title, value is list of capacity readings)

In [73]:
# Group the joined readings by branch title: {title: [(timestamp, percent), ...]}
readings = {}

for cap in caps:
    title = cap.Branch.title
    timestamp = cap.Capacity.timestamp
    # deliberately NOT named `capacity`: a top-level variable with that name would
    # overwrite the capacity(parser) function defined earlier in this notebook
    capacity_pct = capacity_to_percent(cap.Capacity.capacity)

    # creates the list on first sight of a title, then appends to it
    readings.setdefault(title, []).append((timestamp, capacity_pct))

In [74]:
# Print the timestamp of every joined reading.
# NOTE(review): this emits one line per row in `caps`; consider printing only a slice such as caps[:10].
for cap in caps:
    print(cap.Capacity.timestamp)

2020-12-15 15:31:35.014090
2020-12-15 15:31:35.542752
2020-12-15 15:31:36.127613
2020-12-15 15:31:36.875111
2020-12-15 15:31:38.624974
2020-12-15 15:31:40.181774
2020-12-15 15:31:40.818354
2020-12-15 15:31:41.561634
2020-12-15 15:31:42.293603
2020-12-15 15:31:43.045792
2020-12-15 15:31:43.636399
2020-12-15 15:31:44.205334
2020-12-15 15:31:44.803289
2020-12-15 15:31:45.450101
2020-12-15 15:31:46.405215
2020-12-15 15:31:47.004894
2020-12-15 15:31:47.656257
2020-12-15 15:31:48.187261
2020-12-15 15:31:48.950594
2020-12-15 15:31:50.836021
2020-12-15 15:31:51.761267
2020-12-15 15:31:52.635157
2020-12-15 15:31:53.161547
2020-12-15 15:31:54.079288
2020-12-15 15:31:54.752266
2020-12-15 15:31:55.683268
2020-12-15 15:31:57.037105
2020-12-15 15:31:58.303799
2020-12-15 15:31:58.961192
2020-12-15 15:31:59.595517
2020-12-15 15:32:00.277056
2020-12-15 15:32:00.799576
2020-12-15 15:32:02.052120
2020-12-15 15:32:02.785800
2020-12-15 15:32:03.516351
2020-12-15 15:32:04.236794
2020-12-15 15:32:06.358016
2