# Setup

Import the following packages. To install all of them, run `pip install -r requirements.txt`.

In [1]:
# Import data processing libraries
import pandas as pd
import csv
import re
import requests
from bs4 import BeautifulSoup

# Environment Variables
import os
from dotenv import load_dotenv

# Scraping for Max Page Number

The goal of this section is to build a dictionary that maps every state to the highest page number shown on its search page in the *Find Locations* section of the Jersey Mike's website. This gives a rough estimate of how many stores are in each state, and therefore how many addresses need to be scraped later on.

Base URL: https://www.jerseymikes.com/locations/{state_abbrev}?page={page_num}

In [2]:
# Base url for finding store by state
base_url = 'https://www.jerseymikes.com/locations/'
page_url = '?page='

# Maps each state abbreviation to the highest page number found in the
# pagination bar of that state's first results page
states_counts = {}
states_abriev = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
                 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
                 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
                 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
                 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# Scrape the first results page of every state and read its pagination links
for state in states_abriev:
    # Always request page 1; the highest page is read from its pagination bar
    state_url = base_url + state + page_url + '1'

    # Create Beautiful Soup object to parse html content.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    document_html = requests.get(state_url, timeout=30).text
    soup = BeautifulSoup(document_html, 'html.parser')

    # Parse page links
    pagination = soup.find('ul', class_='pagination')

    # No additional pages mean max page of 1
    max_page = 1
    if pagination:
        link_texts = [link.text.strip() for link in pagination.find_all('a')]
        # isdecimal() only accepts text that int() can parse; non-numeric
        # links are simply ignored
        page_numbers = [int(text) for text in link_texts if text.isdecimal()]
        # Seeding with 1 keeps max() safe when the bar has no numeric links
        # and guarantees the page count never drops below 1
        max_page = max([1] + page_numbers)

    states_counts[state] = max_page

states_counts

{'AL': 3,
 'AK': 1,
 'AZ': 7,
 'AR': 1,
 'CA': 29,
 'CO': 5,
 'CT': 3,
 'DE': 1,
 'DC': 1,
 'FL': 18,
 'GA': 9,
 'HI': 1,
 'ID': 1,
 'IL': 8,
 'IN': 2,
 'IA': 2,
 'KS': 1,
 'KY': 2,
 'LA': 1,
 'ME': 1,
 'MD': 4,
 'MA': 4,
 'MI': 6,
 'MN': 4,
 'MS': 1,
 'MO': 2,
 'MT': 1,
 'NE': 1,
 'NV': 2,
 'NH': 1,
 'NJ': 11,
 'NM': 1,
 'NY': 7,
 'NC': 16,
 'ND': 1,
 'OH': 8,
 'OK': 2,
 'OR': 3,
 'PA': 6,
 'RI': 1,
 'SC': 7,
 'SD': 1,
 'TN': 5,
 'TX': 17,
 'UT': 3,
 'VT': 1,
 'VA': 8,
 'WA': 5,
 'WV': 1,
 'WI': 3,
 'WY': 1}

# Scraping for Addresses by State

The goal of this section is to obtain the address of every store listed on the Jersey Mike's state location pages we mapped in the previous section.

Base URL: https://www.jerseymikes.com/locations/{state_abbrev}

In [3]:
# Scrape addresses for each state, search through all pages for each state.
# `locations` collects the raw <p itemprop="address"> tags; they are turned
# into plain strings in the next cell.
locations = []
base_url = 'https://www.jerseymikes.com/locations/'
page_url = '?page='

for state, max_page in states_counts.items():
    for page_num in range(1, max_page + 1):
        # Build the URL from scratch for every page. Appending the page number
        # to a shared string would produce ?page=1, ?page=12, ?page=123, ...
        state_url = f'{base_url}{state}{page_url}{page_num}'

        # The timeout keeps a stalled connection from hanging the notebook
        document_html = requests.get(state_url, timeout=30).text
        soup = BeautifulSoup(document_html, 'html.parser')

        # Keep every address tag found on this page
        locations += soup.find_all('p', itemprop='address')

In [4]:
# Parse addresses: turn every scraped address tag into a single-line,
# comma-separated string that ends in ',US'.
# NOTE: this cell replaces `locations` (tags) with a set of strings, so the
# scraping cell has to be re-run before this cell can be run again.
parsed_locations = []
for location in locations:
    address = location.text.strip()

    # Keep only the text before the first '('. partition() returns the whole
    # string when there is no '(', whereas slicing with find()'s -1 would
    # silently drop the last character.
    address = address.partition('(')[0]
    address = address.replace('\n', ',').strip()

    # Nothing left to geocode for this tag
    if not address:
        continue

    # Make sure the country suffix is always separated by exactly one comma
    if not address.endswith(','):
        address += ','
    parsed_locations.append(address + 'US')

# De-duplicate addresses that were scraped more than once
locations = set(parsed_locations)
locations

# Obtaining Geocode from Google Maps API

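Use the Google Maps Platform to obtain the latitude and longitude of each address. You can obtain an API key at https://developers.google.com/maps/documentation/javascript/get-api-key. The key is read from the `GOOGLE_MAPS_API_KEY` environment variable, which can be defined in a local `.env` file.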

In [5]:
# Load API key: load_dotenv() reads a local .env file (if present) into the
# environment, then the key is looked up by name
load_dotenv()
api_key = os.getenv('GOOGLE_MAPS_API_KEY')

# Fail fast when the key is missing. Otherwise the literal text "None" would
# be sent as the key with every geocoding request.
if not api_key:
    raise RuntimeError(
        'GOOGLE_MAPS_API_KEY is not set; define it in the environment or in a .env file'
    )

# Create empty dataframe
df = pd.DataFrame(columns=['Address', 'Latitude', 'Longitude'])

In [6]:
# Loop through all locations, get geocode, if invalid then skip.
# Rows are collected in a list and the dataframe is built once at the end, so
# re-running this cell rebuilds `df` instead of appending duplicate rows.
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
geocoded_rows = []

for location in locations:
    # Pass the query through `params` so requests URL-encodes it. Characters
    # such as '#' or '&' in an address would otherwise cut the query short.
    try:
        r = requests.get(
            geocode_url,
            params={'address': location, 'key': api_key},
            timeout=30,  # never let one stalled request hang the whole loop
        )
    except requests.RequestException:
        # Network failure or timeout: skip this address like any invalid one
        continue

    # Invalid address: anything outside the full 2xx range
    if not 200 <= r.status_code < 300:
        continue

    # Get geocode data; a malformed body or an empty result list means the
    # address could not be geocoded, so it is skipped
    try:
        coordinates = r.json()['results'][0]['geometry']['location']
        latitude = coordinates['lat']
        longitude = coordinates['lng']
    except (ValueError, LookupError, TypeError):
        continue

    geocoded_rows.append([location, latitude, longitude])

# Build the dataframe in one step
df = pd.DataFrame(geocoded_rows, columns=['Address', 'Latitude', 'Longitude'])

df.head()

Unnamed: 0,Address,Latitude,Longitude
0,"Imperial Promenade,5675 E La Palma Avenue,Suit...",33.86097,-117.791142
1,"4509 Phoenix Avenue,Fort Smith, AR 72903-6005,US",35.338555,-94.383186
2,"3821 Lakewood Boulevard ,Ste. 101,Long Beach, ...",33.828598,-118.143035
3,"Station Park West,1060 West Park Lane,Suite 11...",40.983426,-111.907794
4,"6095 Carlson Way,Suite B,Marion, IA 52302-6651,US",42.036505,-91.547915


# Export Dataframe to .csv File

Export the store locations dataframe to a .csv file.

In [11]:
# Write the geocoded store locations to disk. The output folder is created
# first so the export cannot fail just because ../data does not exist yet.
output_path = '../data/jersey_mikes_locations.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)