## Math Counts

## Scrape Competition Locations

In [20]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
from geopy.geocoders import GoogleV3
import json
import time

In [3]:
# Get all competitions from the Math Counts website.
# Walks the paginated listing and records one row per competition:
# name, detail-page URL, state, date, type and city.

columns = ["Competition Name", "URL", "State", "Date", "Type", "City"]
data = []

for page in range(0, 23):
    # Kept under its own name so the per-competition URL built below does not overwrite it.
    pageUrl = "https://www.mathcounts.org/dates-locations-coordinators?field_competition_state_value=All&page=" + str(page)
    # Timeout matches the one used for the geocoder so a stalled connection cannot hang the cell.
    html = requests.get(pageUrl, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"class": "views-table"})
    tableBody = table.find("tbody") if table is not None else None
    if tableBody is None:
        # No results table on this page (error page, layout change, or past the last page):
        # report it and move on instead of failing with AttributeError.
        print("No competition table found on listing page " + str(page))
        continue

    for row in tableBody.find_all("tr"):
        rowData = row.find_all("td")
        link = rowData[0].find("a") if len(rowData) >= 5 else None
        if link is None or not link.get("href"):
            # Rows without the five expected cells or without a detail link cannot be
            # followed up later, so they are skipped.
            continue

        competitionName = rowData[0].getText().replace("\n", "")
        competitionUrl = "https://www.mathcounts.org" + link["href"]
        state = rowData[1].getText().strip()
        date = rowData[2].getText().strip()
        competitionType = rowData[3].getText().strip()
        city = rowData[4].getText().strip()

        data.append([competitionName, competitionUrl, state, date, competitionType, city])

df = pd.DataFrame(data, columns=columns)

In [4]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,Competition Name,URL,State,Date,Type,City
0,Alaska State Competition,https://www.mathcounts.org/alaska-state-compet...,AK,,State,Canceled
1,Anchorage Chapter Competition,https://www.mathcounts.org/anchorage-chapter-c...,AK,02/08/2020,Chapter,Anchorage
2,Fairbanks Chapter Competition,https://www.mathcounts.org/fairbanks-chapter-c...,AK,02/15/2020,Chapter,Fairbanks
3,Remote Chapter Competition,https://www.mathcounts.org/remote-chapter-comp...,AK,02/08/2020,Chapter,Anchorage
4,Southeast AK Chapter Competition,https://www.mathcounts.org/southeast-ak-chapte...,AK,02/08/2020,Chapter,Juneau


In [5]:
# Keep only chapter locations: drop every row whose Type is "State".
df = df.loc[df["Type"].ne("State")]
df.head()

Unnamed: 0,Competition Name,URL,State,Date,Type,City
1,Anchorage Chapter Competition,https://www.mathcounts.org/anchorage-chapter-c...,AK,02/08/2020,Chapter,Anchorage
2,Fairbanks Chapter Competition,https://www.mathcounts.org/fairbanks-chapter-c...,AK,02/15/2020,Chapter,Fairbanks
3,Remote Chapter Competition,https://www.mathcounts.org/remote-chapter-comp...,AK,02/08/2020,Chapter,Anchorage
4,Southeast AK Chapter Competition,https://www.mathcounts.org/southeast-ak-chapte...,AK,02/08/2020,Chapter,Juneau
6,Birmingham Chapter Competition,https://www.mathcounts.org/birmingham-chapter-...,AL,02/29/2020,Chapter,Vestavia Hills


In [6]:
# Get facility locations for each competition.
# Fetches every competition's detail page and reads the facility name, street
# address and ZIP code from the labelled fields inside the page's <fieldset>.

columns = ["Competition Name", "URL", "State", "Date", "Type", "City", "Facility", "Street Address", "ZIP"]
data = []

# Label text as shown on the detail page -> key in the per-row `details` dict.
labelToKey = {
    "Competition Facility": "facility",
    "Facility Street Address": "address",
    "Facility ZIP Code": "zipCode",
}

for _, row in df.iterrows():
    # Timeout matches the one used for the geocoder so a stalled connection cannot hang the cell.
    html = requests.get(row["URL"], timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    # Anything not found on the page stays None.
    details = {"facility": None, "address": None, "zipCode": None}

    # A page without a <fieldset> yields no fields, leaving every detail as None
    # instead of aborting the whole crawl with AttributeError.
    fieldset = soup.find("fieldset")
    fields = fieldset.find_all("div", {"class": "field"}) if fieldset is not None else []

    for field in fields:
        labelDiv = field.find("div", {"class": "field-label"})
        itemDiv = field.find("div", {"class": "field-item"})
        if labelDiv is None or itemDiv is None:
            # Field block without a label/value pair: nothing to extract.
            continue

        label = labelDiv.text.replace(":", "").strip()
        key = labelToKey.get(label)
        if key is not None:
            details[key] = itemDiv.text.strip()

    data.append(list(row) + [details["facility"], details["address"], details["zipCode"]])

In [7]:
# Turn the collected rows into a DataFrame using the current column list.
df = pd.DataFrame(columns=columns, data=data)
df.head()

Unnamed: 0,Competition Name,URL,State,Date,Type,City,Facility,Street Address,ZIP
0,Anchorage Chapter Competition,https://www.mathcounts.org/anchorage-chapter-c...,AK,02/08/2020,Chapter,Anchorage,University of Alaska Anchorage,,
1,Fairbanks Chapter Competition,https://www.mathcounts.org/fairbanks-chapter-c...,AK,02/15/2020,Chapter,Fairbanks,University of Alaska Fairbanks,,
2,Remote Chapter Competition,https://www.mathcounts.org/remote-chapter-comp...,AK,02/08/2020,Chapter,Anchorage,UAA,3211 Providence Drive,99503.0
3,Southeast AK Chapter Competition,https://www.mathcounts.org/southeast-ak-chapte...,AK,02/08/2020,Chapter,Juneau,University of Alaska Juneau,,
4,Birmingham Chapter Competition,https://www.mathcounts.org/birmingham-chapter-...,AL,02/29/2020,Chapter,Vestavia Hills,Liberty Park Middle School,17035 Liberty Parkway,35242.0


In [8]:
# Filter all competitions that do not have locations.
# A facility counts as missing when it is null or a "TBD" placeholder; the placeholder
# check ignores case and surrounding whitespace so variants like "Tbd" are dropped too.
df = df[df["Facility"].notnull() & (df["Facility"].str.strip().str.upper() != "TBD")]
df.head()

Unnamed: 0,Competition Name,URL,State,Date,Type,City,Facility,Street Address,ZIP
0,Anchorage Chapter Competition,https://www.mathcounts.org/anchorage-chapter-c...,AK,02/08/2020,Chapter,Anchorage,University of Alaska Anchorage,,
1,Fairbanks Chapter Competition,https://www.mathcounts.org/fairbanks-chapter-c...,AK,02/15/2020,Chapter,Fairbanks,University of Alaska Fairbanks,,
2,Remote Chapter Competition,https://www.mathcounts.org/remote-chapter-comp...,AK,02/08/2020,Chapter,Anchorage,UAA,3211 Providence Drive,99503.0
3,Southeast AK Chapter Competition,https://www.mathcounts.org/southeast-ak-chapte...,AK,02/08/2020,Chapter,Juneau,University of Alaska Juneau,,
4,Birmingham Chapter Competition,https://www.mathcounts.org/birmingham-chapter-...,AL,02/29/2020,Chapter,Vestavia Hills,Liberty Park Middle School,17035 Liberty Parkway,35242.0


In [22]:
# Geocode locations as coordinates.
# Reads the Google API key from credentials.json and builds the geocoder client.
API_TOKEN = None
with open('credentials.json') as f:
    data = json.load(f)
API_TOKEN = data["API_KEY"]
geolocator = GoogleV3(timeout=10, api_key=API_TOKEN)

In [24]:
# Geocode every competition: query by facility name (plus street address when there is
# one), restricted to the row's city and state, and append Lat/Lon to the row.
columns = ["Competition Name", "URL", "State", "Date", "Type", "City", "Facility", "Street Address", "ZIP", "Lat", "Lon"]
data = []

for _, row in df.iterrows():
    street = row["Street Address"]
    # A missing address can be None or NaN (e.g. after a CSV round trip); pd.isna covers
    # both, whereas `is None` lets NaN through and `" " + NaN` raises TypeError.
    query = row["Facility"] if pd.isna(street) else row["Facility"] + " " + street
    location = geolocator.geocode(query, components={"city": row["City"], "administrative_area": row["State"], "country": "US"})

    # No match from the geocoder: keep the row with empty coordinates.
    if location is None:
        lat, lon = None, None
    else:
        lat, lon = location.latitude, location.longitude
    data.append(list(row) + [lat, lon])

    time.sleep(.5) # Sleep to prevent API overload

In [30]:
# Turn the collected rows into a DataFrame using the current column list.
df = pd.DataFrame(columns=columns, data=data)
df.head()

Unnamed: 0,Competition Name,URL,State,Date,Type,City,Facility,Street Address,ZIP,Lat,Lon
0,Anchorage Chapter Competition,https://www.mathcounts.org/anchorage-chapter-c...,AK,02/08/2020,Chapter,Anchorage,University of Alaska Anchorage,,,61.191042,-149.81956
1,Fairbanks Chapter Competition,https://www.mathcounts.org/fairbanks-chapter-c...,AK,02/15/2020,Chapter,Fairbanks,University of Alaska Fairbanks,,,64.854665,-147.815676
2,Remote Chapter Competition,https://www.mathcounts.org/remote-chapter-comp...,AK,02/08/2020,Chapter,Anchorage,UAA,3211 Providence Drive,99503.0,61.191042,-149.81956
3,Southeast AK Chapter Competition,https://www.mathcounts.org/southeast-ak-chapte...,AK,02/08/2020,Chapter,Juneau,University of Alaska Juneau,,,58.385521,-134.640746
4,Birmingham Chapter Competition,https://www.mathcounts.org/birmingham-chapter-...,AL,02/29/2020,Chapter,Vestavia Hills,Liberty Park Middle School,17035 Liberty Parkway,35242.0,33.478673,-86.67307


In [32]:
# Write the current frame to competitions.csv in the working directory.
df.to_csv(path_or_buf="competitions.csv")

In [31]:
# Manually clean these points: rows with no latitude.
df.loc[df["Lat"].isna()].shape

(19, 11)