## Math Counts

## Scrape School Club Locations

In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
from geopy.geocoders import GoogleV3
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import re

In [30]:
# Get all school clubs from Math Counts website.
# The listing is paginated; every page is fetched and the first four cells of
# each table row are collected as State, City, Group Name and Leader.

columns = ["State", "City", "Group Name", "Leader"]
data = []

for page in range(0, 9):
    # 2021 Registered School Based Clubs
    url = "https://www.mathcounts.org/programs/math-club/registered-school-clubs-list?page=" + str(page)
    response = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
    response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", {"class": "views-table"})
    body = table.find("tbody") if table is not None else None
    if body is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise RuntimeError("No 'views-table' listing found at " + url)

    for row in body.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < len(columns):
            # Rows without a full set of cells carry no club record; skip them
            # rather than crash with an IndexError.
            continue
        data.append([cell.get_text().strip() for cell in cells[:len(columns)]])

df = pd.DataFrame(data, columns=columns)

In [31]:
# Preview the first rows of the scraped club table
df.head()

Unnamed: 0,State,City,Group Name,Leader
0,KY,Louisville,Golden Acorn,Mary Hayward
1,AZ,Glendale,Copper Creek,Steve Fay
2,TX,Alba,"Alba-Golden School (Alba, TX)",Cynthia Weimer
3,WI,Cross Plains,St Francis-Xavier,Dave Mack
4,AL,Lanett,Lanett Junior High School,Tiffenie Dozier


In [32]:
# Clean up group name strings: remove parenthesised notes such as "(Alba, TX)"
def _strip_parenthetical(raw_name):
    """Return `raw_name` with every "(...)" segment removed and outer whitespace trimmed."""
    without_notes = re.sub(r"\([^)]+\)", "", raw_name)
    return without_notes.strip()


df["Group Name (Clean)"] = df["Group Name"].map(_strip_parenthetical)
df.head()

Unnamed: 0,State,City,Group Name,Leader,Group Name (Clean)
0,KY,Louisville,Golden Acorn,Mary Hayward,Golden Acorn
1,AZ,Glendale,Copper Creek,Steve Fay,Copper Creek
2,TX,Alba,"Alba-Golden School (Alba, TX)",Cynthia Weimer,Alba-Golden School
3,WI,Cross Plains,St Francis-Xavier,Dave Mack,St Francis-Xavier
4,AL,Lanett,Lanett Junior High School,Tiffenie Dozier,Lanett Junior High School


In [34]:
# Geocode locations as coordinates
# The Google API key is read from credentials.json (key "API_KEY") rather than
# being hard-coded in the notebook.
with open('credentials.json') as f:
    # Read the key directly instead of going through `data`, so the scraped
    # rows held in `data` are not overwritten by the credentials dict.
    API_TOKEN = json.load(f)["API_KEY"]
geolocator = GoogleV3(api_key=API_TOKEN, timeout=10)

In [51]:
columns = ["State", "City", "Group Name", "Leader", "Group Name (Clean)", "Lat", "Lon"]
data = []

# Only names containing at least one of these substrings are sent to the
# geocoder; every other row gets empty coordinates.
SCHOOL_KEYWORDS = ("school", "academy", "elementary", "mshs", "high", "middle", "ms/hs", "homeschool", "prep")

for _idx, row in df.iterrows():
    name = row["Group Name (Clean)"].lower()

    location = None
    if any(keyword in name for keyword in SCHOOL_KEYWORDS):
        query = ", ".join([row["Group Name (Clean)"], row["City"], row["State"], "US"])
        location = geolocator.geocode(query)
        # Sleep to prevent API overload. Only done after a real request, so
        # rows that were filtered out no longer cost half a second each.
        time.sleep(.5)

    # Each output row is the original row plus latitude and longitude
    # (both None when the row was skipped or the geocoder found nothing).
    if location is None:
        data.append(list(row) + [None, None])
    else:
        data.append(list(row) + [location.latitude, location.longitude])

In [28]:
# Rebuild `df` from the geocoded rows in `data`, now including "Lat" and "Lon".
# NOTE(review): this replaces the frame the geocoding loop iterates over, so
# re-running that loop after this cell would presumably produce rows that no
# longer match this column list - confirm the cells are run once, in order.
df = pd.DataFrame(data,columns=["State", "City", "Group Name", "Leader", "Group Name (Clean)","Lat", "Lon"])
df.head()

Unnamed: 0,State,City,Group Name,Leader,Group Name (Clean),Lat1,Lon1,Lat,Lon
0,KY,Louisville,Golden Acorn,Mary Hayward,Golden Acorn,,,38.096465,-85.855566
1,AZ,Glendale,Copper Creek,Steve Fay,Copper Creek,32.750901,-110.476483,33.684564,-112.210209
2,TX,Alba,"Alba-Golden School (Alba, TX)",Cynthia Weimer,Alba-Golden School,32.752062,-95.582733,32.752062,-95.582733
3,WI,Cross Plains,St Francis-Xavier,Dave Mack,St Francis-Xavier,,,43.119824,-89.634631
4,AL,Lanett,Lanett Junior High School,Tiffenie Dozier,Lanett Junior High School,32.856529,-85.197042,32.856529,-85.197042


In [32]:
# Save the geocoded table to disk. The DataFrame index is written as well
# (pandas default), so the CSV starts with an unnamed index column.
df.to_csv("competitions.csv")

In [29]:
# Rows still lacking a latitude have to be cleaned up by hand; show how many there are
df.loc[df["Lat"].isna()].shape

(1, 9)

In [19]:
# (rows, columns) of the current table
df.shape

(430, 7)

In [20]:
# List the rows that have no latitude
df.loc[df["Lat"].isna()]

Unnamed: 0,State,City,Group Name,Leader,Group Name (Clean),Lat,Lon
0,KY,Louisville,Golden Acorn,Mary Hayward,Golden Acorn,,
3,WI,Cross Plains,St Francis-Xavier,Dave Mack,St Francis-Xavier,,
6,TN,Antioch,"Apollo Middle School (Antioch, TN)",Elizabeth Rose,Apollo Middle School,,
7,ID,Lewiston,Cornerstone Christian School,Dena Johnson,Cornerstone Christian School,,
14,CA,Portola Valley,Woodland School,Betsy Zager,Woodland School,,
29,CA,Castro Valley,Creekside Middle School,Isabella Siu,Creekside Middle School,,
30,NJ,Highland Park,Highland Park Middle School,Lizette Lopez,Highland Park Middle School,,
31,MI,Monroe,HIS Homeschool,Jennifer Spencer,HIS Homeschool,,
36,VT,Springfield,Riverside Middle School (Weber),Brian Weber,Riverside Middle School,,
39,TX,Houston,"St. Thomas Episcopal (Houston, TX)",Gordon Sampson,St. Thomas Episcopal,,


Location(360 La Cuesta Dr, Portola Valley, CA 94028, USA, (37.402995, -122.19691, 0.0))