# Imports

In [258]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
import re
from datetime import datetime
import time
import itertools
import random
import numpy as np
from io import BytesIO


# Setup

In [3]:
# OHSAA Southwest District conferences
domains = ['https://eccsports.com/',
           'https://gmcsports.com/',
           'http://www.swocsports.com/',
           'http://www.swblsports.com/',
           'http://sbaac.com/',
           'http://ggcl.gclsports.com/',
           ]

# Gender URL examples (standings, schedule, statistics)
girls = ['35', '215', '218']
boys = ['30', '137', '162']

# Requests headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}


# Download List of Matches

In [156]:
match_ids = []
totals_rows_list = []
for year in range(2007, 2023):
    for gender in [girls, boys]:
        for domain in domains[:]:
            page = requests.get(
                f'{domain}schedule.aspx?satc={gender[1]}&cmp=1&year={year}', headers=headers)
            soup = BeautifulSoup(page.content, 'html.parser')
            title = soup.find('title').string
            match_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in match_links:
                href = link['href']
                matches = re.findall(r'id=(.*)', href)

                # Add to list of Match IDs
                match_ids.append(matches[0])

                # Add to list of Match IDs with totals from table
                id = int(matches[0])
                total = link.parent.find_previous_sibling().findChild().contents
                date = link.find_previous('th').find_previous('th').contents
                date_parsed = str(datetime.strptime(
                    date[0], '%A, %B %d, %Y').isoformat()[:10])

                row = [id, total, date_parsed,
                       f'{year}-{year+1}', gender[1], domain, title]
                totals_rows_list.append(row)
            time.sleep(.25)


In [158]:
df_matches_totals = pd.DataFrame(totals_rows_list, columns=[
                                 'match_id', 'total', 'date', 'year', 'gender', 'domain', 'title'])
df_matches_totals['total'] = df_matches_totals['total'].str[0]
df_matches_totals.to_csv('matches_sw_district.csv')
df_matches_totals


Unnamed: 0,match_id,total,date,year,gender,domain,title
0,22219,"Loveland 1602, Hamilton 1313",2008-11-24,2008-2009,215,https://gmcsports.com/,Greater Miami Conference
1,22220,"Middletown 2206, Sycamore 1976",2008-11-24,2008-2009,215,https://gmcsports.com/,Greater Miami Conference
2,22222,"Princeton 2318, Colerain 1895",2008-11-25,2008-2009,215,https://gmcsports.com/,Greater Miami Conference
3,22221,"Mason 1827, Hamilton 1465",2008-11-25,2008-2009,215,https://gmcsports.com/,Greater Miami Conference
4,23966,"Lakota East 2072, Walnut Hills 1728",2008-12-01,2008-2009,215,https://gmcsports.com/,Greater Miami Conference
...,...,...,...,...,...,...,...
11028,237373,"New Richmond 2741, McNicholas 2502",2022-02-08,2021-2022,137,http://sbaac.com/,Southern Buckeye Athletic and Academic Conference
11029,218533,"Clermont Northeastern 2256, East Clinton 2112",2022-02-08,2021-2022,137,http://sbaac.com/,Southern Buckeye Athletic and Academic Conference
11030,237374,"Georgetown 2489, New Richmond 2337",2022-02-11,2021-2022,137,http://sbaac.com/,Southern Buckeye Athletic and Academic Conference
11031,240020,Felicity-Franklin at Sectional Tournament **sc...,2022-02-15,2021-2022,137,http://sbaac.com/,Southern Buckeye Athletic and Academic Conference


## Filter out matches without scores or outside usual

In [None]:
has_score_filter = df_matches_totals['Total'].str.contains('(\d\d\d.*){2}')
df_has_score = df_matches_totals[has_score_filter]
not_actual_match = df_has_score['Total'].str.contains(
    '(\d\d\d.+){3,}|( Tournament )|( at )|(lassic)|(,.+){2}')
df_has_score = df_has_score[~not_actual_match]


In [9]:
df_has_score.to_csv('matches_sw_district_have_scores.csv')


## Split score totals

In [None]:
df_score_split = df_has_score.copy()
df_score_split['Total'] = df_score_split['Total'].str.split(',')
df_score_split = df_score_split.explode('Total').reset_index(drop=True)
df_score_split['Name'] = df_score_split['Total'].str[:-5]
df_score_split['Score'] = df_score_split['Total'].str[-4:]


In [15]:
df_score_split.to_csv('matches_sw_district_have_scores_split.csv')


## Download baker games

In [201]:
baker_games = []
for ind in df_matches_totals.sample(frac=1).index:
    page = requests.get(f'{df_matches_totals.domain[ind]}bwMatchStats.aspx?matchid={df_matches_totals.match_id[ind]}', headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    ths = soup.find_all('th', string='BAKER GAMES')
    for i in range(len(ths)):
        games_list = []
        school_th = ths[i].findPrevious('thead').findNext('th').text
        for j in range(6):
            games_list.append(ths[i].parent.findNext('td').find_next_siblings()[j].text)
        baker_row = (df_matches_totals.domain[ind], df_matches_totals.match_id[ind], school_th, games_list)
        baker_games.append(baker_row)
        time.sleep(.25)


In [202]:
df_baker_games = pd.DataFrame(baker_games, columns=['domain', 'match_id', 'school', 'games'])
df_baker_games.to_csv('baker_games.csv')
df_baker_games

Unnamed: 0,domain,match_id,school,games
0,https://eccsports.com/,160342,Loveland,"[90, 107, , , , ]"
1,https://eccsports.com/,160342,Winton Woods,"[92, 76, , , , ]"
2,https://gmcsports.com/,24731,Lakota West,"[240, 194, , , , ]"
3,http://sbaac.com/,238189,Felicity-Franklin,"[0, 0, 121, 119, 125, 111]"
4,https://gmcsports.com/,22173,Princeton,"[181, 160, 203, 167, , ]"
...,...,...,...,...
14281,https://eccsports.com/,115787,Loveland,"[92, 110, 122, 122, , ]"
14282,http://ggcl.gclsports.com/,147465,Mount Notre Dame,"[132, 126, 128, 128, , ]"
14283,http://ggcl.gclsports.com/,147465,Mercy McAuley,"[190, 220, 160, 174, , ]"
14284,https://gmcsports.com/,66863,Oak Hills,"[169, 213, 202, 169, , ]"


# Download List of Schools

In [18]:
schools_rows_list = []
for domain in domains[:]:
    for gender in [girls, boys]:
        for year in range(2007, 2023):
            page = requests.get(
                f'{domain}confstandings.aspx?sat={gender[0]}&cmp=1&year={year}')
            soup = BeautifulSoup(page.content, 'html.parser')
            school_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in school_links:

                school_name = link.contents

                schools_rows_list.append(school_name)
            time.sleep(10)


In [19]:
df_official_names = pd.DataFrame(schools_rows_list, columns=['Name'])
df_official_names = df_official_names.drop_duplicates().reset_index(drop=True)


In [20]:
df_official_names.to_csv('official_names.csv')


# Download Player Stats

In [4]:
school_ids = []
schools_rows_list = []

for year in range(2007, 2023):
    for gender in [girls, boys]:
        for domain in domains[:]:
            page = requests.get(
                f'{domain}bwstatistics.aspx?satc={gender[2]}&year={year}', headers=headers)
            soup = BeautifulSoup(page.content, 'html.parser')
            stats_links = soup.select('a[href*=teamStats]')
            for link in stats_links:
                href = link['href']
                matches = re.findall(r'id=(.*)', href)
                school_ids.append(matches[0])
                id = int(matches[0])
                row = [id, domain]
                schools_rows_list.append(row)


In [5]:
schools_rows_list.sort()
unique_schools = list(k for k, _ in itertools.groupby(schools_rows_list))
df_unique_schools = pd.DataFrame(
    unique_schools, columns=['SchoolID', 'Domain'])
unique_schools


[[1, 'https://gmcsports.com/'],
 [2, 'https://gmcsports.com/'],
 [3, 'https://gmcsports.com/'],
 [4, 'https://gmcsports.com/'],
 [5, 'https://gmcsports.com/'],
 [6, 'https://gmcsports.com/'],
 [7, 'https://eccsports.com/'],
 [8, 'https://gmcsports.com/'],
 [9, 'https://gmcsports.com/'],
 [10, 'https://gmcsports.com/'],
 [11, 'http://ggcl.gclsports.com/'],
 [12, 'http://ggcl.gclsports.com/'],
 [13, 'http://ggcl.gclsports.com/'],
 [16, 'http://ggcl.gclsports.com/'],
 [18, 'http://ggcl.gclsports.com/'],
 [19, 'http://ggcl.gclsports.com/'],
 [22, 'https://eccsports.com/'],
 [23, 'https://eccsports.com/'],
 [24, 'http://sbaac.com/'],
 [25, 'http://www.swocsports.com/'],
 [26, 'https://eccsports.com/'],
 [27, 'http://www.swocsports.com/'],
 [27, 'https://eccsports.com/'],
 [28, 'https://eccsports.com/'],
 [29, 'https://gmcsports.com/'],
 [31, 'http://www.swocsports.com/'],
 [32, 'http://www.swocsports.com/'],
 [33, 'https://eccsports.com/'],
 [34, 'https://eccsports.com/'],
 [35, 'http://sba

In [6]:

df_unique_schools.to_csv('school_ids.csv')


In [7]:
players_rows_list = []
random.shuffle(unique_schools)
for year in range(2007, 2023):
    for gender in [girls, boys]:
        for school_id, domain in unique_schools[:]:
            page = requests.get(
                f'{domain}teamStats.aspx?sat={gender[0]}&cmp=1&year={year}&schoolid={school_id}', headers=headers)
            soup = BeautifulSoup(page.content, 'html.parser')
            player_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in player_links:
                href = link['href']
                matches = re.findall(r'player=(.*)', href)

                # Add to list of Match IDs with totals from table
                id = int(matches[0])

                row = [id, domain, ]
                players_rows_list.append(row)
            time.sleep(.25)
players_rows_list
df_player_ids = pd.DataFrame(players_rows_list, columns=[
                             'player_id', 'domain', ])


[[57953, 'http://ggcl.gclsports.com/'],
 [57949, 'http://ggcl.gclsports.com/'],
 [57948, 'http://ggcl.gclsports.com/'],
 [57950, 'http://ggcl.gclsports.com/'],
 [57954, 'http://ggcl.gclsports.com/'],
 [57951, 'http://ggcl.gclsports.com/'],
 [57955, 'http://ggcl.gclsports.com/'],
 [57952, 'http://ggcl.gclsports.com/'],
 [58307, 'http://ggcl.gclsports.com/'],
 [58308, 'http://ggcl.gclsports.com/'],
 [58309, 'http://ggcl.gclsports.com/'],
 [58310, 'http://ggcl.gclsports.com/'],
 [64552, 'http://ggcl.gclsports.com/'],
 [58311, 'http://ggcl.gclsports.com/'],
 [58312, 'http://ggcl.gclsports.com/'],
 [58313, 'http://ggcl.gclsports.com/'],
 [58314, 'http://ggcl.gclsports.com/'],
 [57914, 'http://ggcl.gclsports.com/'],
 [57915, 'http://ggcl.gclsports.com/'],
 [57916, 'http://ggcl.gclsports.com/'],
 [57917, 'http://ggcl.gclsports.com/'],
 [57918, 'http://ggcl.gclsports.com/'],
 [62204, 'http://ggcl.gclsports.com/'],
 [57919, 'http://ggcl.gclsports.com/'],
 [63577, 'http://ggcl.gclsports.com/'],


In [8]:
df_player_ids.to_csv('player_ids.csv')


In [9]:
gamestats = []
random.shuffle(players_rows_list)
for p_id, dom in players_rows_list:
    page = requests.get(f'{dom}playerStats.aspx?player={p_id}', headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title').string
    schoolH1 = soup.select('h1')
    player_name = soup.select('h2')[0].contents[0].text
    try:
        grade_lvl = soup.select('span[id*=GradeVal]')[0].contents[0].text
    except:
        grade_lvl = ''
    match_list = soup.select('a[id*=GamesRepeat]')
    # soup.select('a[id*=ContentPlaceHolder1]')
    schname = schoolH1[0].contents[0].text
    genname = schoolH1[0].contents[2].text
    if title == 'Girls Greater Catholic League':
        ind = 0
        genname = 'Girls'
    else:
        ind = 1
    schgen = [re.findall(pattern='(\w+.*)', string=schname)
              [0], re.findall(pattern='(\w+)', string=genname)[ind]]
    for i in range(len(match_list)):
        href = match_list[i]['href']
        match_id = re.findall(r'ID=(.*)', href)
        pins = match_list[i].find_previous().find_previous_siblings()[3].text
        location = match_list[i].find_previous().find_previous_siblings()[
            5].text
        opponent = match_list[i].find_previous().find_previous_siblings()[
            6].text
        row = [player_name, p_id, schgen[1], grade_lvl,
               schgen[0], match_id[0], location, opponent, pins, ]
        gamestats.append(row)
    time.sleep(.25)


In [135]:
df_gamestats = pd.DataFrame(gamestats, columns=[
                            'player_name', 'player_id', 'gender', 'grade_level', 'school', 'match_id', 'location', 'opponent', 'pins'])
# Count number of games in match
df_gamestats['num'] = df_gamestats['pins'].str.count(',') + 1
df_gamestats


Unnamed: 0,player_name,player_id,gender,grade_level,school,match_id,location,opponent,pins,num
0,Chuck Hammond,92929,Boys,Senior,Mason,31538,Eastern,Middletown,"213, 211",2
1,Chuck Hammond,92929,Boys,Senior,Mason,31539,Columbia,Hamilton,"232, 254",2
2,Chuck Hammond,92929,Boys,Senior,Mason,31540,Columbus,Kick-off,"212, 149, 164",3
3,Chuck Hammond,92929,Boys,Senior,Mason,31541,mason,Fairfield,"223, 211",2
4,Chuck Hammond,92929,Boys,Senior,Mason,31542,Western,Holiday Classic,"211, 233, 220",3
...,...,...,...,...,...,...,...,...,...,...
89528,Kayla Mineer,448590,Girls,Junior,Anderson,115866,Cherry Grove Lanes,West Clermont,"168, 135",2
89529,Kayla Mineer,448590,Girls,Junior,Anderson,115868,Cherry Grove Lanes,West Clermont,"124, 146",2
89530,Kayla Mineer,448590,Girls,Junior,Anderson,115874,Cherry Grove Lanes,Turpin,"112, 156",2
89531,Kayla Mineer,448590,Girls,Junior,Anderson,115877,Cherry Grove Lanes,Turpin,"125, 143",2


In [34]:
df_gamestats.to_csv('gamestats.csv')


In [153]:
# Remove whitespace in pins column
df_gamestats['pins'] = df_gamestats['pins'].replace(' ', '', regex=True)
# Split pins to individual games and rename new columns
df_games_split = pd.DataFrame(df_gamestats['pins'].str.split(',').tolist(), )
df_games_split.rename(columns={i: "game"+str(i+1)
                      for i in range(len(df_games_split.columns))}, inplace=True)
# Convert to int and drop all-NaN rows
df_games_split = df_games_split.apply(pd.to_numeric, downcast='integer')
df_games_split.dropna(how='all', inplace=True)
# Calculate standard deviation, variance, median, mean, maximum, minimum, range, and total pinfall
df_games_split['std'] = np.std(df_games_split, axis=1).round(0)
df_games_split['var'] = np.var(df_games_split, axis=1).round(0)
df_games_split['med'] = np.nanmedian(df_games_split, axis=1).round(0)
df_games_split['avg'] = df_games_split.iloc[:, :-3].mean(axis=1).round(0)
df_games_split['max'] = df_games_split.iloc[:, :-4].max(axis=1)
df_games_split['min'] = df_games_split.iloc[:, :-5].min(axis=1)
df_games_split['rng'] = df_games_split['max'] - df_games_split['min']
df_games_split['tot'] = df_games_split.iloc[:, :-7].sum(axis=1)
# Re-convert to int
df_games_split = df_games_split.astype('Int16')
df_games_split


Unnamed: 0,game1,game2,game3,game4,game5,game6,std,var,med,avg,max,min,rng,tot
0,213,211,,,,,1,9894,212,212,213,211,2,424
1,232,254,,,,,11,12042,243,243,254,232,22,486
2,212,149,164,,,,27,4648,164,175,212,149,63,525
3,223,211,,,,,6,9918,217,217,223,211,12,434
4,211,233,220,,,,9,8515,220,221,233,211,22,664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89528,168,135,,,,,16,4262,152,152,168,135,33,303
89529,124,146,,,,,11,3498,135,135,146,124,22,270
89530,112,156,,,,,22,3110,134,134,156,112,44,268
89531,125,143,,,,,9,3526,134,134,143,125,18,268


In [154]:
df_gamestats_split = pd.concat([df_gamestats, df_games_split], axis=1)
df_gamestats_split


Unnamed: 0,player_name,player_id,gender,grade_level,school,match_id,location,opponent,pins,num,...,game5,game6,std,var,med,avg,max,min,rng,tot
0,Chuck Hammond,92929,Boys,Senior,Mason,31538,Eastern,Middletown,213211,2,...,,,1,9894,212,212,213,211,2,424
1,Chuck Hammond,92929,Boys,Senior,Mason,31539,Columbia,Hamilton,232254,2,...,,,11,12042,243,243,254,232,22,486
2,Chuck Hammond,92929,Boys,Senior,Mason,31540,Columbus,Kick-off,212149164,3,...,,,27,4648,164,175,212,149,63,525
3,Chuck Hammond,92929,Boys,Senior,Mason,31541,mason,Fairfield,223211,2,...,,,6,9918,217,217,223,211,12,434
4,Chuck Hammond,92929,Boys,Senior,Mason,31542,Western,Holiday Classic,211233220,3,...,,,9,8515,220,221,233,211,22,664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89528,Kayla Mineer,448590,Girls,Junior,Anderson,115866,Cherry Grove Lanes,West Clermont,168135,2,...,,,16,4262,152,152,168,135,33,303
89529,Kayla Mineer,448590,Girls,Junior,Anderson,115868,Cherry Grove Lanes,West Clermont,124146,2,...,,,11,3498,135,135,146,124,22,270
89530,Kayla Mineer,448590,Girls,Junior,Anderson,115874,Cherry Grove Lanes,Turpin,112156,2,...,,,22,3110,134,134,156,112,44,268
89531,Kayla Mineer,448590,Girls,Junior,Anderson,115877,Cherry Grove Lanes,Turpin,125143,2,...,,,9,3526,134,134,143,125,18,268


In [155]:
df_gamestats_split.to_csv('gamestats_split.csv')


## Find school locations

Pulling locations from [OpenStreetMap](https://openstreetmap.org/copyright). Data is licensed under the [Open Data Commons Open Database License](https://opendatacommons.org/licenses/odbl/).

In [249]:
school_loc_rows = []
school_names = df_gamestats_split.school.unique()
for name in school_names:
    response = requests.get(f'https://nominatim.openstreetmap.org/search.php?q={name}+high,ohio&format=jsonv2').json()
    if len(response) >= 1:
        pass
    else:
        response = requests.get(f'https://nominatim.openstreetmap.org/search.php?q={name}+academy,ohio&format=jsonv2').json()
    # Some coordinates returned with ridiculous 15 decimal places; school locations don't need atomic-level precision, and even 5 is probably overkill
    lat = round(float(response[0]['lat']), 5)
    lon = round(float(response[0]['lon']), 5)
    display_name = response[0]['display_name']
    zip = re.search(pattern='(\d{5})', string=display_name).expand('\g<1>')
    school_loc_rows.append([name, lat, lon, display_name, zip])
    time.sleep(5)


In [250]:
df_school_locations = pd.DataFrame(school_loc_rows, columns=['name', 'latitude', 'longitude', 'display_name', 'zip'])
df_school_locations

Unnamed: 0,name,latitude,longitude,display_name,zip
0,Mason,39.35071,-84.30624,"William Mason High School, Lakeside Drive, Mas...",45040
1,Mount Notre Dame,39.222,-84.43283,"Mount Notre Dame High School, East Benson Aven...",45215
2,Turpin,39.10551,-84.36641,"Turpin High School, Bartels Road, Anderson Tow...",45244
3,Winton Woods,39.28798,-84.52618,"Winton Woods High School, 1231, West Kemper Ro...",45240
4,Fairfield,39.33688,-84.51831,"Fairfield Senior High School, 8800, Holden Bou...",45014
5,Kings,39.35317,-84.25501,"Kings High School, McClelland Avenue, Deerfiel...",45040
6,Milford,39.17987,-84.24062,"Milford High School, Eagles Way, Miami Townshi...",45150
7,McAuley,39.20058,-84.55312,"Mercy McAuley High School, 6000, Oakwood Avenu...",45224
8,Hamilton,39.42485,-84.58,"Hamilton High School, Eaton Avenue, River View...",45013
9,Sycamore,39.26651,-84.34983,"Sycamore High School, Cincinnati By-Pass, Mont...",45249


In [251]:
# Franklin returned a rec center at a university for whatever reason; corrected this one manually for ease of use
df_school_locations.loc[40] = 'Franklin', 39.55327, -84.28708, 'Franklin High School, 750, East 4th Street, Franklin, Warren County, Ohio, 45005, United States', 45005
df_school_locations

Unnamed: 0,name,latitude,longitude,display_name,zip
0,Mason,39.35071,-84.30624,"William Mason High School, Lakeside Drive, Mas...",45040
1,Mount Notre Dame,39.222,-84.43283,"Mount Notre Dame High School, East Benson Aven...",45215
2,Turpin,39.10551,-84.36641,"Turpin High School, Bartels Road, Anderson Tow...",45244
3,Winton Woods,39.28798,-84.52618,"Winton Woods High School, 1231, West Kemper Ro...",45240
4,Fairfield,39.33688,-84.51831,"Fairfield Senior High School, 8800, Holden Bou...",45014
5,Kings,39.35317,-84.25501,"Kings High School, McClelland Avenue, Deerfiel...",45040
6,Milford,39.17987,-84.24062,"Milford High School, Eagles Way, Miami Townshi...",45150
7,McAuley,39.20058,-84.55312,"Mercy McAuley High School, 6000, Oakwood Avenu...",45224
8,Hamilton,39.42485,-84.58,"Hamilton High School, Eaton Avenue, River View...",45013
9,Sycamore,39.26651,-84.34983,"Sycamore High School, Cincinnati By-Pass, Mont...",45249


In [252]:
df_school_locations.to_csv('school_locations.csv')

# Find median income data

In [255]:
median_raw = requests.get('https://api.census.gov/data/2020/acs/acs5/subject?get=NAME,S1903_C03_001E&for=zip%20code%20tabulation%20area:*').content

In [287]:
# Request returned as bytes
df_median_income = pd.read_csv(BytesIO(median_raw))
df_median_income

Unnamed: 0,"[[""NAME""",S1903_C03_001E,zip code tabulation area],Unnamed: 3
0,"[""ZCTA5 29590""",30985,29590],
1,"[""ZCTA5 93306""",54450,93306],
2,"[""ZCTA5 93660""",39625,93660],
3,"[""ZCTA5 93110""",93264,93110],
4,"[""ZCTA5 93212""",42983,93212],
...,...,...,...,...
33115,"[""ZCTA5 16623""",51667,16623],
33116,"[""ZCTA5 16627""",45000,16627],
33117,"[""ZCTA5 16634""",51500,16634],
33118,"[""ZCTA5 16640""",55982,16640],


In [289]:
# Fix messy conversion to DataFrame
df_median_income.drop(df_median_income.columns[[0, 3]], axis=1, inplace=True)
df_median_income.set_axis(['zip_code', 'median_income'], axis=1, inplace=True)
df_median_income['median_income'] = df_median_income['median_income'].apply(lambda x: x.replace(']',''))
df_median_income = df_median_income.astype('int32')
df_median_income

Unnamed: 0,zip_code,median_income
0,30985,29590
1,54450,93306
2,39625,93660
3,93264,93110
4,42983,93212
...,...,...
33115,51667,16623
33116,45000,16627
33117,51500,16634
33118,55982,16640


In [294]:
df_median_income.to_csv('median_income_by_zip.csv')

# Find bowling alley locations

In [361]:
page = requests.get('https://www.kidsbowlfree.com/all_centers.php')
soup = BeautifulSoup(page.content, 'html.parser')

In [364]:
alley_as = soup.find_all('a', {'title': re.compile('OH|KY|IN')})

In [366]:
alley_ids = []
for i in range(len(alley_as)):
    match = re.findall(r'id=(\d*)', str(alley_as[i]))[0]
    alley_ids.append(match)

In [370]:
alley_rows_list = []
for i in range(len(alley_ids)):
    page = requests.get(f'https://www.kidsbowlfree.com/center.php?alley_id={alley_ids[i]}')
    soup = BeautifulSoup(page.content, 'html.parser')
    name = soup.find_all('h1')[0].text.rstrip()
    address = soup.find_all('h4')[0].text.split('\n')
    street = address[0]
    rest = address[1].split(',')
    city = rest[0].lstrip()
    state = rest[1].lstrip()
    zip = rest[2].lstrip()[:5]
    try:
        response = requests.get(f'https://nominatim.openstreetmap.org/search.php?q={street}+{city},{state}&format=jsonv2').json()
        if len(response) >=1:
            lat = round(float(response[0]['lat']), 5)
            lon = round(float(response[0]['lon']), 5)
        else:
            lat = 0
            lon = 0
    except ValueError:  
        print('Decoding JSON failed')
    row =[name, street, city, state, zip, lat, lon]
    alley_rows_list.append(row)
    time.sleep(15)

Decoding JSON failed


In [400]:
df_alleys = pd.DataFrame(alley_rows_list, columns=['name', 'street_address', 'city', 'state', 'zip', 'latitude', 'longitude'])
df_alleys

Unnamed: 0,name,street_address,city,state,zip,latitude,longitude
0,Beech Grove Bowl,95 N 2nd Ave,Beech Grove,IN,46107,39.72342,-86.08373
1,Classic Bowling Lanes,1421 N Willis Dr,Bloomington,IN,47404,39.18003,-86.54533
2,IMU Bowling & Billiards,900 E. Seventh St.,Bloomington,IN,47405,0.00000,0.00000
3,Blackiston Bowl,1516 Blackiston Mill Rd,Clarksville,IN,47129,38.31385,-85.77146
4,Clarksville Strike & Spare,900 Eastern Blvd,Clarksville,IN,47129,38.30337,-85.76479
...,...,...,...,...,...,...,...
112,Le Ella Lanes,1428 US 22 NW,Washington Court House,OH,43160,0.00000,0.00000
113,Le Ella Lanes of Wellston,15 N Park Ave,Wellston,OH,45692,39.12390,-82.53593
114,Dynasty Lanes,3105 S.R. 103 East,Willard,OH,44890,0.00000,0.00000
115,Royal Z Lanes,2667 St Rt 22&3,Wilmington,OH,45177,48.03349,-71.29568


In [401]:
df_alleys.to_csv('alley_addresses.csv')

In [402]:
df_alleys = df_alleys.loc[((df_alleys['latitude'].between(38.75, 40.5))&(df_alleys['longitude'].between(-85.5, -83.75))),:]
df_alleys = df_alleys.drop_duplicates()
df_alleys

Unnamed: 0,name,street_address,city,state,zip,latitude,longitude
19,Durbin Bowl,158 Front St,Lawrenceburg,IN,47025,39.09341,-84.85595
27,Richmond 40 Bowl,75 S 37th St,Richmond,IN,47374,39.8252,-84.85065
33,Southern Lanes,7634 Alexandria Pike,Alexandria,KY,41001,38.97744,-84.39221
36,Strike & Spare Erlanger,510 Commonwealth Ave,Erlanger,KY,41018,39.02066,-84.60484
39,La Ru Bowling Lanes,2443 Alexandria Pike,Highland Heights,KY,41076,39.03854,-84.45038
47,Super Bowl Bellewood,1211 Waterworks Rd.,Newport,KY,41071,39.09446,-84.46945
56,Batavia Bowl,1991 James E. Sauls Drive,Batavia,OH,45103,39.07331,-84.1134
65,Strike & Spare Western Bowl,6383 Glenway Ave.,Cincinnati,OH,45211,39.14543,-84.62431
66,Pla-Mor Lanes,225 E Hardin St,Coldwater,OH,45828,40.4887,-84.62483
71,Poelking Lanes - Dayton,1403 Wilmington Ave,Dayton,OH,45420,39.72844,-84.15566


In [403]:
df_alleys.to_csv('alley_addresses_bounded.csv')