# Imports

In [49]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
import re
from datetime import datetime
import time
import itertools
import random
import numpy as np

# Setup

In [3]:
# OHSAA Southwest District conferences
domains = ['https://eccsports.com/',
           'https://gmcsports.com/',
           'http://www.swocsports.com/',
           'http://www.swblsports.com/',
           'http://sbaac.com/',
           'http://ggcl.gclsports.com/',
           ]

# Gender URL examples (standings, schedule, statistics)
girls = ['35', '215', '218']
boys = ['30', '137', '162']

# Requests headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}


# Download List of Matches

In [6]:
match_ids = []
totals_rows_list = []
for year in range(2007, 2023):
    for gender in [girls, boys]:
        for domain in domains[:]:
            page = requests.get(
                f'{domain}schedule.aspx?satc={gender[1]}&cmp=1&year={year}', headers=headers)
            soup = BeautifulSoup(page.content, 'html.parser')
            title = soup.find('title').string
            match_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in match_links:
                href = link['href']
                matches = re.findall(r'id=(.*)', href)

                # Add to list of Match IDs
                match_ids.append(matches[0])

                # Add to list of Match IDs with totals from table
                id = int(matches[0])
                total = link.parent.find_previous_sibling().findChild().contents
                date = link.find_previous('th').find_previous('th').contents
                date_parsed = str(datetime.strptime(
                    date[0], '%A, %B %d, %Y').isoformat()[:10])

                row = [id, total, date_parsed,
                       f'{year}-{year+1}', gender[1], domain, title]
                totals_rows_list.append(row)
            time.sleep(1)


In [7]:
df_matches_totals = pd.DataFrame(totals_rows_list, columns=[
                                 'MatchID', 'Total', 'Date', 'Year', 'Gender', 'Domain', 'Title'])
df_matches_totals['Total'] = df_matches_totals['Total'].str[0]
df_matches_totals.to_csv('matches_sw_district.csv')


## Filter out matches without scores or outside usual

In [None]:
has_score_filter = df_matches_totals['Total'].str.contains('(\d\d\d.*){2}')
df_has_score = df_matches_totals[has_score_filter]
not_actual_match = df_has_score['Total'].str.contains(
    '(\d\d\d.+){3,}|( Tournament )|( at )|(lassic)|(,.+){2}')
df_has_score = df_has_score[~not_actual_match]


In [9]:
df_has_score.to_csv('matches_sw_district_have_scores.csv')


## Split score totals

In [None]:
df_score_split = df_has_score.copy()
df_score_split['Total'] = df_score_split['Total'].str.split(',')
df_score_split = df_score_split.explode('Total').reset_index(drop=True)
df_score_split['Name'] = df_score_split['Total'].str[:-5]
df_score_split['Score'] = df_score_split['Total'].str[-4:]


In [15]:
df_score_split.to_csv('matches_sw_district_have_scores_split.csv')


# Download List of Schools

In [18]:
schools_rows_list = []
for domain in domains[:]:
    for gender in [girls, boys]:
        for year in range(2007, 2023):
            page = requests.get(f'{domain}confstandings.aspx?sat={gender[0]}&cmp=1&year={year}')
            soup = BeautifulSoup(page.content, 'html.parser')
            school_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in school_links:
                
                school_name = link.contents
                
                schools_rows_list.append(school_name) 
            time.sleep(10)


In [19]:
df_official_names = pd.DataFrame(schools_rows_list, columns=['Name'])
df_official_names = df_official_names.drop_duplicates().reset_index(drop=True)

In [20]:
df_official_names.to_csv('official_names.csv')

# Download Player Stats

In [4]:
school_ids = []
schools_rows_list = []

for year in range(2007, 2023):
    for gender in [girls, boys]:
        for domain in domains[:]:
            page = requests.get(f'{domain}bwstatistics.aspx?satc={gender[2]}&year={year}')
            soup = BeautifulSoup(page.content, 'html.parser')
            stats_links = soup.select('a[href*=teamStats]')
            for link in stats_links:
                href = link['href']
                matches = re.findall(r'id=(.*)', href)
                school_ids.append(matches[0])
                id = int(matches[0])
                row = [id, domain]
                schools_rows_list.append(row)


In [5]:
schools_rows_list.sort()
unique_schools = list(k for k,_ in itertools.groupby(schools_rows_list))
df_unique_schools = pd.DataFrame(unique_schools, columns=['SchoolID', 'Domain'])
unique_schools

[[1, 'https://gmcsports.com/'],
 [2, 'https://gmcsports.com/'],
 [3, 'https://gmcsports.com/'],
 [4, 'https://gmcsports.com/'],
 [5, 'https://gmcsports.com/'],
 [6, 'https://gmcsports.com/'],
 [7, 'https://eccsports.com/'],
 [8, 'https://gmcsports.com/'],
 [9, 'https://gmcsports.com/'],
 [10, 'https://gmcsports.com/'],
 [11, 'http://ggcl.gclsports.com/'],
 [12, 'http://ggcl.gclsports.com/'],
 [13, 'http://ggcl.gclsports.com/'],
 [16, 'http://ggcl.gclsports.com/'],
 [18, 'http://ggcl.gclsports.com/'],
 [19, 'http://ggcl.gclsports.com/'],
 [22, 'https://eccsports.com/'],
 [23, 'https://eccsports.com/'],
 [24, 'http://sbaac.com/'],
 [25, 'http://www.swocsports.com/'],
 [26, 'https://eccsports.com/'],
 [27, 'http://www.swocsports.com/'],
 [27, 'https://eccsports.com/'],
 [28, 'https://eccsports.com/'],
 [29, 'https://gmcsports.com/'],
 [31, 'http://www.swocsports.com/'],
 [32, 'http://www.swocsports.com/'],
 [33, 'https://eccsports.com/'],
 [34, 'https://eccsports.com/'],
 [35, 'http://sba

In [6]:

df_unique_schools.to_csv('school_ids.csv')

In [7]:
players_rows_list = []
random.shuffle(unique_schools)
for year in range(2007, 2023):
    for gender in [girls, boys]:
        for school_id, domain in unique_schools[:]:
            page = requests.get(f'{domain}teamStats.aspx?sat={gender[0]}&cmp=1&year={year}&schoolid={school_id}')
            soup = BeautifulSoup(page.content, 'html.parser')
            player_links = soup.select('a[id*=ContentPlaceHolder1]')
            for link in player_links:
                href = link['href']
                matches = re.findall(r'player=(.*)', href)
                
                # Add to list of Match IDs with totals from table
                id = int(matches[0])

                row = [id, domain, ]
                players_rows_list.append(row) 
            time.sleep(.25)
players_rows_list
df_player_ids = pd.DataFrame(players_rows_list, columns=['PlayerID', 'Domain',])


[[57953, 'http://ggcl.gclsports.com/'],
 [57949, 'http://ggcl.gclsports.com/'],
 [57948, 'http://ggcl.gclsports.com/'],
 [57950, 'http://ggcl.gclsports.com/'],
 [57954, 'http://ggcl.gclsports.com/'],
 [57951, 'http://ggcl.gclsports.com/'],
 [57955, 'http://ggcl.gclsports.com/'],
 [57952, 'http://ggcl.gclsports.com/'],
 [58307, 'http://ggcl.gclsports.com/'],
 [58308, 'http://ggcl.gclsports.com/'],
 [58309, 'http://ggcl.gclsports.com/'],
 [58310, 'http://ggcl.gclsports.com/'],
 [64552, 'http://ggcl.gclsports.com/'],
 [58311, 'http://ggcl.gclsports.com/'],
 [58312, 'http://ggcl.gclsports.com/'],
 [58313, 'http://ggcl.gclsports.com/'],
 [58314, 'http://ggcl.gclsports.com/'],
 [57914, 'http://ggcl.gclsports.com/'],
 [57915, 'http://ggcl.gclsports.com/'],
 [57916, 'http://ggcl.gclsports.com/'],
 [57917, 'http://ggcl.gclsports.com/'],
 [57918, 'http://ggcl.gclsports.com/'],
 [62204, 'http://ggcl.gclsports.com/'],
 [57919, 'http://ggcl.gclsports.com/'],
 [63577, 'http://ggcl.gclsports.com/'],


In [8]:
df_player_ids.to_csv('player_ids.csv')

In [9]:
gamestats = []
random.shuffle(players_rows_list)
for p_id, dom in players_rows_list:
    page = requests.get(f'{dom}playerStats.aspx?player={p_id}')
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title').string
    schoolH1 = soup.select('h1')
    player_name = soup.select('h2')[0].contents[0].text
    try:
        grade_lvl = soup.select('span[id*=GradeVal]')[0].contents[0].text
    except:
        grade_lvl = ''
    match_list = soup.select('a[id*=GamesRepeat]')
    #soup.select('a[id*=ContentPlaceHolder1]')
    schname = schoolH1[0].contents[0].text
    genname = schoolH1[0].contents[2].text
    if title=='Girls Greater Catholic League':
        ind=0
        genname = 'Girls'
    else:
        ind=1
    schgen = [re.findall(pattern='(\w+.*)', string=schname)[0], re.findall(pattern='(\w+)', string=genname)[ind]]
    for i in range(len(match_list)):
        href = match_list[i]['href']
        match_id = re.findall(r'ID=(.*)', href)
        pins = match_list[i].find_previous().find_previous_siblings()[3].text
        location = match_list[i].find_previous().find_previous_siblings()[5].text
        opponent = match_list[i].find_previous().find_previous_siblings()[6].text
        row = [player_name, p_id, schgen[1], grade_lvl, schgen[0], match_id[0], location, opponent, pins,]
        gamestats.append(row)
    time.sleep(.25)


In [135]:
df_gamestats = pd.DataFrame(gamestats, columns=['player_name', 'player_id', 'gender', 'grade_level', 'school', 'match_id', 'location', 'opponent', 'pins'])
# Count number of games in match
df_gamestats['num'] = df_gamestats['pins'].str.count(',') + 1
df_gamestats


Unnamed: 0,player_name,player_id,gender,grade_level,school,match_id,location,opponent,pins,num
0,Chuck Hammond,92929,Boys,Senior,Mason,31538,Eastern,Middletown,"213, 211",2
1,Chuck Hammond,92929,Boys,Senior,Mason,31539,Columbia,Hamilton,"232, 254",2
2,Chuck Hammond,92929,Boys,Senior,Mason,31540,Columbus,Kick-off,"212, 149, 164",3
3,Chuck Hammond,92929,Boys,Senior,Mason,31541,mason,Fairfield,"223, 211",2
4,Chuck Hammond,92929,Boys,Senior,Mason,31542,Western,Holiday Classic,"211, 233, 220",3
...,...,...,...,...,...,...,...,...,...,...
89528,Kayla Mineer,448590,Girls,Junior,Anderson,115866,Cherry Grove Lanes,West Clermont,"168, 135",2
89529,Kayla Mineer,448590,Girls,Junior,Anderson,115868,Cherry Grove Lanes,West Clermont,"124, 146",2
89530,Kayla Mineer,448590,Girls,Junior,Anderson,115874,Cherry Grove Lanes,Turpin,"112, 156",2
89531,Kayla Mineer,448590,Girls,Junior,Anderson,115877,Cherry Grove Lanes,Turpin,"125, 143",2


In [34]:
df_gamestats.to_csv('gamestats.csv')

In [144]:
# Remove whitespace in pins column
df_gamestats['pins'] = df_gamestats['pins'].replace(' ', '', regex=True)
# Split pins to individual games and rename new columns
df_games_split = pd.DataFrame(df_gamestats['pins'].str.split(',').tolist(), )
df_games_split.rename(columns={i: "game"+str(i+1) for i in range(len(df_games_split.columns))}, inplace=True)
# Convert to int and drop all-NaN rows
df_games_split = df_games_split.apply(pd.to_numeric, downcast='integer')
df_games_split.dropna(how='all', inplace=True)
# Calculate standard deviation, variance, median, mean, maximum, minimum, and range
df_games_split['std'] = np.std(df_games_split, axis=1).round(0)
df_games_split['var'] = np.var(df_games_split, axis=1).round(0)
df_games_split['med'] = np.nanmedian(df_games_split, axis=1).round(0)
df_games_split['avg'] = df_games_split.iloc[:,:-3].mean(axis=1).round(0)
df_games_split['max'] = df_games_split.iloc[:,:-4].max(axis=1)
df_games_split['min'] = df_games_split.iloc[:,:-5].min(axis=1)
df_games_split['rng'] = df_games_split['max'] - df_games_split['min']
# Re-convert to int
df_games_split = df_games_split.astype('Int16')
df_games_split

Unnamed: 0,game1,game2,game3,game4,game5,game6,std,var,med,avg,max,min,rng
0,213,211,,,,,1,9894,212,212,213,211,2
1,232,254,,,,,11,12042,243,243,254,232,22
2,212,149,164,,,,27,4648,164,175,212,149,63
3,223,211,,,,,6,9918,217,217,223,211,12
4,211,233,220,,,,9,8515,220,221,233,211,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
89528,168,135,,,,,16,4262,152,152,168,135,33
89529,124,146,,,,,11,3498,135,135,146,124,22
89530,112,156,,,,,22,3110,134,134,156,112,44
89531,125,143,,,,,9,3526,134,134,143,125,18


In [134]:
df_gamestats_split = pd.concat([df_gamestats, df_games_split], axis=1)
df_gamestats_split

Unnamed: 0,player_name,player_id,gender,grade_level,school,match_id,location,opponent,pins,num,...,game4,game5,game6,std,var,med,avg,max,min,rng
0,Chuck Hammond,92929,Boys,Senior,Mason,31538,Eastern,Middletown,213211,2,...,,,,1,9894,212,212,213,211,2
1,Chuck Hammond,92929,Boys,Senior,Mason,31539,Columbia,Hamilton,232254,2,...,,,,11,12042,243,243,254,232,22
2,Chuck Hammond,92929,Boys,Senior,Mason,31540,Columbus,Kick-off,212149164,3,...,,,,27,4648,164,175,212,149,63
3,Chuck Hammond,92929,Boys,Senior,Mason,31541,mason,Fairfield,223211,2,...,,,,6,9918,217,217,223,211,12
4,Chuck Hammond,92929,Boys,Senior,Mason,31542,Western,Holiday Classic,211233220,3,...,,,,9,8515,220,221,233,211,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89528,Kayla Mineer,448590,Girls,Junior,Anderson,115866,Cherry Grove Lanes,West Clermont,168135,2,...,,,,16,4262,152,152,168,135,33
89529,Kayla Mineer,448590,Girls,Junior,Anderson,115868,Cherry Grove Lanes,West Clermont,124146,2,...,,,,11,3498,135,135,146,124,22
89530,Kayla Mineer,448590,Girls,Junior,Anderson,115874,Cherry Grove Lanes,Turpin,112156,2,...,,,,22,3110,134,134,156,112,44
89531,Kayla Mineer,448590,Girls,Junior,Anderson,115877,Cherry Grove Lanes,Turpin,125143,2,...,,,,9,3526,134,134,143,125,18


In [136]:
df_gamestats_split.to_csv('gamestats_split.csv')