# Project: Levels of Football

Kaleb Sailer

This notebook scrapes Iowa high school football team data from the Bound website (gobound.com) and saves it to a CSV file.

### Packages

In [3]:
import pandas as pd 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import random

## High School Data: Bound Website

In [5]:
## using 2023-2024 data so it does not change with the playoffs currently going on

# team name and record scraping
url = 'https://www.gobound.com/ia/ihsaa/football/2023-24/powerindex'

service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)

# One list per output column; position i in every list describes the same team.
team_names = []
wins = []
losses = []
ppg = []
ppg_a = []
ppg_diff = []

# try/finally guarantees the Chrome process is shut down even when the page
# fails to load or an expected cell is missing, so a failed run does not
# leave an orphaned browser behind.
try:
    browser.get(url)

    # waiting for page
    time.sleep(2)

    # Walk the table one row at a time instead of running a separate
    # whole-page query per column: every value appended below comes from the
    # same <tr>, so the six lists cannot drift out of alignment.
    # Only rows whose second cell holds a link (the team name) are considered.
    team_rows = browser.find_elements(By.XPATH, "//table//tr[td[2]/a]")

    for team_row in team_rows:
        # Read every cell first; a malformed row raises here, before any
        # list has been touched, so the lists never end up with unequal lengths.
        name_text = team_row.find_element(By.XPATH, "./td[2]/a").text
        wins_text = team_row.find_element(By.XPATH, "./td[5]").text
        losses_text = team_row.find_element(By.XPATH, "./td[6]").text
        ppg_text = team_row.find_element(By.XPATH, "./td[39]").text
        ppg_a_text = team_row.find_element(By.XPATH, "./td[40]").text
        # the site prefixes positive differentials with '+'; drop it so the
        # value can later be converted to a number
        ppg_diff_text = team_row.find_element(By.XPATH, "./td[41]").text.strip('+')

        team_names.append(name_text)
        wins.append(wins_text)
        losses.append(losses_text)
        ppg.append(ppg_text)
        ppg_a.append(ppg_a_text)
        ppg_diff.append(ppg_diff_text)
finally:
    browser.quit()

In [6]:
# Assemble the scraped power-index lists into a single frame, one row per team.
column_labels = ['team_name', 'W', 'L', 'ppg', 'ppg_a', 'ppg_diff']
column_values = [team_names, wins, losses, ppg, ppg_a, ppg_diff]

hs_team_stats_df = pd.DataFrame(dict(zip(column_labels, column_values)))

# Preview the first rows to sanity-check the scrape.
display(hs_team_stats_df.head())

Unnamed: 0,team_name,W,L,ppg,ppg_a,ppg_diff
0,Southeast Polk,13,0,39.5,17.5,22.0
1,Dowling Catholic,9,2,32.4,11.7,20.6
2,Ankeny,9,4,25.6,24.0,1.6
3,Waukee Northwest,7,3,34.9,17.6,17.3
4,Ankeny Centennial,7,5,33.8,14.2,19.7


In [7]:
# Leader-list metric groups to scrape. For each metric-group id, the inner
# mapping gives the stat name to store and the XPath (relative to a table row)
# of the cell holding that stat.
metric_columns = {
    'h20200131032829975bb9dfada0d8e49': {'pass_tds': './td[8]', 'pass_yds': './td[6]'},
    'h2020013103313188014d330346cea4b': {'rush_tds': './td[6]', 'rush_yds': './td[4]'},
    'h20200131080614042e9114503743541': {'off_tds': './td[4]'},
}
id_metrics = list(metric_columns)

service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)

# {team name: {stat name: raw cell text}}, filled in across all metric pages.
team_data = {}

# try/finally guarantees the Chrome process is shut down even if a page fails
# to load or a row is missing an expected cell.
try:
    for id_metric in id_metrics:
        url = f'https://www.gobound.com/ia/ihsaa/football/2023-24/leaderlist?competitor=team&range=season&idGroup=h202302090257205525ac5f791e53a41&idMetricGroup={id_metric}&page=1&block=total'
        browser.get(url)

        # give the page time to render its table
        time.sleep(5)

        rows = browser.find_elements(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[2]/div/div/div/div[2]/table/tbody/tr')
        print(f"Found {len(rows)} rows for metric {id_metric}")

        for row in rows:
            team_element = row.find_element(By.XPATH, './td[2]/div/div/h7/a')
            team_name = team_element.text
            print(f"Processing team: {team_name}")

            # Reuse the team's record if an earlier metric page already created it.
            team_stats = team_data.setdefault(team_name, {})
            for stat_name, cell_xpath in metric_columns[id_metric].items():
                team_stats[stat_name] = row.find_element(By.XPATH, cell_xpath).text

        # Randomised pause between page requests. Reading rows from an
        # already-loaded page sends nothing to the site, so the delay belongs
        # here (once per page), not once per row.
        time.sleep(random.uniform(1, 3))
finally:
    browser.quit()

Found 36 rows for metric h20200131032829975bb9dfada0d8e49
Processing team: Southeast Polk
Processing team: Iowa City Liberty
Processing team: Ankeny
Processing team: Sioux City East
Processing team: Cedar Rapids Kennedy
Processing team: Iowa City High
Processing team: Ankeny Centennial
Processing team: Valley
Processing team: Davenport North
Processing team: Des Moines East
Processing team: Dowling Catholic
Processing team: Waukee
Processing team: Prairie
Processing team: Johnston
Processing team: Iowa City West
Processing team: Des Moines Roosevelt
Processing team: Cedar Rapids Washington
Processing team: Dubuque Hempstead
Processing team: Council Bluffs Lincoln
Processing team: Davenport Central
Processing team: Dubuque Senior
Processing team: Linn-Mar
Processing team: Cedar Rapids Jefferson
Processing team: Waukee Northwest
Processing team: Ames
Processing team: Cedar Falls
Processing team: Davenport West
Processing team: Marshalltown
Processing team: Bettendorf
Processing team: Sio

In [8]:
# Turn the nested {team: {stat: value}} mapping into a frame indexed by team
# name, with one column per stat.
team_metric_df = pd.DataFrame.from_dict(data=team_data, orient='index')

# Show a single row as a quick check of the resulting columns.
team_metric_df.head(n=1)

Unnamed: 0,pass_tds,pass_yds,rush_tds,rush_yds,off_tds
Southeast Polk,29,2653,38,2175,67


### Merging the Two High School Data Frames

In [10]:
# Join the power-index stats with the leader-list metrics on team name.
# team_metric_df is keyed by team name in its index, so right_index=True is
# used as the join key on that side. The inner join keeps only teams that
# appear in both frames.
merged_stats = pd.merge(
    hs_team_stats_df,
    team_metric_df,
    left_on='team_name',
    right_index=True,
    how='inner',
)

# Final column order for the exported data set.
hs_stat_columns = ['team_name', 'W', 'L', 'ppg', 'ppg_a', 'ppg_diff', 'pass_yds', 'pass_tds', 'rush_yds', 'rush_tds', 'off_tds']

# reset_index gives a clean 0..n-1 row index after the join; .copy() makes the
# result an independent frame so later column assignments on it do not raise
# SettingWithCopyWarning.
hs_stats_df = merged_stats.reset_index(drop=True).loc[:, hs_stat_columns].copy()

In [11]:
# type conversions: the scraper stores every value as text, so cast the
# per-game averages to floats and the counting stats to integers.
float_cols = ['ppg', 'ppg_a', 'ppg_diff']
int_cols = ['W', 'L', 'pass_yds', 'pass_tds', 'rush_yds', 'rush_tds', 'off_tds']

hs_stats_df[float_cols] = hs_stats_df[float_cols].astype('float')
hs_stats_df[int_cols] = hs_stats_df[int_cols].astype('int')

In [12]:
# Write the merged, typed data set to disk without the row index.
hs_stats_df.to_csv(
    'hs_stats_raw.csv',
    index=False,
    encoding='utf-8',
)