base_url is https://stats.caha.timetoscore.com/display-stats?league=3
read in season_map.csv
generate a list of URLs based on each season. For example if season is 3 then the url will be https://stats.caha.timetoscore.com/display-stats?league=3&season=3
Save the URLs to a DataFrame with the corresponding season year.

Search each page for a list of Schedule URLs and Division Player Stats URLs. Ignore the "Norcal Schedule" link on each page. 

The DataFrame is reformatted to have the columns:
Season Name, Division (the schedule name with " Schedule" removed), Division Player Stats URL

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Read the season_map.csv file (needs 'Season Year' and 'Season' columns)
season_map_df = pd.read_csv('season_map.csv')

# Base URL
base_url = "https://stats.caha.timetoscore.com/display-stats?league=3"

# Seconds to wait for the server. Without a timeout, requests waits
# forever on a stalled connection and the whole scrape hangs.
request_timeout = 30

# Initialize a list to store the data (one dict per division)
data = []

# Iterate through the season map
for index, row in season_map_df.iterrows():
    season_year = row['Season Year']
    season_number = row['Season']
    season_url = f"{base_url}&season={season_number}"

    # Fetch the content of the season URL
    response = requests.get(season_url, timeout=request_timeout)
    if response.status_code != 200:
        # Report the skipped season instead of silently dropping it
        print(f"Skipping season {season_number}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Lists to store schedule names and division player stats URLs
    schedule_names = []
    division_player_stats_urls = []

    # Extract Schedule and Division Player Stats URLs
    for link in soup.find_all('a'):
        link_text = link.get_text()
        if 'Division Player Stats' in link_text:
            division_player_stats_urls.append(urljoin(base_url, link['href']))
        elif 'Schedule' in link_text and 'Norcal Schedule' not in link_text:
            schedule_names.append(link_text)

    # The two lists are paired purely by position, and zip() silently
    # truncates to the shorter one, so surface any count mismatch.
    if len(schedule_names) != len(division_player_stats_urls):
        print(
            f"Warning: season {season_number} has {len(schedule_names)} "
            f"schedule links but {len(division_player_stats_urls)} "
            "division player stats links; unmatched entries are dropped"
        )

    # Append data to the list
    data.extend([
        {
            'Season Name': season_year,
            'Division': schedule_name.replace(' Schedule', '').strip(),
            'Division Player Stats URL': division_player_stats_url
        }
        for schedule_name, division_player_stats_url in zip(schedule_names, division_player_stats_urls)
    ])

# Create a DataFrame from the collected data
result_df = pd.DataFrame(data)





In [2]:
# Persist the collected division list to disk, without the index column
division_list_path = 'division_list.csv'
result_df.to_csv(division_list_path, index=False)

print("DataFrame written to division_list.csv")

DataFrame written to division_list.csv


Scrape the first table on each Division Player Stats page into a new DataFrame.
Create a CSV from the tables. The first column should be Season Name, the second Division, followed by the data from the table.

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_division_stats(url, season_name, division, timeout=30):
    """Fetch a page and return its first HTML table as a DataFrame.

    Args:
        url: Address of the page to download.
        season_name: Value placed in the inserted 'Season Name' column.
        division: Value placed in the inserted 'Division' column.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the scrape.

    Returns:
        A DataFrame parsed from the first <table> on the page, with
        'Season Name' and 'Division' inserted as the first two columns.
        None if the response status is not 200 or the page has no table.
    """
    # Function-scope import keeps this definition self-contained.
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        return None

    # pandas deprecated passing literal HTML strings to read_html, so
    # wrap the markup in a file-like object.
    division_stats_df = pd.read_html(StringIO(str(table)), header=0)[0]
    division_stats_df.insert(0, 'Season Name', season_name)
    division_stats_df.insert(1, 'Division', division)
    return division_stats_df

# Load the division list from division_list.csv
division_list_df = pd.read_csv('division_list.csv')

# Scrape each listed division, skipping any for which the scraper
# returned None
data = []
for _, division_row in division_list_df.iterrows():
    scraped = scrape_division_stats(
        division_row['Division Player Stats URL'],
        division_row['Season Name'],
        division_row['Division'],
    )
    if scraped is None:
        continue
    data.append(scraped)

if not data:
    print("No valid data found. Skipping writing to norcal_player_stats.csv")
else:
    # Stack the per-division tables into one frame and drop exact
    # duplicate rows before saving
    result_df = pd.concat(data, ignore_index=True).drop_duplicates()

    # Save the combined table as norcal_player_stats.csv
    result_df.to_csv('norcal_player_stats.csv', index=False)

    print("DataFrame written to norcal_player_stats.csv")


DataFrame written to norcal_player_stats.csv


Scrape goalie stats


In [74]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_goalie_stats(url, season_name, division, timeout=30):
    """Fetch a page and return its second HTML table as a DataFrame.

    Args:
        url: Address of the page to download.
        season_name: Value placed in the inserted 'Season Name' column.
        division: Value placed in the inserted 'Division' column.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the scrape.

    Returns:
        A DataFrame parsed from the second <table> on the page, with
        'Season Name' and 'Division' inserted as the first two columns.
        None if the response status is not 200 or the page has fewer
        than two tables.
    """
    # Function-scope import keeps this definition self-contained.
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 2:
        return None

    # pandas deprecated passing literal HTML strings to read_html, so
    # wrap the markup in a file-like object.
    goalie_stats_df = pd.read_html(StringIO(str(tables[1])), header=0)[0]
    goalie_stats_df.insert(0, 'Season Name', season_name)
    goalie_stats_df.insert(1, 'Division', division)
    return goalie_stats_df

# Load the division list from division_list.csv
division_list_df = pd.read_csv('division_list.csv')

# Scrape each listed division, skipping any for which the scraper
# returned None
data = []
for _, division_row in division_list_df.iterrows():
    scraped = scrape_goalie_stats(
        division_row['Division Player Stats URL'],
        division_row['Season Name'],
        division_row['Division'],
    )
    if scraped is None:
        continue
    data.append(scraped)

if not data:
    print("No valid data found. Skipping writing to norcal_goalie_stats.csv")
else:
    # Stack the per-division tables into one frame and drop exact
    # duplicate rows before saving
    result_df = pd.concat(data, ignore_index=True).drop_duplicates()

    # Save the combined table as norcal_goalie_stats.csv
    result_df.to_csv('norcal_goalie_stats.csv', index=False)

    print("DataFrame written to norcal_goalie_stats.csv")


DataFrame written to norcal_goalie_stats.csv


### Clean Player Stats

In [7]:
import pandas as pd

# Read norcal_player_stats.csv
player_stats_df = pd.read_csv('norcal_player_stats.csv')

# Define the new column names. Assigning them raises ValueError if the
# file does not have exactly this many columns.
new_column_names = ['Season Name', 'Division', 'Name', '#', 'Team', 'GP', 'Goals', 'Ass.', 'Hat', 'Min', 'Pts/Game', 'Pts']

# Replace the column names
player_stats_df.columns = new_column_names

# Remove every row whose "Name" column holds the literal "Name"
# (presumably header rows carried over from the scraped tables).
# No row is exempted: the real header is already consumed by read_csv,
# so an exemption could only keep a stray header row as data.
player_stats_df = player_stats_df.loc[player_stats_df['Name'] != 'Name']

# Write the cleaned DataFrame back to norcal_player_stats.csv
player_stats_df.to_csv('norcal_player_stats.csv', index=False)


norcal_player_stats.csv cleaned and updated


### Clean Goalie Stats

In [84]:
import pandas as pd

# Read norcal_goalie_stats.csv
goalie_stats_df = pd.read_csv('norcal_goalie_stats.csv')

# Define the new column names. Assigning them raises ValueError if the
# file does not have exactly this many columns.
new_column_names = ['Season Name', 'Division', 'Name', 'Team', 'GP', 'Shots', 'GA', 'GAA', 'Save %', 'SO']

# Replace the column names
goalie_stats_df.columns = new_column_names

# Remove every row whose "Name" column holds the literal "Name"
# (presumably header rows carried over from the scraped tables).
# No row is exempted: the real header is already consumed by read_csv,
# so an exemption could only keep a stray header row as data.
goalie_stats_df = goalie_stats_df.loc[goalie_stats_df['Name'] != 'Name']

# Write the cleaned DataFrame back to norcal_goalie_stats.csv
goalie_stats_df.to_csv('norcal_goalie_stats.csv', index=False)

