In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def fetch_player_details(player_url, retries=3):
    """Fetch a player's profile page and pull out five facts.

    Args:
        player_url: Absolute URL, or a path that is appended to
            ``https://www.eliteprospects.com``.
        retries: Maximum number of request attempts.

    Returns:
        A 5-tuple ``(height, weight, position, date_of_birth, shoots)`` of
        strings. Any fact not found on the page is ``None``; if the page
        could not be fetched at all, every element is ``None``.
    """
    # Order here defines the order of the returned tuple.
    wanted_labels = ("Height", "Weight", "Position", "Date of Birth", "Shoots")

    if player_url.startswith(('http://', 'https://')):
        full_url = player_url
    else:
        full_url = f'https://www.eliteprospects.com{player_url}'

    for _ in range(retries):
        print(f"Fetching data from: {full_url}")
        try:
            player_page = requests.get(full_url, timeout=10)
        except requests.RequestException as e:
            print(f"Request failed: {e}. Retrying...")
            time.sleep(2)
            continue

        status = player_page.status_code
        if status == 200:
            details = dict.fromkeys(wanted_labels)
            player_soup = BeautifulSoup(player_page.content, "html.parser")
            facts_section = player_soup.find("section", id="player-facts")
            facts_list = facts_section.find_all("li") if facts_section else []
            for fact in facts_list:
                # The class suffix after "__" looks like a build hash, so match
                # on the stable prefix only (assumption: the prefix is stable).
                label = fact.find(
                    "span", class_=re.compile(r"^PlayerFacts_factLabel__")
                )
                if label is None:
                    continue
                label_text = label.text.strip()
                if label_text not in details:
                    continue

                # The value is whatever follows the label inside the <li>.
                value = fact.text.split(label_text)[-1].strip()
                if label_text == "Date of Birth":
                    # Prefer the linked date text; fall back to the plain text
                    # instead of crashing when the link is absent.
                    link = fact.find("a")
                    if link is not None:
                        value = link.text.strip()

                details[label_text] = value
                print(f"{label_text} found for {full_url}: {value}")
            return tuple(details[name] for name in wanted_labels)

        if status == 429 or status >= 500:
            # Transient server-side condition: back off and try again.
            print(f"Server error {status} at {full_url}. Retrying...")
            time.sleep(2)
            continue

        # Any other status (e.g. 404) will not improve on retry.
        print(f"Unexpected status {status} at {full_url}. Giving up.")
        break

    print(f"Failed to fetch data from: {player_url}")
    return None, None, None, None, None

def fetch_all_players(start_url):
    """Collect every player row from a league-season stats listing.

    Follows the "Next page" link from ``start_url`` until there is none, or
    until the link points at the goalie stats sorting.

    Args:
        start_url: Listing URL containing ``/league/<league>/stats`` and a
            ``YYYY-YYYY`` season.

    Returns:
        ``(titles, all_players)``. ``titles`` is the header text of the last
        stats table parsed (empty if none was found). Each entry of
        ``all_players`` is the row's cell texts followed by the player's
        href (or ``None``), the season and the league.

    Raises:
        ValueError: If the season or league cannot be read from ``start_url``.
    """
    year_match = re.search(r'\d{4}-\d{4}', start_url)
    league_match = re.search(r'/league/([^/]+)/stats', start_url)
    if year_match is None or league_match is None:
        raise ValueError(
            f"Cannot extract season and league from URL: {start_url}"
        )
    year = year_match.group(0)
    league = league_match.group(1)

    titles = []
    all_players = []
    page_url = start_url

    while page_url:
        print(f"Fetching page: {page_url}")
        try:
            # A timeout keeps one stalled connection from hanging the scrape.
            r = requests.get(url=page_url, timeout=10)
        except requests.RequestException as e:
            print(f"Request failed: {e}. Stopping pagination.")
            break
        if r.status_code != 200:
            print(f"Unexpected status {r.status_code} at {page_url}. Stopping pagination.")
            break

        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find("table", class_="table table-striped table-sortable player-stats highlight-stats season")
        if table is None:
            print(f"No stats table found at {page_url}. Stopping pagination.")
            break

        titles = [header.text for header in table.find_all("th")]

        # The first <tr> is skipped (the original treats it as the header row).
        for row in table.find_all("tr")[1:]:
            player_data = [td.text.strip() for td in row.find_all("td")]

            name_span = row.find("span", class_="txt-blue")
            link = name_span.find("a") if name_span is not None else None
            player_url = link.get("href") if link is not None else None

            all_players.append(player_data + [player_url, year, league])

        # Follow pagination, but never into the goalie stats listing.
        next_page_link = soup.find("a", string="Next page")
        next_page_url = next_page_link.get("href") if next_page_link else None
        if next_page_url and "sort-goalie-stats" not in next_page_url:
            page_url = next_page_url
        else:
            page_url = None

    return titles, all_players

def scrape_multiple_seasons_and_leagues(years, leagues):
    """Scrape player stats for every (year, league) pair and enrich them.

    Args:
        years: Iterable of season strings such as ``'2023-2024'``.
        leagues: Iterable of league slugs such as ``'nhl'``.

    Returns:
        A DataFrame with the stats table columns plus ``Player_URL``,
        ``Year``, ``League``, ``Height`` (cm digits), ``Weight`` (kg digits),
        ``Position``, ``Date of Birth`` and ``Shoots``. Rows without a player
        URL, or missing any of the five detail values, are dropped. If no
        stats table was scraped at all, an empty DataFrame with only the
        added columns is returned.
    """
    detail_columns = ['Height', 'Weight', 'Position', 'Date of Birth', 'Shoots']
    extra_columns = ['Player_URL', 'Year', 'League']

    titles = None
    all_season_data = []
    for year in years:
        for league in leagues:
            season_url = f'https://www.eliteprospects.com/league/{league}/stats/{year}'
            season_titles, season_data = fetch_all_players(season_url)
            if season_titles:
                # Copy so the helper's list is never mutated.
                titles = list(season_titles)
            all_season_data.extend(season_data)

    if titles is None:
        # Nothing was scraped (empty inputs or no table found anywhere).
        return pd.DataFrame(columns=extra_columns + detail_columns)

    df = pd.DataFrame(all_season_data, columns=titles + extra_columns)

    # Drop rows with a missing or blank Player_URL before any network calls;
    # .copy() makes the later column assignments safe on the filtered frame.
    has_url = df['Player_URL'].notna() & (df['Player_URL'].str.strip() != "")
    df = df[has_url].copy()

    # A player can appear in several seasons/leagues: fetch each URL once.
    details_by_url = {
        url: fetch_player_details(url) for url in df['Player_URL'].unique()
    }
    details = pd.DataFrame(
        [details_by_url[url] for url in df['Player_URL']],
        columns=detail_columns,
        index=df.index,
    )
    for column in detail_columns:
        df[column] = details[column]

    # Keep only the numeric cm / kg part of the height and weight strings.
    df['Height'] = df['Height'].str.extract(r'(\d+) cm')[0]
    df['Weight'] = df['Weight'].str.extract(r'(\d+) kg')[0]

    # Remove rows lacking any of the detail values.
    df = df.dropna(subset=detail_columns)

    return df

# Seasons and league slugs that the scrape should cover.
years = [
    '2023-2024',
    '2022-2023',
]
leagues = [
    'nhl',
    'ahl',
]  # Add more leagues as needed

# Run the scrape across every season/league combination.
df = scrape_multiple_seasons_and_leagues(years=years, leagues=leagues)

# Dump the complete table without pandas truncating rows or columns.
print(df.to_string())



Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=2
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=3
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=4
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=5
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=6
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=7
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=8
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=9
Fetching page: https://www.eliteprospects.com/league/nhl/stats/2023-2024?page=10
Fetching page: https://www.eliteprospects.com/league/ahl/stats/2023-2024
Fetching page: https://www.eliteprospects.com/league/ahl/stats/2023-2024?page=2
Fetching page: https://www.eliteprospects.com/league/