In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

def scrape_page(url, season):
    """Download one stats page and return its player table as a DataFrame.

    Args:
        url: Address of the stats page to fetch.
        season: Season label written to the ``Year`` column of every row.

    Returns:
        A ``(df, has_next_page)`` tuple. ``df`` is ``None`` (and
        ``has_next_page`` is ``False``) when the request fails, the response
        status is not 200, or the stats table is not present in the page.
        Otherwise ``df`` holds one row per table row, plus a ``Player_URL``
        column (when the table has a ``Player`` column) and a ``Year``
        column; ``has_next_page`` tells whether a "Next page" link exists.
    """
    try:
        r = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None, False
    if r.status_code != 200:
        print(f"Failed to fetch data: HTTP {r.status_code}")
        return None, False

    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find(
        "table",
        class_="table table-striped table-sortable player-stats highlight-stats season",
    )
    if table is None:
        return None, False

    headers = [th.text.strip() for th in table.find_all("th")]

    # Collect all rows first and build the frame once; appending with
    # ``df.loc[len(df)]`` re-allocates on every row.
    records = []
    for row in table.find_all("tr")[1:]:
        cells = [td.text.strip() for td in row.find_all("td")]
        # Rows whose cell count differs from the header count (e.g. spacer
        # rows) cannot be aligned with the columns, so they are skipped
        # instead of aborting the whole page.
        if cells and len(cells) == len(headers):
            records.append(cells)
    df = pd.DataFrame(records, columns=headers)

    # Attach each player's profile link by matching on the displayed name.
    # A local name (``href``) is used so the ``url`` argument is not clobbered.
    if "Player" in df.columns:
        player_urls = {}
        for span in table.find_all("span", class_="txt-blue"):
            link = span.find("a")
            if link is not None:
                href = link.get("href")
                player_urls[link.text.strip()] = href
        # Names without a matching link get NaN, so the column always exists
        # and the schema stays the same from page to page.
        df["Player_URL"] = df["Player"].map(player_urls)

    df["Year"] = season

    # Pagination check. ``string=`` is the current name of the argument that
    # older Beautiful Soup releases called ``text=``.
    pagination = soup.find("div", class_="table-pagination")
    next_page_link = (
        pagination.find("a", string="Next page") if pagination is not None else None
    )
    has_next_page = next_page_link is not None

    return df, has_next_page

def scrape_seasons_early(base_url_template, start_year, end_year,
                         output_path="nhl_player_stats_1917_to_1991.csv"):
    """Scrape every page of every season in a year range into one DataFrame.

    Args:
        base_url_template: URL containing a ``{season}`` placeholder, filled
            with labels of the form ``"<year>-<year + 1>"``.
        start_year: First starting year to scrape (inclusive).
        end_year: Last starting year to scrape (inclusive).
        output_path: Where the combined table is written as CSV. Pass
            ``None`` to skip writing. Nothing is written when no data was
            collected.

    Returns:
        The concatenation of all scraped pages, or an empty DataFrame when
        no page produced data.
    """
    dfs = []
    first_request = True
    for year in range(start_year, end_year + 1):
        season = f"{year}-{year + 1}"
        base_url = base_url_template.format(season=season)
        print(f"Scraping season: {season}")
        page = 1
        while True:
            # Throttle every request after the very first one, so the first
            # page of a new season is delayed too, not only follow-up pages.
            if not first_request:
                time.sleep(random.uniform(3, 7))  # Randomized delay
            first_request = False

            url = f"{base_url}?page={page}"
            df, has_next_page = scrape_page(url, season)
            if df is None:
                print("No more data or repeating page.")
                break
            dfs.append(df)
            if not has_next_page:
                break
            page += 1

    if not dfs:
        return pd.DataFrame()

    # Combine all dataframes
    df_final = pd.concat(dfs, ignore_index=True)
    if output_path is not None:
        df_final.to_csv(output_path, index=False)
    return df_final


# Scrape the seasons labelled 1917-1918 through 1990-1991 and display the
# combined table.
base_url_template = "https://www.eliteprospects.com/league/nhl/stats/{season}"
df_early = scrape_seasons_early(
    base_url_template,
    start_year=1917,
    end_year=1990,
)
print(df_early)



  
