In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent

# Scrape ESPN's 2024 college-football passing leaders into `espn_df`.
#
# Each page is requested as "<base_url>/table/<start>". The stats widget is
# rendered as two side-by-side tables inside one wrapper <div>: the first
# <tbody> holds the fixed rank/name columns, the second holds the stat columns.
# Rows from both are concatenated position-by-position.
#
# NOTE(review): the recorded output of this cell shows the second request being
# flagged as a duplicate of the first, so only 50 rows were collected. The
# "/table/<start>" path may not actually paginate this page -- confirm against
# the live site before relying on this for the full leaderboard.

ua = UserAgent()
userAgent = ua.random  # one random User-Agent string, reused for every request
headers = {"user-agent": userAgent}

base_url = "https://www.espn.com/college-football/stats/player/_/view/offense/stat/passing/season/2024/seasontype/2"
TABLE_CLASS = "ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization"
PAGE_SIZE = 50        # increment applied to `start` after each page
REQUEST_TIMEOUT = 15  # seconds; without a timeout requests.get can block forever

start = 1
all_data_rows = []
table_headers = []
previous_names = None  # Name rows of the previous page, used to detect repeats


def _cell_text_rows(tbody):
    """Return one list of stripped cell texts per <tr> in ``tbody``."""
    return [
        [td.text.strip() for td in row.find_all("td")]
        for row in tbody.find_all("tr")
    ]


while True:
    url = f"{base_url}/table/{start}"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as "stop" instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep whatever was collected so far rather than losing it to a crash.
        print(f"Request for start={start} failed ({exc}). Stopping.")
        break

    soup = BeautifulSoup(response.text, "html.parser")

    espn_table = soup.find("div", class_=TABLE_CLASS)
    if not espn_table:
        print("No more tables found.")
        break

    if start == 1:
        # Header cells of both inner tables, in document order.
        table_headers = [header.text.strip() for header in espn_table.find_all("th")]

    tbodies = espn_table.find_all("tbody")
    if len(tbodies) < 2:
        print("Unexpected table structure. Breaking loop.")
        break

    name_rows = _cell_text_rows(tbodies[0])
    stat_rows = _cell_text_rows(tbodies[1])

    # Check for duplication with the previous page
    if name_rows == previous_names or not name_rows:
        print("Duplicate or empty page detected. Stopping.")
        break

    # zip() would silently truncate to the shorter list and could pair a name
    # with another player's stats, so refuse to combine mismatched tables.
    if len(name_rows) != len(stat_rows):
        print(
            f"Row count mismatch at start={start} "
            f"({len(name_rows)} names vs {len(stat_rows)} stat rows). Stopping."
        )
        break

    # Combine and store data
    all_data_rows.extend(name + stats for name, stats in zip(name_rows, stat_rows))

    print(f"Fetched {len(name_rows)} records from start={start}")
    previous_names = name_rows
    start += PAGE_SIZE

# Final DataFrame
espn_df = pd.DataFrame(all_data_rows, columns=table_headers)
print(f"\n Total records fetched: {len(espn_df)}")
espn_df.head()

Fetched 50 records from start=1
Duplicate or empty page detected. Stopping.

 Total records fetched: 50


Unnamed: 0,RK,Name,POS,CMP,ATT,CMP%,YDS,AVG,LNG,TD,INT,SACK,RTG
0,1,K. McCordSYR,QB,391,592,66.0,4779,8.1,67,34,12,27,148.8
1,2,C. WardMIA,QB,305,454,67.2,4313,9.5,77,39,7,22,172.2
2,3,J. DartMISS,QB,276,398,69.3,4279,10.8,75,29,6,28,180.7
3,4,S. SandersCOLO,QB,353,477,74.0,4134,8.7,69,37,10,42,168.2
4,5,G. NussmeierLSU,QB,337,525,64.2,4052,7.7,76,29,12,16,142.7


In [2]:
# Display the DataFrame assembled by the scraping cell.
# NOTE(review): this cell's execution count (2) is lower than the scraping
# cell's (5), and the names in the recorded output below differ in form from
# the scraping cell's output -- the output here is likely stale; re-run after
# the scraper on a fresh kernel to confirm.
espn_df

Unnamed: 0,RK,Name,POS,CMP,ATT,CMP%,YDS,AVG,LNG,TD,INT,SACK,RTG
0,1,Kyle McCordSYR,QB,391,592,66.0,4779,8.1,67,34,12,27,148.8
1,2,Cam WardMIA,QB,305,454,67.2,4313,9.5,77,39,7,22,172.2
2,3,Jaxson DartMISS,QB,276,398,69.3,4279,10.8,75,29,6,28,180.7
3,4,Shedeur SandersCOLO,QB,353,477,74.0,4134,8.7,69,37,10,42,168.2
4,5,Garrett NussmeierLSU,QB,337,525,64.2,4052,7.7,76,29,12,16,142.7
5,6,Will HowardOSU,QB,309,423,73.1,4010,9.5,75,35,10,14,175.3
6,7,Josh HooverTCU,QB,313,471,66.5,3949,8.4,84,27,11,16,151.1
7,8,Dillon GabrielORE,QB,326,447,72.9,3857,8.6,69,30,6,21,164.9
8,9,Chandler MorrisUNT,QB,322,510,63.1,3774,7.4,96,31,12,12,140.6
9,10,Cade KlubnikCLEM,QB,308,486,63.4,3639,7.5,76,36,6,23,148.2


In [3]:
# Split the fused "Name" cell (player name immediately followed by the team
# abbreviation, e.g. "Kyle McCordSYR") into PlayerName and TeamCode.
#
# Pattern: a lazily matched name, then a trailing run that starts and ends with
# a capital letter and may contain "&" or "-" in between. Allowing that
# punctuation lets abbreviations of the form "TA&M" or "M-OH" be captured;
# a letters-only pattern leaves such rows as NaN in both new columns.
# NOTE(review): a name that itself ends in capitals directly before the team
# code (e.g. a "III" suffix) is still ambiguous with this approach.
NAME_TEAM_PATTERN = r"^(.*?)([A-Z][A-Z&-]*[A-Z])$"

# Guard on the presence of "Name" so the cell can be re-run safely: after the
# first run "Name" is no longer a column and extracting again would raise.
if "Name" in espn_df.columns:
    name_parts = espn_df["Name"].str.extract(NAME_TEAM_PATTERN)
    # assign() returns a new frame, so the frame displayed by earlier cells is
    # not mutated behind the reader's back.
    espn_df = espn_df.assign(
        PlayerName=name_parts[0].str.strip(),
        TeamCode=name_parts[1],
    )

# Optional: Reorder columns for clarity
cols = ['RK', 'PlayerName', 'TeamCode', 'POS', 'CMP', 'ATT', 'CMP%', 'YDS', 'AVG', 'LNG', 'TD', 'INT', 'SACK', 'RTG']
espn_df = espn_df[cols]

In [4]:
# Preview the first rows of espn_df to check the PlayerName / TeamCode split.
espn_df.head()

Unnamed: 0,RK,PlayerName,TeamCode,POS,CMP,ATT,CMP%,YDS,AVG,LNG,TD,INT,SACK,RTG
0,1,Kyle McCord,SYR,QB,391,592,66.0,4779,8.1,67,34,12,27,148.8
1,2,Cam Ward,MIA,QB,305,454,67.2,4313,9.5,77,39,7,22,172.2
2,3,Jaxson Dart,MISS,QB,276,398,69.3,4279,10.8,75,29,6,28,180.7
3,4,Shedeur Sanders,COLO,QB,353,477,74.0,4134,8.7,69,37,10,42,168.2
4,5,Garrett Nussmeier,LSU,QB,337,525,64.2,4052,7.7,76,29,12,16,142.7
