In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# (url, year) pairs for the Pro Football Reference combine pages,
# newest first: 2023 down to 2014 inclusive.
urls = [
    (f'https://www.pro-football-reference.com/draft/{combine_year}-combine.htm', combine_year)
    for combine_year in range(2023, 2013, -1)
]


In [2]:
# Initialize a list to collect one DataFrame per combine year; they are
# concatenated into `ten_yr` after the loop.
dfs = []

# Fetch and parse the combine table for each (url, year) pair.
for url, year in urls:
    # The timeout keeps a stalled connection from hanging the cell, and
    # raise_for_status() surfaces HTTP errors here instead of as a confusing
    # "NoneType has no attribute ..." failure while parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Table containing the combine data.
    table = soup.find('table', {'id': 'combine'})
    if table is None:
        raise ValueError(f"No table with id 'combine' found at {url}")

    # Column headers: the text of every <th> in the table head.
    headers = [th.text for th in table.find('thead').find_all('th')]

    # Extract data rows.
    data = []
    for row in table.find('tbody').find_all('tr'):
        name_cell = row.find('th')
        value_cells = row.find_all('td')
        # Skip rows that carry no player data: a row without a <th> has no
        # player name, and a row without any <td> (presumably a header row
        # repeated inside the body -- verify against the live page) would
        # otherwise become a mostly-empty junk record.
        if name_cell is None or not value_cells:
            continue

        # Player name comes from the row's <th>; the remaining values from its <td>s.
        player_name = name_cell.text
        row_data = [player_name] + [td.text for td in value_cells]

        # Adjust the number of columns to match the number of headers.
        # NOTE(review): padding at the front shifts the player name into the
        # second column; behaviour kept as-is -- confirm this is intended.
        if len(row_data) == len(headers) - 1:
            row_data.insert(0, '')  # Insert an empty string for the missing column
        data.append(row_data)

    # Create DataFrame.
    df = pd.DataFrame(data, columns=headers)

    # Drop the draft-result column when present; errors='ignore' makes this a
    # no-op for tables that do not have it.
    df = df.drop(columns=['Drafted (tm/rnd/yr)'], errors='ignore')

    # Add a column for the year.
    df['Year'] = year
    dfs.append(df)


# Single frame holding every year's rows, with a fresh 0..n-1 index.
ten_yr = pd.concat(dfs, ignore_index=True)
# display(ten_yr)

In [3]:
positions = ['QB', 'RB', 'WR', 'TE', 'OT', 'G', 'C', 'EDGE', 'DT', 'DE', 'LB', 'CB', 'DB', 'K', 'P']

# Raw 'Pos' labels that are pooled under a single position group.
# Groups not listed here match only their own label.
position_aliases = {
    'G': ['G', 'OG', 'OL'],
    'DB': ['SAF', 'DB', 'S'],
    'LB': ['LB', 'ILB', 'OLB'],
    'RB': ['RB', 'FB'],
    'C': ['C', 'LS'],
}

# Map each position group to its own DataFrame of matching rows.
filtered_dfs = {}
for position in positions:
    pos_labels = position_aliases.get(position, [position])
    # Every group gets an explicit .copy() (previously RB and C did not), so
    # all per-position frames behave the same when modified later and none of
    # them triggers SettingWithCopyWarning.
    filtered_dfs[position] = ten_yr[ten_yr['Pos'].isin(pos_labels)].copy()

# Access filtered DataFrames by position
qb_df = filtered_dfs['QB']
rb_df = filtered_dfs['RB']
wr_df = filtered_dfs['WR']
te_df = filtered_dfs['TE']
ot_df = filtered_dfs['OT']
g_df = filtered_dfs['G']
c_df = filtered_dfs['C']
edge_df = filtered_dfs['EDGE']
dt_df = filtered_dfs['DT']
de_df = filtered_dfs['DE']
lb_df = filtered_dfs['LB']
cb_df = filtered_dfs['CB']
db_df = filtered_dfs['DB']
k_df = filtered_dfs['K']
p_df = filtered_dfs['P']

# display(qb_df)

## Keeping this code because it's useful when debugging. File being run elsewhere

In [4]:
# counts = {}

# # Iterate over each position and count the number of rows in the corresponding DataFrame
# total_count = 0
# for position in positions:
#     count = filtered_dfs[position].shape[0]
#     counts[position] = count
#     total_count += count

# # Display the counts
# for position, count in counts.items():
#     print(f"Number of players in {position}: {count}")

# # Display the total count
# print(f"Total number of players: {total_count}")