In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#URLs with corresponding years
urls = [
    ('https://www.pro-football-reference.com/draft/2023-combine.htm', 2023),
    ('https://www.pro-football-reference.com/draft/2022-combine.htm', 2022),
    ('https://www.pro-football-reference.com/draft/2021-combine.htm', 2021),
    ('https://www.pro-football-reference.com/draft/2020-combine.htm', 2020),
    ('https://www.pro-football-reference.com/draft/2019-combine.htm', 2019),
    ('https://www.pro-football-reference.com/draft/2018-combine.htm', 2018),
    ('https://www.pro-football-reference.com/draft/2017-combine.htm', 2017),
    ('https://www.pro-football-reference.com/draft/2016-combine.htm', 2016),
    ('https://www.pro-football-reference.com/draft/2015-combine.htm', 2015),
    ('https://www.pro-football-reference.com/draft/2014-combine.htm', 2014)
]

In [2]:

# Define a function to scrape and process stats data
def scrape_process_stats(url, year):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Find the table containing the stats data
    stats_table = soup.find('table', {'id': 'defense'})  # Update the id as needed
    
    # Extract column headers
    stats_headers = [th.text for th in stats_table.find('thead').find_all('th')]
    
    # Extract data rows
    stats_data = []
    for row in stats_table.find('tbody').find_all('tr'):
        row_data = [td.text for td in row.find_all('td')]
        stats_data.append(row_data)
    
    # Create DataFrame for stats data
    stats_df = pd.DataFrame(stats_data, columns=stats_headers)
    
    # Add a column for the year
    stats_df['Year'] = year
    
    return stats_df

# Initialize to store the dataframes
combine_dfs = []
stats_dfs = []

# Loop for each URL
for url, year in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Table containing the combine data
    combine_table = soup.find('table', {'id': 'combine'})
    
    # Column headers for combine data
    combine_headers = [th.text for th in combine_table.find('thead').find_all('th')]
    
    # Extract combine data rows
    combine_data = []
    for row in combine_table.find('tbody').find_all('tr'):
        row_data = [td.text for td in row.find_all('td')]
        # Include player name from the first column
        player_name = row.find('th').text
        row_data.insert(0, player_name)
        # Adjust the number of columns to match the number of headers
        if len(row_data) == len(combine_headers) - 1:
            row_data.insert(0, '')  # Insert an empty string for the missing column
        combine_data.append(row_data)
    
    # Create DataFrame for combine data
    combine_df = pd.DataFrame(combine_data, columns=combine_headers)
    
    # Check if the 'Drafted (tm/rnd/yr)' column exists before dropping it
    if 'Drafted (tm/rnd/yr)' in combine_df.columns:
        combine_df.drop(columns=['Drafted (tm/rnd/yr)'], inplace=True)
    
    # Add a column for the year
    combine_df['Year'] = year
    
    # Append combine DataFrame to the list
    combine_dfs.append(combine_df)
    
    # Scrape and process stats data
    stats_df = scrape_process_stats(url, year)
    
    # Append stats DataFrame to the list
    stats_dfs.append(stats_df)

# Concatenate all combine and stats dataframes into one
combine_data_df = pd.concat(combine_dfs, ignore_index=True)
stats_data_df = pd.concat(stats_dfs, ignore_index=True)

# Merge the dataframes based on player name
merged_data_df = pd.merge(combine_data_df, stats_data_df, on='Player', how='left')

# Display the merged dataframe
display(merged_data_df)


AttributeError: 'NoneType' object has no attribute 'find'