In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [12]:
# Web Scraping
# Download the 2024 passing table from Pro Football Reference and store the
# raw rows in nfl_passing_stats_2024.csv for the cleaning step.
url = "https://www.pro-football-reference.com/years/2024/passing.htm"
# An explicit timeout keeps the cell from hanging forever if the server
# never responds (requests applies no timeout by default).
response = requests.get(url, timeout=30)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the table
    table = soup.find('table', {'id': 'passing'})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"No table with id 'passing' was found at {url}")

    # Collect the <tr> elements once; the first is used as the header row.
    table_rows = table.find_all('tr')
    if not table_rows:
        raise ValueError(f"The 'passing' table at {url} contains no rows")

    # Extract headers
    headers = [th.text for th in table_rows[0].find_all('th')]
    # Drop the first header cell so the header count lines up with the
    # <td>-only data rows collected below.
    # NOTE(review): presumably the leading rank column is rendered as <th>
    # in body rows -- confirm against the page markup.
    headers = headers[1:]

    # Extract rows; a <tr> without any <td> cells yields an empty list
    # and is skipped.
    rows = []
    for tr in table_rows[1:]:
        cols = [td.text for td in tr.find_all('td')]
        if cols:
            rows.append(cols)

    # Create a DataFrame and save as CSV
    df = pd.DataFrame(rows, columns=headers)
    df.to_csv('nfl_passing_stats_2024.csv', index=False)
    print("Data successfully saved to nfl_passing_stats_2024.csv")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Data successfully saved to nfl_passing_stats_2024.csv


In [13]:
# Data Cleaning
# Load the raw scraped CSV, drop the columns that are not needed, give the
# remaining stat columns descriptive names, and write the result to a new CSV.
file_path = 'nfl_passing_stats_2024.csv'
df = pd.read_csv(file_path)

# Inspect the column names
print("Columns in the dataset:")
print(df.columns)

# Remove unnecessary columns.
# The raw file has two columns headed 'Yds'; read_csv renames the second one
# (the one that follows 'Sk') to 'Yds.1'. That duplicate is the one to drop.
# Dropping plain 'Yds' would discard the first yardage column and make the
# 'Yds' -> 'Yards' rename below a silent no-op.
columns_to_remove = ['Pos', 'G', 'TD%', 'Int%', '1D', 'Succ%', 'Lng', 'Yds.1', 'Sk%', 'AY/A', 'NY/A', 'ANY/A']

# Rename columns as needed
columns_to_rename = {
    'GS': 'Games Started',
    'QBrec': 'Record',
    'Cmp': 'Completions',
    'Cmp%': 'Completion Percentage',
    'Att': 'Attempts',
    'Yds': 'Yards',
    'TD': 'Touchdowns Thrown',
    # NOTE(review): 'Interceeptions' is misspelled; the label is kept as-is
    # because later cells may reference this exact column name.
    'Int': 'Interceeptions Thrown',
    'Y/A': 'Yards per Pass Attempt',
    'Y/C': 'Yards per Completion',
    'Y/G': 'Yards per Game',
    'Rate': 'Rating',
    'QBR': 'QB Rating',
    'Sk': 'Sacks Taken',
    '4QC': '4th Quarter Comebacks',
    'GWD': 'Game Winning Drives'
}

# drop() raises KeyError if any listed column is absent, which surfaces
# upstream layout changes instead of silently ignoring them.
df_cleaned = df.drop(columns=columns_to_remove).rename(columns=columns_to_rename)

# Save the cleaned data to a new CSV
cleaned_file_path = 'nfl_passing_stats_2024_cleaned.csv'
df_cleaned.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")

Columns in the dataset:
Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att',
       'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Succ%', 'Lng', 'Y/A',
       'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk', 'Yds.1', 'Sk%', 'NY/A',
       'ANY/A', '4QC', 'GWD', 'Awards'],
      dtype='object')
Cleaned data saved to nfl_passing_stats_2024_cleaned.csv
