In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2023/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table
table = soup.find('table', id='kicking')

# Extract headers correctly from the second row of the table header section
headers_row = table.find_all('tr')[1]  # This should directly point to the row with the actual data headers
headers = [th.get_text() for th in headers_row.find_all('th')]

# Process over-headers to determine unique header labels
over_headers = soup.find('tr', class_='over_header').find_all('th')
category_headers = []
current_category = ""
col_index = 0

for oh in over_headers:
    text = oh.text.strip()
    colspan = int(oh.get('colspan', 1))
    for _ in range(colspan):
        if col_index < len(headers):
            header_text = f"{text} {headers[col_index]}" if text else headers[col_index]
            category_headers.append(header_text)
            col_index += 1

# Remove 'Rk' header manually if it is the first header
if category_headers and category_headers[0].startswith('Rk'):
    category_headers = category_headers[1:]

# Print processed headers for verification
print("Processed Headers:", category_headers)
print("Number of processed headers:", len(category_headers))

# Extract row data, skipping rows with mismatched or empty data
rows = []
for i, row in enumerate(table.find_all('tr')[2:], start=2):  # start=2 to match the visual row number on the page
    if i == 30 or i == 61:  # Skip rows 30 and 61
        continue
    cols = row.find_all('td')
    row_data = [col.get_text() for col in cols]
    if len(row_data) == len(category_headers):
        rows.append(row_data)
    else:
        print(f"Mismatch at row {i}: Expected {len(category_headers)}, got {len(row_data)}")

# Print the number of rows extracted
print("Number of rows extracted:", len(rows))

# Create DataFrame
kickers_2023 = pd.DataFrame(rows, columns=category_headers)

# Manually provided row data
joey_slye_data = [
    'Joey Slye', 'WAS', '27', 'K', '17', '0', '', '', '3', '2', '7', '7', '9', '7', '5', '3', '24', '19', '61', '79.2%', '35', '32', '91.4%', '72', '4520', '64', '88.9%', '62.8'
]

# Convert the list to a DataFrame row
joey_slye_row = pd.DataFrame([joey_slye_data], columns=category_headers)

# Insert the row into the DataFrame at the correct index
# We need to break the DataFrame into two parts and concatenate the new row in between
top_df = kickers_2023.iloc[:28]  # All rows up to but not including index 28
bottom_df = kickers_2023.iloc[28:]  # All rows from index 28 onwards

# Recreate the DataFrame with the new row inserted
kickers_2023 = pd.concat([top_df, joey_slye_row, bottom_df], ignore_index=True)

# Extended row data for Rigoberto Sanchez, adding empty strings for missing data
rigoberto_sanchez_data = [
    'Rigoberto Sanchez', 'IND', '29', 'P', '17', '0',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '',  # Assuming these should be empty for missing kicking stats
    '', '', '', '', '1', '62', '', '62.0'  # Only punting stats provided
]

# Convert the list to a DataFrame row
rigoberto_sanchez_row = pd.DataFrame([rigoberto_sanchez_data], columns=category_headers)

# Insert the row into the DataFrame at the correct index
# Assuming this row should be placed at index 59 (which would be index 58 in zero-based Python indexing)
top_df = kickers_2023.iloc[:58]  # All rows up to but not including index 58
bottom_df = kickers_2023.iloc[58:]  # All rows from index 58 onwards

# Recreate the DataFrame with the new row inserted
kickers_2023 = pd.concat([top_df, rigoberto_sanchez_row, bottom_df], ignore_index=True)

# Print the modified DataFrame
print(kickers_2023)




Processed Headers: ['Player', 'Tm', 'Age', 'Pos', 'Games G', 'Games GS', '0-19 FGA', '0-19 FGM', '20-29 FGA', '20-29 FGM', '30-39 FGA', '30-39 FGM', '40-49 FGA', '40-49 FGM', '50+ FGA', '50+ FGM', 'Scoring FGA', 'Scoring FGM', 'Scoring Lng', 'Scoring FG%', 'Scoring XPA', 'Scoring XPM', 'Scoring XP%', 'Kickoffs KO', 'Kickoffs KOYds', 'Kickoffs TB', 'Kickoffs TB%', 'Kickoffs KOAvg']
Number of processed headers: 28
Mismatch at row 31: Expected 28, got 0
Mismatch at row 62: Expected 28, got 0
Number of rows extracted: 58
               Player   Tm Age  Pos Games G Games GS 0-19 FGA 0-19 FGM  \
0         Jason Myers  SEA  32    K      17        0                     
1            Matt Gay  IND  29    K      17        0                     
2    Brandon Aubrey*+  DAL  28    K      17        0                     
3        Cairo Santos  CHI  32    K      17        0                     
4       Greg Zuerlein  NYJ  36    K      16        0                     
5      Justin Tucker*  BAL  34   

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2022/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table by ID or attributes (might need to adjust based on actual HTML structure)
table = soup.find('table', id='kicking')

# Extract headers from the table, assuming they're in the second row of the table header section
header_row = table.find('thead').find_all('tr')[-1]  # Get the last header row which usually has the detailed headers
headers = [th.get_text() for th in header_row.find_all('th')]

# Adjust headers to exclude 'Rk' if it's not needed or keep it based on your requirement
if headers[0] == 'Rk':
    headers = headers[1:]  # Exclude 'Rk' if it's the first header and not needed

# Extract row data
rows = []
for row in table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    if cols:  # Make sure the row is not empty (filtering out possible header rows inside tbody)
        row_data = [col.get_text() for col in cols]
        rows.append(row_data)

# Create DataFrame
kickers_2022 = pd.DataFrame(rows, columns=headers)

# Print the DataFrame to verify
print(kickers_2022)


             Player   Tm Age Pos   G GS FGA FGM FGA FGM  ... Lng    FG% XPA  \
0    Justin Tucker*  BAL  33   K  17  0          13  13  ...  58  86.0%  32   
1       Ryan Succop  TAM  36   K  17  0          11  10  ...  54  81.6%  25   
2      Jason Myers*  SEA  31   K  17  0           9   9  ...  56  91.9%  42   
3   Daniel Carlson+  LVR  27   K  17  0           9   9  ...  57  91.9%  36   
4         Nick Folk  NWE  38   K  17  0   1   1   9   9  ...  54  86.5%  35   
..              ...  ...  ..  ..  .. ..  ..  ..  ..  ..  ...  ..    ...  ..   
60     Jamie Gillan  NYG  25   P  17  0                  ...                  
61    Johnny Hekker  CAR  32   P  17  0                  ...                  
62       Sam Martin  BUF  32   P  16  0                  ...                  
63     Arryn Siposs  PHI  30   P  13  0                  ...                  
64       Tress Way*  WAS  32   P  17  0                  ...                  

   XPM    XP%  KO KOYds  TB    TB% KOAvg  
0   31  

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2021/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table by ID or attributes (might need to adjust based on actual HTML structure)
table = soup.find('table', id='kicking')

# Extract headers from the table, assuming they're in the second row of the table header section
header_row = table.find('thead').find_all('tr')[-1]  # Get the last header row which usually has the detailed headers
headers = [th.get_text() for th in header_row.find_all('th')]

# Adjust headers to exclude 'Rk' if it's not needed or keep it based on your requirement
if headers[0] == 'Rk':
    headers = headers[1:]  # Exclude 'Rk' if it's the first header and not needed

# Extract row data
rows = []
for row in table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    if cols:  # Make sure the row is not empty (filtering out possible header rows inside tbody)
        row_data = [col.get_text() for col in cols]
        rows.append(row_data)

# Create DataFrame
kickers_2021 = pd.DataFrame(rows, columns=headers)

# Print the DataFrame to verify
print(kickers_2021)


             Player   Tm Age Pos   G GS FGA FGM FGA FGM  ... Lng    FG% XPA  \
0    Daniel Carlson  LVR  26   K  17  0          11  10  ...  56  93.0%  33   
1     Chris Boswell  PIT  30   K  17  0           9   9  ...  56  90.0%  29   
2         Nick Folk  NWE  37   K  17  0           9   9  ...  53  92.3%  47   
3       Greg Joseph  MIN  27   K  17  0           8   8  ...  55  86.8%  40   
4       Matt Prater  ARI  37   K  17  0          11  11  ...  62  81.1%  49   
..              ...  ...  ..  ..  .. ..  ..  ..  ..  ..  ...  ..    ...  ..   
60        Tress Way  WAS  31   P  16  0                  ...                  
61     Ryan Winslow  3TM  27   P   4  0                  ...                  
62       Matt Haack  BUF  27   P  17  0                  ...                  
63  Michael Palardy  MIA  29   P  17  0                  ...                  
64     Arryn Siposs  PHI  29   P  17  0                  ...                  

   XPM    XP%  KO KOYds  TB    TB% KOAvg  
0   30  

In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2020/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table by ID or attributes (might need to adjust based on actual HTML structure)
table = soup.find('table', id='kicking')

# Extract headers from the table, assuming they're in the second row of the table header section
header_row = table.find('thead').find_all('tr')[-1]  # Get the last header row which usually has the detailed headers
headers = [th.get_text() for th in header_row.find_all('th')]

# Adjust headers to exclude 'Rk' if it's not needed or keep it based on your requirement
if headers[0] == 'Rk':
    headers = headers[1:]  # Exclude 'Rk' if it's the first header and not needed

# Extract row data
rows = []
for row in table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    if cols:  # Make sure the row is not empty (filtering out possible header rows inside tbody)
        row_data = [col.get_text() for col in cols]
        rows.append(row_data)

# Create DataFrame
kickers_2020 = pd.DataFrame(rows, columns=headers)

# Print the DataFrame to verify
print(kickers_2020)


                 Player   Tm Age Pos   G GS FGA FGM FGA FGM  ... Lng    FG%  \
0         Greg Zuerlein  DAL  33   K  16  0   1   1   5   5  ...  59  82.9%   
1        Jason Sanders+  MIA  25   K  16  0   1   1   8   8  ...  56  92.3%   
2         Younghoe Koo*  ATL  26   K  15  0          10  10  ...  54  94.9%   
3   Rodrigo Blankenship  IND  23   K  16  0          10  10  ...  53  86.5%   
4             Joey Slye  CAR  24   K  16  0          12  11  ...  56  80.6%   
..                  ...  ...  ..  ..  .. ..  ..  ..  ..  ..  ...  ..    ...   
66          Keelan Cole  JAX  27  WR  16  5                  ...              
67          Riley Dixon  NYG  27   P  16  0                  ...              
68           Matt Haack  MIA  26   P  16  0                  ...              
69           Brett Kern  TEN  34   P  13  0                  ...              
70      Thomas Morstead  NOR  34   P  16  0                  ...              

   XPA XPM     XP%  KO KOYds  TB    TB% KOAvg  
0  

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2019/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table by ID or attributes (might need to adjust based on actual HTML structure)
table = soup.find('table', id='kicking')

# Extract headers from the table, assuming they're in the second row of the table header section
header_row = table.find('thead').find_all('tr')[-1]  # Get the last header row which usually has the detailed headers
headers = [th.get_text() for th in header_row.find_all('th')]

# Adjust headers to exclude 'Rk' if it's not needed or keep it based on your requirement
if headers[0] == 'Rk':
    headers = headers[1:]  # Exclude 'Rk' if it's the first header and not needed

# Extract row data
rows = []
for row in table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    if cols:  # Make sure the row is not empty (filtering out possible header rows inside tbody)
        row_data = [col.get_text() for col in cols]
        rows.append(row_data)

# Create DataFrame
kickers_2019 = pd.DataFrame(rows, columns=headers)

# Print the DataFrame to verify
print(kickers_2019)


              Player   Tm Age Pos   G GS FGA FGM FGA FGM  ... Lng    FG% XPA  \
0    Harrison Butker  KAN  24   K  16  0           9   9  ...  56  89.5%  48   
1          Wil Lutz*  NOR  25   K  16  0   1   1   9   9  ...  58  88.9%  49   
2           Matt Gay  TAM  25   K  16  0           6   6  ...  58  77.1%  48   
3      Zane Gonzalez  ARI  24   K  16  0          12  12  ...  54  88.6%  35   
4    Brandon McManus  DEN  28   K  16  0           6   6  ...  53  85.3%  26   
..               ...  ...  ..  ..  .. ..  ..  ..  ..  ..  ...  ..    ...  ..   
58       Bryan Anger  HOU  31   P  14  0                  ...                  
59   Corey Bojorquez  BUF  23   P  16  0                  ...                  
60  Britton Colquitt  MIN  34   P  16  0                  ...                  
61       Riley Dixon  NYG  26   P  16  0                  ...                  
62   Thomas Morstead  NOR  33   P  16  0                  ...                  

   XPM    XP%  KO KOYds  TB    TB% KOAv

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = 'https://www.pro-football-reference.com/years/2018/kicking.htm'

# Fetch the page
response = requests.get(url)
html = response.text

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the table by ID or attributes (might need to adjust based on actual HTML structure)
table = soup.find('table', id='kicking')

# Extract headers from the table, assuming they're in the second row of the table header section
header_row = table.find('thead').find_all('tr')[-1]  # Get the last header row which usually has the detailed headers
headers = [th.get_text() for th in header_row.find_all('th')]

# Adjust headers to exclude 'Rk' if it's not needed or keep it based on your requirement
if headers[0] == 'Rk':
    headers = headers[1:]  # Exclude 'Rk' if it's the first header and not needed

# Extract row data
rows = []
for row in table.find('tbody').find_all('tr'):
    cols = row.find_all('td')
    if cols:  # Make sure the row is not empty (filtering out possible header rows inside tbody)
        row_data = [col.get_text() for col in cols]
        rows.append(row_data)

# Create DataFrame
kickers_2018 = pd.DataFrame(rows, columns=headers)

# Print the DataFrame to verify
print(kickers_2018)


                  Player   Tm Age Pos   G GS FGA FGM FGA FGM  ... Lng     FG%  \
0       Ka'imi Fairbairn  HOU  24   K  16  0   1   1   9   9  ...  54   88.1%   
1         Justin Tucker+  BAL  29   K  16  0          12  12  ...  56   89.7%   
2           Mason Crosby  GNB  34   K  16  0           4   4  ...  53   81.1%   
3           Jason Myers*  NYJ  27   K  16  0           4   4  ...  56   91.7%   
4            Brett Maher  DAL  29   K  16  0          10  10  ...  62   80.6%   
5           Robbie Gould  SFO  36   K  16  0           9   9  ...  53   97.1%   
6         Aldrick Rosas*  NYG  24   K  16  0           9   9  ...  57   97.0%   
7     Stephen Gostkowski  NWE  34   K  16  0          11  11  ...  52   84.4%   
8            Matt Prater  DET  34   K  16  0           8   8  ...  54   87.5%   
9          Greg Zuerlein  LAR  31   K  11  0           9   9  ...  56   87.1%   
10          Jake Elliott  PHI  23   K  16  0           7   7  ...  56   83.9%   
11              Wil Lutz  NO