In [2]:
# Packages used by the scraper: pandas holds the result tables, requests
# downloads each standings page, and BeautifulSoup parses the returned HTML.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import partial  # NOTE(review): not referenced in the cells visible here -- confirm before removing
# Start-up notice printed before any scraping cell runs
print("Script is running, please be patient")

Script is running, please be patient


In [4]:
# Common prefix of every standings URL; the scraping loop appends "<year>/REG" to it
base_url = "https://www.nfl.com/standings/division/" 
# Combined result table (one row per team per year); starts empty and is filled by the scraping loop
teamRecorddf = pd.DataFrame()

In [6]:
# Scrape the NFL.com division standings for every regular season from 2000 to 2020.
#
# For each season this cell requests base_url + "<year>/REG", reads the column
# headers from the first 'd3-o-table' on the page, and collects one record per
# team row across every 'd3-o-table' (each conference has a separate table, but
# all share the same headings).
#
# Results left in the notebook namespace:
#   yeardf        -- the frame for the most recently scraped season
#   teamRecorddf  -- every team-season row; rebuilt from scratch on each run, so
#                    re-executing this cell does not duplicate rows
FIRST_SEASON = 2000
LAST_SEASON = 2020
TABLE_CLASS = 'd3-o-table'
TEAM_NAME_CLASS = 'd3-o-club-shortname'
# Without a timeout requests.get can block forever on a stalled connection
REQUEST_TIMEOUT_SECONDS = 30
# Header positions that get replaced: position 0 is a stand-in header that
# becomes "Team", and the repeated "Pct" headers are renamed so that every
# column title is unique.
HEADER_RENAMES = {0: 'Team', 4: 'Pct_total', 11: 'Pct_div', 13: 'Pct_conf'}


def _cell_text(cell):
    """Return the whitespace-trimmed text of a table cell.

    `Tag.string` is None whenever a tag has zero or several children, so
    `cell.string.strip()` raises AttributeError on such cells. `get_text`
    works for any number of children and gives the same result as
    `.string.strip()` when the cell holds a single string.
    """
    return cell.get_text(" ", strip=True)


def _parse_headers(table):
    """Return the column titles of a standings table with HEADER_RENAMES applied.

    Raises:
        ValueError: if the table has no header row or too few header cells
            for the positional renames.
    """
    head = table.find('thead')
    head_row = head.find('tr') if head is not None else None
    if head_row is None:
        raise ValueError("standings table has no header row")
    headers = [_cell_text(th) for th in head_row.find_all('th')]
    if len(headers) <= max(HEADER_RENAMES):
        raise ValueError(
            "expected more than " + str(max(HEADER_RENAMES))
            + " header cells, found " + str(len(headers))
        )
    for position, title in HEADER_RENAMES.items():
        headers[position] = title
    return headers


def _parse_team_row(row):
    """Return the values of one team row, or None when the row has no <td> cells.

    The first cell yields the team's short name; every remaining cell yields
    its trimmed text.

    Raises:
        ValueError: if the first cell has no team-name element.
    """
    cells = row.find_all('td')
    if not cells:
        return None
    name_tag = cells[0].find(class_=TEAM_NAME_CLASS)
    if name_tag is None or not name_tag.contents:
        raise ValueError("team cell has no '" + TEAM_NAME_CLASS + "' element")
    # Only the first child node is taken as the name; later child nodes are ignored
    team_name = name_tag.contents[0].strip()
    print("currently scraping: " + team_name)
    return [team_name] + [_cell_text(cell) for cell in cells[1:]]


year_frames = []
for year in range(FIRST_SEASON, LAST_SEASON + 1):
    print("Scraping year: " + str(year))
    #create unique URL for each year's results page
    url = base_url + str(year) + "/REG"
    print(url)
    #load page html, failing loudly on HTTP errors instead of parsing an error page
    page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()
    #Parse html to be readable - the built-in html parser is slower than lxml's
    #parser but does not require additional dependencies or installs
    soup = BeautifulSoup(page.content, 'html.parser')

    # Each conference has a separate table (but all have the same headings),
    # so the headers are read once from the first table
    tables = soup.find_all(class_=TABLE_CLASS)
    if not tables:
        raise ValueError("no standings table found at " + url)
    df_headers = _parse_headers(tables[0])

    # Collect plain row lists and build the frame once per season, rather than
    # concatenating a one-row DataFrame per team
    year_rows = []
    for conf in tables:
        table_body = conf.find('tbody')
        if table_body is None:
            continue
        for row in table_body.find_all('tr'):
            team_values = _parse_team_row(row)
            if team_values is not None:
                year_rows.append(team_values)

    yeardf = pd.DataFrame(year_rows, columns=df_headers)
    yeardf['year'] = year
    year_frames.append(yeardf)

teamRecorddf = pd.concat(year_frames, ignore_index=True)
    




Scraping year: 2000
https://www.nfl.com/standings/division/2000/REG
currently scraping: Cardinals
currently scraping: Cowboys
currently scraping: Washington
currently scraping: Eagles
currently scraping: Falcons
currently scraping: 49ers
currently scraping: Panthers
currently scraping: Rams
currently scraping: Saints
currently scraping: Bears
currently scraping: Lions
currently scraping: Buccaneers
currently scraping: Packers
currently scraping: Vikings
currently scraping: Chargers
currently scraping: Seahawks
currently scraping: Chiefs
currently scraping: Broncos
currently scraping: Raiders
currently scraping: Browns
currently scraping: Bengals
currently scraping: Jaguars
currently scraping: Steelers
currently scraping: Titans
currently scraping: Ravens
currently scraping: Patriots
currently scraping: Bills
currently scraping: Jets
currently scraping: Colts
currently scraping: Dolphins
Scraping year: 2001
https://www.nfl.com/standings/division/2001/REG
currently scraping: Cardinals
cu

In [8]:
# Inspect the frame left over from the scraping loop: yeardf is rebuilt for each
# year, so at this point it holds only the last season scraped
yeardf

Unnamed: 0,Team,W,L,T,Pct_total,PF,PA,Net Pts,Home,Road,Div,Pct_div,Conf,Pct_conf,Non-Conf,Strk,Last 5,year
0,Seahawks,12,4,0,0.75,459,371,88,7 - 1 - 0,5 - 3 - 0,4 - 2 - 0,0.667,9 - 3 - 0,0.75,3 - 1 - 0,4W,4 - 1 - 0,2020
1,Rams,10,6,0,0.625,372,296,76,6 - 2 - 0,4 - 4 - 0,3 - 3 - 0,0.5,9 - 3 - 0,0.75,1 - 3 - 0,1W,3 - 2 - 0,2020
2,Cardinals,8,8,0,0.5,410,367,43,4 - 4 - 0,4 - 4 - 0,2 - 4 - 0,0.333,6 - 6 - 0,0.5,2 - 2 - 0,2L,2 - 3 - 0,2020
3,49ers,6,10,0,0.375,376,390,-14,1 - 7 - 0,5 - 3 - 0,3 - 3 - 0,0.5,4 - 8 - 0,0.333,2 - 2 - 0,1L,1 - 4 - 0,2020
4,Saints,12,4,0,0.75,482,337,145,6 - 2 - 0,6 - 2 - 0,6 - 0 - 0,1.0,10 - 2 - 0,0.833,2 - 2 - 0,2W,3 - 2 - 0,2020
5,Buccaneers,11,5,0,0.688,492,355,137,5 - 3 - 0,6 - 2 - 0,4 - 2 - 0,0.667,8 - 4 - 0,0.667,3 - 1 - 0,4W,4 - 1 - 0,2020
6,Panthers,5,11,0,0.313,350,402,-52,2 - 6 - 0,3 - 5 - 0,1 - 5 - 0,0.167,4 - 8 - 0,0.333,1 - 3 - 0,1L,1 - 4 - 0,2020
7,Falcons,4,12,0,0.25,396,414,-18,2 - 6 - 0,2 - 6 - 0,1 - 5 - 0,0.167,2 - 10 - 0,0.167,2 - 2 - 0,5L,0 - 5 - 0,2020
8,Washington,7,9,0,0.438,335,329,6,3 - 5 - 0,4 - 4 - 0,4 - 2 - 0,0.667,5 - 7 - 0,0.417,2 - 2 - 0,1W,3 - 2 - 0,2020
9,Giants,6,10,0,0.375,280,357,-77,3 - 5 - 0,3 - 5 - 0,4 - 2 - 0,0.667,5 - 7 - 0,0.417,1 - 3 - 0,1W,2 - 3 - 0,2020


In [21]:
# Tidy the column titles in one chain: lower-case every name first, then give
# the "last 5" column a name without a space or a digit
teamRecorddf = (
    teamRecorddf
    .rename(columns=str.lower)
    .rename(columns={'last 5': 'last_five'})
)

# Write the combined team-year table to disk with a header row and no index column
teamRecorddf.to_csv(r'teamRecord.csv', header=True, index=False)