In [1]:
import pandas as pd
import numpy as np
import requests
import os
from bs4 import BeautifulSoup
import time
import random

In [2]:
#https://www.nfl.com/stats/team-stats/offense/passing/2020/reg/all

In [3]:
#side is the side of the ball the data comes from (offense, defense, special teams)
#category is the type of stat (passing, rushing, receiving, downs)
#year is the year of the nfl season
#all data is scraped from nfl.com (specifically their team stats pages)
#this function builds the url from side/category/year, downloads the html and writes it to a local file (the
#download is skipped when that file already exists)---then it reads the file and returns a BeautifulSoup object
def scrape_stats(side, category, year):
    """Return the nfl.com team-stats page for one side/category/season as a BeautifulSoup object.

    The html is cached on disk in a file named side + category + str(year), relative to the
    current working directory. The site is only requested when that file does not exist yet.

    side: url path segment for the side of the ball (e.g. "offense", "defense")
    category: url path segment for the stat category (e.g. "passing", "rushing")
    year: season year, used in both the url and the cache file name

    Raises requests.HTTPError (from raise_for_status) when the server answers with an error
    status; other requests exceptions, such as a timeout, propagate unchanged.
    """
    file = side + category + str(year)
    if not os.path.exists(file):
        url = f"https://www.nfl.com/stats/team-stats/{side}/{category}/{year}/reg/all"
        r = requests.get(url, timeout=30) #without a timeout a stalled connection would hang forever
        r.raise_for_status()
        #write to a temp file and rename it, so an interrupted write never leaves a truncated cache file
        #that later runs would happily parse
        tmp_file = file + ".tmp"
        with open(tmp_file, "w", encoding="utf-8") as f: #explicit encoding: the platform default may not be utf-8
            f.write(r.text)
        os.replace(tmp_file, file)

    with open(file, encoding="utf-8") as f:
        return BeautifulSoup(f.read(), "html.parser")

In [4]:
#same variables as above function
#this function calls scrape_stats to get a BeautifulSoup object and then uses the html to create a pandas df
def make_df(side, category, year):
    """Build a pandas DataFrame from the single stats table on the scraped page.

    Takes the same arguments as scrape_stats, which supplies the parsed page.
    Column names are the text of the table's <th> cells. Every <td> cell contributes the
    first whitespace-separated token of its text ("" when the cell has no text), so all
    scraped values are strings. A "Year" column holding `year` is added at the end.

    Raises AssertionError when the page does not contain exactly one table, and ValueError
    when the table has no header cells or the number of <td> cells is not a multiple of the
    number of headers.
    """
    page = scrape_stats(side, category, year)
    tables = page.find_all("table")
    #there is only one table per url---make sure site layout hasn't changed in the future
    assert len(tables) == 1, f"expected exactly 1 table for {side}/{category}/{year}, found {len(tables)}"
    tbl = tables[0]

    #the table headers correspond to the column names (text only, without the html around it)
    columns = [header.getText() for header in tbl.find_all("th")]
    if not columns:
        raise ValueError(f"no table headers found for {side}/{category}/{year}")

    data = tbl.find_all("td")
    width = len(columns)
    if len(data) % width != 0:
        #a partial last row would be silently padded by pandas and every row could be misaligned
        raise ValueError(
            f"{len(data)} cells do not split evenly into rows of {width} columns "
            f"for {side}/{category}/{year}"
        )

    contents = [] #this is a list of lists that contains contents for the future pandas df (contents for all teams)
    for i in range(0, len(data), width): #iterate over data in groups---each i is beginning of a new team
        row = [] #list contents for a specific team
        for cell in data[i:i+width]:
            tokens = cell.getText().split()
            #first token is just the number or team name; an empty cell becomes "" instead of an IndexError
            row.append(tokens[0] if tokens else "")
        contents.append(row)

    df = pd.DataFrame(contents, columns=columns)
    df["Year"] = year
    return df

In [5]:
#this function combines the dfs from each year to make a df with all the years
def concat_df(start_year, side, category, end_year=2020):
    """Combine the per-season frames from make_df into one DataFrame covering several seasons.

    start_year: first season to include (always fetched, even if it is >= end_year)
    side, category: passed straight through to make_df
    end_year: season to stop before (exclusive); defaults to 2020, the previously hard-coded value

    Returns a single DataFrame with a fresh 0..n-1 index.
    """
    frames = [make_df(side, category, start_year)]
    for year in range(start_year + 1, end_year):
        frames.append(make_df(side, category, year))
        time.sleep(.5) #be considerate about web scraping
    #concatenate once at the end---growing the frame inside the loop re-copies all earlier rows every time
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [6]:
#this function adds a year to the existing dataframe for a side and category
def add_year(df, side, category, year):
    """Return df with the rows for one more season (built by make_df) appended and the index renumbered."""
    year_df = make_df(side, category, year)
    #ignore_index renumbers the combined rows 0..n-1, discarding the old index values
    return pd.concat([df, year_df], axis=0, ignore_index=True)

In [7]:
#creates offensive passing data and shows how many seconds it took
started = time.time()
passing = concat_df(1971, "offense", "passing")
time.time() - started

61.29124402999878

In [8]:
#creates offensive rushing data and shows how many seconds it took
started = time.time()
rushing = concat_df(1971, "offense", "rushing")
time.time() - started

60.37006115913391

In [9]:
#creates offensive receiving data and shows how many seconds it took
started = time.time()
receiving = concat_df(1971, "offense", "receiving")
time.time() - started

58.67981815338135

In [39]:
#creates offensive downs data and shows how many seconds it took
started = time.time()
downs = concat_df(1971, "offense", "downs")
time.time() - started

61.02522039413452

In [10]:
#creates defensive passing data and shows how many seconds it took
started = time.time()
d_passing = concat_df(1971, "defense", "passing")
time.time() - started

28.111320972442627

In [11]:
#creates defensive rushing data and shows how many seconds it took
started = time.time()
d_rushing = concat_df(1971, "defense", "rushing")
time.time() - started

60.12420153617859

In [12]:
#creates defensive receiving data and shows how many seconds it took
started = time.time()
d_receiving = concat_df(1971, "defense", "receiving")
time.time() - started

59.299351930618286

In [28]:
#creates defensive fumble data and shows how many seconds it took
started = time.time()
fumbles = concat_df(1971, "defense", "fumbles")
time.time() - started

58.96878361701965

In [30]:
#creates defensive interception data and shows how many seconds it took
started = time.time()
interceptions = concat_df(1971, "defense", "interceptions")
time.time() - started

58.44851207733154

In [43]:
#creates defensive downs data and shows how many seconds it took
started = time.time()
d_downs = concat_df(1971, "defense", "downs")
time.time() - started

58.972487449645996

In [36]:
#joins the fumble and interception frames on team and season (default inner join: only
#Team/Year pairs present in both frames are kept)
forced_turnovers = fumbles.merge(interceptions, on=["Team", "Year"])
forced_turnovers

Unnamed: 0,Team,FF,FR,FR TD,Rec FUM,Rush FUM,Year,INT,INT TD,INT Yds,Lng
0,Redskins,0,0,0,0,0,1971,29,5,480,0
1,49ers,0,0,0,0,0,1971,14,0,186,0
2,Chargers,0,0,0,0,0,1971,22,2,317,0
3,Steelers,0,0,0,0,0,1971,17,1,246,0
4,Cardinals,0,0,0,0,0,1971,17,1,191,0
...,...,...,...,...,...,...,...,...,...,...,...
1433,Bears,12,9,0,4,5,2019,10,1,157,59
1434,Panthers,8,7,1,0,5,2019,14,0,141,37
1435,Bills,16,9,0,9,6,2019,14,0,59,49
1436,Ravens,15,12,4,6,5,2019,13,2,259,89T


In [42]:
#writes offensive csv files (a csv that already exists is left untouched)
#each entry pairs a file name with a getter for its frame---the getter is only called when the
#csv is missing, so a frame does not have to exist in the kernel if its csv is already on disk
offensive_exports = [
    ("offensive_passing_stats.csv", lambda: passing),
    ("offensive_rushing_stats.csv", lambda: rushing),
    ("offensive_receiving_stats.csv", lambda: receiving),
    ("offensive_downs.csv", lambda: downs),
]
for csv_name, get_frame in offensive_exports:
    if os.path.exists(csv_name):
        continue
    get_frame().to_csv(csv_name, index = False)

In [47]:
#writes defensive csv files (a csv that already exists is left untouched)
#each entry pairs a file name with a getter for its frame---the getter is only called when the
#csv is missing, so a frame does not have to exist in the kernel if its csv is already on disk
defensive_exports = [
    ("defensive_passing_stats.csv", lambda: d_passing),
    ("defensive_rushing_stats.csv", lambda: d_rushing),
    ("defensive_receiving_stats.csv", lambda: d_receiving),
    ("defensive_turnovers.csv", lambda: forced_turnovers),
    ("defensive_downs.csv", lambda: d_downs),
]
for csv_name, get_frame in defensive_exports:
    if os.path.exists(csv_name):
        continue
    get_frame().to_csv(csv_name, index = False)

In [14]:
#available offensive dfs are: passing, rushing, receiving, downs
#available defensive dfs are: d_passing, d_rushing, d_receiving, forced_turnovers, d_downs