In [1]:
def createFantDB(startYear,endYear,function):
    """Build one DataFrame by running a per-season function over a range of years.

    Parameters
    ----------
    startYear : int
        First season to collect (inclusive).
    endYear : int
        Last season to collect (inclusive).
    function : callable
        Called once per season as ``function(year)``; must return a
        pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        The per-season frames stacked in chronological order. Each frame
        keeps its own row index (rows are not re-numbered). An empty
        DataFrame is returned when ``startYear`` is greater than ``endYear``.
    """
    # Import dependencies
    import pandas as pd
    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop copies all
    # previously collected rows on every iteration.
    frames = [function(year) for year in range(startYear, endYear + 1)]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()
    return pd.concat(frames)

In [2]:
def _fetch_rendered_html(url):
    """Return the HTML of ``url`` as rendered by a headless Chrome session."""
    # Dependencies
    from splinter import Browser
    from webdriver_manager.chrome import ChromeDriverManager
    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        browser.visit(url)
        return browser.html
    finally:
        # Always quit the browser, even when the visit fails, so no Chrome
        # process is left running
        browser.quit()


def _clean_team_stats(df, year):
    """Filter, rename and normalise the merged team tables for one season."""
    # Dependencies
    import pandas as pd
    # Full team name -> abbreviation. Names used in earlier seasons are
    # included alongside the current ones. The keys double as the list of
    # rows to keep.
    abbr = {'Dallas Cowboys':'DAL','Tampa Bay Buccaneers':'TAM','Buffalo Bills':'BUF','Kansas City Chiefs':'KAN',
            'Los Angeles Chargers':'LAC','New England Patriots':'NWE','Cincinnati Bengals':'CIN',
            'Los Angeles Rams':'LAR','Indianapolis Colts':'IND','Green Bay Packers':'GNB','Arizona Cardinals':'ARI',
            'Philadelphia Eagles':'PHI','San Francisco 49ers':'SFO','Minnesota Vikings':'MIN',
            'Tennessee Titans':'TEN','Seattle Seahawks':'SEA','Baltimore Ravens':'BAL','Las Vegas Raiders':'LVR',
            'New Orleans Saints':'NOR','Cleveland Browns':'CLE','Pittsburgh Steelers':'PIT','Miami Dolphins':'MIA',
            'Denver Broncos':'DEN','Washington Football Team':'WAS','Detroit Lions':'DET','Atlanta Falcons':'ATL',
            'Chicago Bears':'CHI','New York Jets':'NYJ','Carolina Panthers':'CAR','Houston Texans':'HOU',
            'New York Giants':'NYG','Jacksonville Jaguars':'JAX','San Diego Chargers':'SDG','St. Louis Rams':'STL',
            'Oakland Raiders':'OAK','Washington Redskins':'WAS','Washington Commanders':'WAS'}
    # Post-merge column labels to keep, in output order. The _x/_y suffixes
    # are the ones pandas adds when merged frames share a column name.
    # Selecting explicitly also discards every other column (e.g. the ranks).
    keep = ['Tm','Abbr','G_x','PF','Yds_x','Ply','Y/P','TO','FL','1stD','Cmp_x',
            'Att_x','Cmp%','Yds.1_x', 'TD_x', 'TD%','Int_x', 'Int%','NY/A_x', '1stD.1',
            'Lng_x', 'Y/A_y', 'AY/A', 'Y/C', 'Y/G_x', 'Rate', 'Sk',
            'Yds.1_y', 'Sk%', 'ANY/A', '4QC', 'GWD', 'EXP_y',
            'Att.1','Yds.2', 'TD.1', 'Lng_y','Y/A_x','Y/G_y','1stD.2','Fmb','EXP',
            'Pen', 'Yds.3', '1stPy', 'Sc%','TO%', 'EXP_x']
    # Merged label -> unambiguous stat name
    merged_names = {'G_x':'G','Yds_x':'TotYds','Cmp_x':'Cmp','Att_x':'PasAtt','Yds.1_x':'PassYds',
                    'TD_x':'PassTD','Int_x':'Int','NY/A_x':'NY/A','1stD.1':'Pass1stD','Lng_x':'PassLng',
                    'Y/A_y':'Y/PassA','AY/A':'AY/PassA','Y/G_x':'PassY/G','Yds.1_y':'SackYds',
                    'ANY/A':'ANY/PassA','EXP_y':'EXPPass','Att.1':'RushAtt','Yds.2':'RushYds',
                    'TD.1':'RushTDs','Lng_y':'RushLng','Y/A_x':'Y/RushA','Y/G_y':'RushY/G',
                    '1stD.2':'Rush1stD','EXP':'EXPRush','Yds.3':'PenYds','EXP_x':'EXPTot'}
    # Stats that are copied under a new name as-is instead of being divided
    # by games played
    rate_names = {'Y/P':'Yds/Play','Cmp%':'Comp%','TD%':'PassTD%','Int%':'INT%','NY/A':'NY/PassA',
                  'PassLng':'PassLong','Y/PassA':'Yds/PassA','AY/PassA':'AYds/PassA','Y/C':'Y/Comp',
                  'Rate':'PassRate','Sk%':'Sack%','ANY/PassA':'ANY/PA','4QC':'4QComeB','GWD':'GWinDri',
                  'RushLng':'RLong','Y/RushA':'Yds/RA','Sc%':'Score%','TO%':'TurnOv%'}
    # Scraped per-game columns that are not carried over: PassYds/G and
    # RushYds/G are recomputed from the season totals below
    discarded = {'PassY/G', 'RushY/G'}

    # Clean dataframe including only rows with actual teams + add abbreviations.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise SettingWithCopyWarning.
    df = df[df['Tm'].isin(list(abbr))].copy()
    df['Abbr'] = df['Tm'].map(abbr)
    # Reorganize columns, dropping unnecessary ones
    df = df[keep]
    # Drop duplicates
    df = df.loc[:, ~df.columns.duplicated()]
    # Rename some of the columns
    df = df.rename(columns=merged_names)

    # Change dtypes of columns to numeric (everything after Tm and Abbr)
    for col in df.columns[2:]:
        if '%' in col:
            # Percentages become fractions. Going through str first lets this
            # work whether the values arrived as text with a trailing '%' or
            # were already parsed as numbers.
            stripped = df[col].astype(str).str.rstrip('%')
            df[col] = pd.to_numeric(stripped, errors='coerce') / 100.0
        else:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Replace every raw stat with either its renamed copy or a per-game value
    stat_columns = [col for col in df.columns if col not in ('Tm', 'Abbr', 'G')]
    for column in stat_columns:
        if column in discarded:
            continue
        if column in rate_names:
            df[rate_names[column]] = df[column]
        else:
            df[column + '/G'] = (df[column] / df['G']).round(decimals=3)
    df = df.drop(columns=stat_columns)

    # Add year column and fill null values with 0
    df['Year'] = year
    df = df.fillna(0)
    # Create Team ID column (abbreviation followed by the year, e.g. NWE2012)
    df['TmID'] = df['Abbr'] + df['Year'].astype(str)
    return df


def teamScrape(year):
    """Scrape one season of team statistics from pro-football-reference.com.

    Loads the season page in a headless browser, reads the ``team_stats``,
    ``passing`` and ``rushing`` tables, merges them on the team name and
    returns one cleaned row per team.

    Parameters
    ----------
    year : int
        Season to scrape; inserted into the page URL and stored in the
        ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns are ``Tm``, ``Abbr``, ``G``, the renamed rate stats and the
        per-game stats (suffix ``/G``, rounded to 3 decimals), followed by
        ``Year`` and ``TmID``. Percentage columns are expressed as fractions
        and missing values are filled with 0.

    Raises
    ------
    ValueError
        If one of the three expected tables is not present in the page.
    """
    # Dependencies
    from functools import reduce
    from io import StringIO
    from bs4 import BeautifulSoup as soup
    import pandas as pd
    # Visit the site and parse the HTML
    url = f'https://www.pro-football-reference.com/years/{year}/#all_team_stats'
    html_soup = soup(_fetch_rendered_html(url), 'html.parser')
    # Find the tables and create dataframes. The second value is the header
    # row passed to read_html for that table.
    data_frames = []
    for table_id, header_row in (('team_stats', 1), ('passing', 0), ('rushing', 0)):
        table = html_soup.find('table', {"id" : table_id})
        if table is None:
            raise ValueError(f"Table '{table_id}' was not found at {url}")
        # StringIO: passing literal HTML strings to read_html is deprecated
        data_frames.append(pd.read_html(StringIO(str(table)), header=header_row)[0])
    # Merge dataframes
    df = reduce(lambda left, right: pd.merge(left, right, on=['Tm'],
                                             how='outer'), data_frames)
    return _clean_team_stats(df, year)

In [3]:
# Scrape every season from 2012 through 2021 (both ends included) into one frame
teams_db = createFantDB(startYear=2012, endYear=2021, function=teamScrape)













In [4]:
# Display the combined multi-season table (the notebook truncates the rows and columns shown)
teams_db

Unnamed: 0,Tm,Abbr,G,PF/G,TotYds/G,Ply/G,Yds/Play,TO/G,FL/G,1stD/G,...,Fmb/G,EXPRush/G,Pen/G,PenYds/G,1stPy/G,Score%,TurnOv%,EXPTot/G,Year,TmID
0,New England Patriots,NWE,16,34.812,427.875,74.438,5.7,1.000,0.438,27.750,...,0.875,-4.122,6.062,52.500,2.312,0.481,0.081,7.436,2012,NWE2012
1,Denver Broncos,DEN,16,30.062,397.875,68.125,5.8,1.562,0.875,23.750,...,1.375,-6.850,6.250,50.312,2.250,0.399,0.122,3.392,2012,DEN2012
2,New Orleans Saints,NOR,16,28.812,410.875,66.688,6.2,1.500,0.312,22.000,...,0.812,-1.333,6.500,56.938,1.312,0.374,0.121,4.646,2012,NOR2012
3,Washington Redskins,WAS,16,27.250,383.188,62.125,6.2,0.875,0.375,21.312,...,1.625,-2.558,7.250,61.562,1.625,0.393,0.077,-1.031,2012,WAS2012
4,Green Bay Packers,GNB,16,27.062,359.438,65.125,5.5,1.000,0.500,21.312,...,1.000,-3.974,6.438,57.688,2.688,0.376,0.081,1.611,2012,GNB2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,New York Jets,NYJ,17,18.235,306.353,60.941,5.0,1.588,0.412,18.235,...,0.824,0.919,6.059,50.588,1.647,0.294,0.144,-1.924,2021,NYJ2021
30,Carolina Panthers,CAR,17,17.882,298.882,65.059,4.6,1.706,0.471,18.941,...,1.176,-0.075,6.294,50.824,1.765,0.292,0.149,-5.418,2021,CAR2021
31,Houston Texans,HOU,17,16.471,278.059,59.412,4.7,1.294,0.412,15.647,...,1.059,-3.438,6.706,57.529,1.588,0.266,0.112,-4.370,2021,HOU2021
32,New York Giants,NYG,17,15.176,287.294,61.765,4.7,1.765,0.588,17.588,...,1.353,-1.088,5.176,39.647,2.235,0.275,0.159,-5.890,2021,NYG2021


In [7]:
# Cast the 4QComeB and GWinDri columns to int
teams_db = teams_db.astype({int_column: int for int_column in ("4QComeB", "GWinDri")})

In [8]:
# Count the missing values in each column
teams_db.isna().sum()

Tm            0
Abbr          0
G             0
PF/G          0
TotYds/G      0
Ply/G         0
Yds/Play      0
TO/G          0
FL/G          0
1stD/G        0
Cmp/G         0
PasAtt/G      0
Comp%         0
PassYds/G     0
PassTD/G      0
PassTD%       0
Int/G         0
INT%          0
NY/PassA      0
Pass1stD/G    0
PassLong      0
Yds/PassA     0
AYds/PassA    0
Y/Comp        0
PassRate      0
Sk/G          0
SackYds/G     0
Sack%         0
ANY/PA        0
4QComeB       0
GWinDri       0
EXPPass/G     0
RushAtt/G     0
RushYds/G     0
RushTDs/G     0
RLong         0
Yds/RA        0
Rush1stD/G    0
Fmb/G         0
EXPRush/G     0
Pen/G         0
PenYds/G      0
1stPy/G       0
Score%        0
TurnOv%       0
EXPTot/G      0
Year          0
TmID          0
dtype: int64

In [9]:
# List the dtype of every column in teams_db
teams_db.dtypes

Tm             object
Abbr           object
G               int64
PF/G          float64
TotYds/G      float64
Ply/G         float64
Yds/Play      float64
TO/G          float64
FL/G          float64
1stD/G        float64
Cmp/G         float64
PasAtt/G      float64
Comp%         float64
PassYds/G     float64
PassTD/G      float64
PassTD%       float64
Int/G         float64
INT%          float64
NY/PassA      float64
Pass1stD/G    float64
PassLong        int64
Yds/PassA     float64
AYds/PassA    float64
Y/Comp        float64
PassRate      float64
Sk/G          float64
SackYds/G     float64
Sack%         float64
ANY/PA        float64
4QComeB         int64
GWinDri         int64
EXPPass/G     float64
RushAtt/G     float64
RushYds/G     float64
RushTDs/G     float64
RLong           int64
Yds/RA        float64
Rush1stD/G    float64
Fmb/G         float64
EXPRush/G     float64
Pen/G         float64
PenYds/G      float64
1stPy/G       float64
Score%        float64
TurnOv%       float64
EXPTot/G  

In [10]:
# Write the combined table to CSV without the row index. The target folder is
# created first so the export does not fail when it does not exist yet.
from pathlib import Path
csv_path = Path('Database_CSVs') / 'teamsDB.csv'
csv_path.parent.mkdir(parents=True, exist_ok=True)
teams_db.to_csv(csv_path, index=False)