In [1]:
def createAdvDB(startYear,endYear,function,PerGame=False):
    """Stack the output of a per-season scraper over a range of years.

    Parameters
    ----------
    startYear, endYear : int
        First and last season to fetch; both ends are included.
    function : callable
        Called once per season as ``function(year, PerGame)``; each call is
        expected to return a DataFrame.
    PerGame : bool, default False
        Forwarded unchanged as the second argument of ``function``.

    Returns
    -------
    pandas.DataFrame
        The per-season frames stacked in year order. Each season keeps its own
        row index, so index labels repeat across seasons. An empty frame is
        returned when ``startYear`` is greater than ``endYear``.
    """
    # Import dependencies
    import pandas as pd
    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop re-copies all
    # earlier rows on every iteration.
    seasons = [function(year, PerGame) for year in range(startYear, endYear + 1)]
    # pd.concat raises on an empty list; the accumulator loop this replaces
    # returned its empty starting frame in that case.
    if not seasons:
        return pd.DataFrame()
    return pd.concat(seasons)

In [2]:
def scrapeAdvancedPass(year,PerGame=False):
    """Scrape one season of advanced passing stats from Pro-Football-Reference.

    The page is rendered in headless Chrome, its advanced-stat tables are
    joined on ``Player`` and the result is cleaned into a single frame.

    Two layouts are attempted:

    * full    - air-yards, accuracy, pressure and play-type tables;
    * reduced - air-yards, accuracy and pressure only, without the
      ``Bats``/``OnTgt``/``OnTgt%`` columns.

    The reduced layout is built only when the full one raises (for example a
    table or a column is missing from the page); a warning is emitted when
    that happens.

    Parameters
    ----------
    year : int
        Season to scrape; used in the page URL and in the ``Year``, ``PlID``
        and ``TmID`` columns.
    PerGame : bool, default False
        ``False`` keeps season totals. Any other value (the check is
        ``PerGame is False``) replaces every stat column after ``GS``:
        counting stats become ``<stat>/G`` (total divided by ``G``, rounded to
        3 decimals) and the ratio/percentage columns are copied under longer
        names (``IAY/PA`` -> ``IAirYds/PassA``, ``Drop%`` -> ``Drp%``, ...).

    Returns
    -------
    pandas.DataFrame
        ``Player``, ``Tm``, ``Pos``, ``Age``, ``G``, ``GS``, the stat columns
        of the layout that succeeded, then ``Year``, ``PlID`` (player name +
        year) and ``TmID`` (team + year).
    """
    import warnings
    from io import StringIO
    from functools import reduce
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd

    # Column groups, in output order, as they are named on the page.
    shared = ['Player','Tm','Pos','Age','G','GS','Cmp','Att','Yds']
    air_yards = ['IAY','IAY/PA','CAY','CAY/Cmp','CAY/PA','YAC','YAC/Cmp']
    pressure = ['Sk','PktTime','Bltz','Hrry','Prss','Prss%','Scrm','Yds/Scr']
    play_type = ['Plays','Yds.1','PassAtt','PassYds','RushAtt','RushYds',
                 'PassAtt.1','PassYds.1']

    full_tables = ['advanced_air_yards','advanced_accuracy',
                   'advanced_pressure','advanced_play_type']
    full_columns = (shared + air_yards
                    + ['Bats','ThAwy','Spikes','Drops','Drop%','BadTh','Bad%',
                       'OnTgt','OnTgt%']
                    + pressure + play_type)
    reduced_tables = full_tables[:3]
    reduced_columns = (shared + air_yards
                       + ['ThAwy','Spikes','Drops','Drop%','BadTh','Bad%']
                       + pressure)

    # The play-type table repeats some headers, which read_html numbers with a
    # '.1' suffix; give those columns explicit RPO_/PA_ names.
    play_type_names = {'Yds.1':'RPO_Yds','PassAtt':'RPO_PassAtt',
                       'PassYds':'RPO_PassYds','RushAtt':'RPO_RushAtt',
                       'RushYds':'RPO_RushYds','PassAtt.1':'PA_PassAtt',
                       'PassYds.1':'PA_PassYds'}
    # Columns that are copied under a new name instead of divided by games.
    rate_names = {'IAY/PA':'IAirYds/PassA','CAY/Cmp':'CAirYds/Cmp',
                  'CAY/PA':'CAirYds/PassA','YAC/Cmp':'YdsAC/Cmp',
                  'Drop%':'Drp%','Bad%':'BadTh%','OnTgt%':'OnTarg%',
                  'PktTime':'PockTime','Prss%':'Press%','Yds/Scr':'Yds/Scram'}
    # Applied in this order to every cell: strip '/', upper-case leading
    # position codes, strip '+' and '*'. Raw strings avoid the invalid-escape
    # warning that '\/', '\+' and '\*' produce in ordinary literals.
    cleanups = [(r'\/',''), ('^qb','QB'), ('^rb','RB'), ('^wr','WR'),
                ('^te','TE'), (r'\+',''), (r'\*','')]

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the AdvancedStats to Scrape site
        url = f'https://www.pro-football-reference.com/years/{year}/passing_advanced.htm'
        browser.visit(url)
        # Parse the HTML
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Close Chrome even when the visit or the parse fails.
        browser.quit()

    def add_new_columns(merged, table):
        # Every table repeats Tm/Pos/Age/G/GS/Cmp/Att/Yds. Keep the first
        # table's copy and bring in only columns not seen yet, so the outer
        # join never has to invent clashing '_x'/'_y' names (pandas >= 2.0
        # raises MergeError when suffixes create duplicate columns).
        fresh = [name for name in table.columns
                 if name == 'Player' or name not in merged.columns]
        return pd.merge(merged, table[fresh], on=['Player'], how='outer')

    def build(table_ids, wanted):
        tables = []
        for table_id in table_ids:
            node = html_soup.find('table', {"id" : table_id})
            if node is None:
                raise ValueError(f'Table {table_id!r} not found on the {year} page')
            # StringIO: newer pandas deprecates passing literal HTML strings.
            tables.append(pd.read_html(StringIO(str(node)),header=1)[0])

        df = reduce(add_new_columns, tables)
        # Drop the header rows repeated inside the table body and blank rows.
        df = df[df['Player'] != 'Player']
        df = df[df['Player'].notna()]
        for pattern, replacement in cleanups:
            df = df.replace(pattern,replacement,regex=True).astype(object)

        df = df[wanted].rename(columns=play_type_names)
        df = df.fillna(0)
        # Rows with position 'k' or 'p' are excluded (presumably kickers and
        # punters - confirm against the site's position codes).
        df = df[df['Pos'] != 'k']
        df = df[df['Pos'] != 'p'].copy()

        # Everything after Player/Tm/Pos is numeric; '%' columns arrive as
        # text such as '5.6%' and are stored as fractions.
        for col in df.columns[3:]:
            if '%' in col:
                df[col] = df[col].str.rstrip('%').astype('float') / 100.0
            else:
                df[col] = pd.to_numeric(df[col],errors='coerce')

        if PerGame is not False:
            # Use the stat columns this layout actually has. The previous
            # fallback listed Bats/OnTgt/OnTgt%, which it never selected, and
            # raised KeyError whenever per-game output was requested.
            stat_columns = list(df.columns[6:])
            for column in stat_columns:
                if column in rate_names:
                    df[rate_names[column]] = df[column]
                else:
                    per_game = pd.to_numeric(df[column]) / pd.to_numeric(df['G'])
                    df[column + '/G'] = per_game.round(decimals = 3)
            df = df.drop(columns = stat_columns)

        df['Year'] = year
        df['Player'] = df.Player.str.replace('[^a-zA-Z]', '', regex = True)
        # creates unique player ID to make stats for different years easier to see
        year_tag = str(year)
        df['PlID'] = df['Player'] + year_tag
        df['TmID'] = df['Tm'] + year_tag
        return df

    try:
        return build(full_tables, full_columns)
    except Exception as error:
        # Best-effort fallback, as before, but no longer a bare `except:` (which
        # also swallowed KeyboardInterrupt) and no longer silent.
        warnings.warn(
            f'Full advanced passing layout unavailable for {year} ({error!r}); '
            'using the reduced layout without play-type columns.',
            stacklevel=2,
        )
        return build(reduced_tables, reduced_columns)

In [3]:
def scrapeAdvancedRush(year,PerGame=False):
    """Scrape one season of advanced rushing stats from Pro-Football-Reference.

    Parameters
    ----------
    year : int
        Season to scrape; used in the page URL and in the ``Year``, ``PlID``
        and ``TmID`` columns.
    PerGame : bool, default False
        ``False`` keeps season totals. Any other value (the check is
        ``PerGame is False``) replaces the stat columns: counting stats become
        ``<stat>/G`` (total divided by ``G``, rounded to 3 decimals) and the
        per-attempt columns are copied under longer names
        (``YBC/Att`` -> ``YBCont/A``, ``YAC/Att`` -> ``YACont/A``,
        ``Att/Br`` -> ``Att/BrTk``).

    Returns
    -------
    pandas.DataFrame
        ``Player``, ``Tm``, ``Pos``, ``Age``, ``G``, ``GS``, the rushing stat
        columns, then ``Year``, ``PlID`` (player name + year) and ``TmID``
        (team + year).

    Raises
    ------
    ValueError
        If the page has no ``advanced_rushing`` table.
    """
    from io import StringIO
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the AdvancedStats to Scrape site
        url = f'https://www.pro-football-reference.com/years/{year}/rushing_advanced.htm'
        browser.visit(url)
        # Parse the HTML
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Quit exactly once, and also when the visit fails. The earlier
        # version quit here and then a second time just before returning.
        browser.quit()

    table = html_soup.find('table', {"id" : 'advanced_rushing'})
    if table is None:
        raise ValueError(f'Table advanced_rushing not found on the {year} page')
    # StringIO: newer pandas deprecates passing literal HTML strings.
    df = pd.read_html(StringIO(str(table)),header=1)[0]

    # Drop the header rows repeated inside the table body and blank rows.
    df = df[df['Player'] != 'Player']
    df = df[df['Player'].notna()]
    # Applied in this order to every cell: strip '/', upper-case leading
    # position codes, strip '+' and '*'. Raw strings avoid the invalid-escape
    # warning that '\/', '\+' and '\*' produce in ordinary literals.
    cleanups = [(r'\/',''), ('^qb','QB'), ('^rb','RB'), ('^wr','WR'),
                ('^te','TE'), (r'\+',''), (r'\*','')]
    for pattern, replacement in cleanups:
        df = df.replace(pattern,replacement,regex=True).astype(object)

    df = df[['Player','Tm','Pos','Age','G','GS','Att','Yds','1D',
             'YBC','YBC/Att','YAC','YAC/Att','BrkTkl','Att/Br']]
    df = df.fillna(0)
    # Rows with position 'k' or 'p' are excluded (presumably kickers and
    # punters - confirm against the site's position codes).
    df = df[df['Pos'] != 'k']
    df = df[df['Pos'] != 'p'].copy()

    # Everything after Player/Tm/Pos is numeric in both output modes.
    for col in df.columns[3:]:
        df[col] = pd.to_numeric(df[col],errors='coerce')

    if PerGame is not False:
        # Columns that are copied under a new name instead of divided by games.
        rate_names = {'YBC/Att':'YBCont/A','YAC/Att':'YACont/A','Att/Br':'Att/BrTk'}
        stat_columns = ['Att','Yds','1D','YBC','YBC/Att','YAC',
                        'YAC/Att','BrkTkl','Att/Br']
        for column in stat_columns:
            if column in rate_names:
                df[rate_names[column]] = df[column]
            else:
                per_game = pd.to_numeric(df[column]) / pd.to_numeric(df['G'])
                df[column + '/G'] = per_game.round(decimals = 3)
        df = df.drop(columns = stat_columns)

    df['Year'] = year
    df['Player'] = df.Player.str.replace('[^a-zA-Z]', '', regex = True)
    # creates unique player ID to make stats for different years easier to see
    year_tag = str(year)
    df['PlID'] = df['Player'] + year_tag
    df['TmID'] = df['Tm'] + year_tag

    return df

In [4]:
def scrapeAdvancedRec(year,PerGame=False):
    """Scrape one season of advanced receiving stats from Pro-Football-Reference.

    Parameters
    ----------
    year : int
        Season to scrape; used in the page URL and in the ``Year``, ``PlID``
        and ``TmID`` columns.
    PerGame : bool, default False
        ``False`` keeps season totals. Any other value (the check is
        ``PerGame is False``) replaces the stat columns: counting stats become
        ``<stat>/G`` (total divided by ``G``, rounded to 3 decimals) and the
        ratio columns are copied under longer names (``YBC/R`` ->
        ``YdsBC/Rec``, ``YAC/R`` -> ``YdsAC/Rec``, ``ADOT`` -> ``AvgDOT``,
        ``Rec/Br`` -> ``Rec/BrTk``, ``Drop%`` -> ``Drp%``, ``Rat`` ->
        ``PassRat``).

    Returns
    -------
    pandas.DataFrame
        ``Player``, ``Tm``, ``Pos``, ``Age``, ``G``, ``GS``, the receiving
        stat columns, then ``Year``, ``PlID`` (player name + year) and
        ``TmID`` (team + year).

    Raises
    ------
    ValueError
        If the page has no ``advanced_receiving`` table.
    """
    from io import StringIO
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the AdvancedStats to Scrape site
        url = f'https://www.pro-football-reference.com/years/{year}/receiving_advanced.htm'
        browser.visit(url)
        # Parse the HTML
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Close Chrome even when the visit or the parse fails.
        browser.quit()

    table = html_soup.find('table', {"id" : 'advanced_receiving'})
    if table is None:
        raise ValueError(f'Table advanced_receiving not found on the {year} page')
    # This table is read with header=0, unlike the passing and rushing tables.
    # StringIO: newer pandas deprecates passing literal HTML strings.
    df = pd.read_html(StringIO(str(table)),header=0)[0]

    # Drop the header rows repeated inside the table body and blank rows.
    df = df[df['Player'] != 'Player']
    df = df[df['Player'].notna()]
    # Applied in this order to every cell: strip '/', upper-case leading
    # position codes, strip '+' and '*'. Raw strings avoid the invalid-escape
    # warning that '\/', '\+' and '\*' produce in ordinary literals.
    cleanups = [(r'\/',''), ('^qb','QB'), ('^rb','RB'), ('^wr','WR'),
                ('^te','TE'), (r'\+',''), (r'\*','')]
    for pattern, replacement in cleanups:
        df = df.replace(pattern,replacement,regex=True).astype(object)

    df = df[['Player','Tm','Pos','Age','G','GS','Tgt','Rec','Yds','TD','1D','YBC','YBC/R',
             'YAC','YAC/R','ADOT','BrkTkl','Rec/Br','Drop','Drop%','Int','Rat']]
    df = df.fillna(0)
    # Rows with position 'k' or 'p' are excluded (presumably kickers and
    # punters - confirm against the site's position codes).
    df = df[df['Pos'] != 'k']
    df = df[df['Pos'] != 'p'].copy()

    # Everything after Player/Tm/Pos is numeric in both output modes; '%'
    # columns arrive as text such as '3.2%' and are stored as fractions.
    for col in df.columns[3:]:
        if '%' in col:
            df[col] = df[col].str.rstrip('%').astype('float') / 100.0
        else:
            df[col] = pd.to_numeric(df[col],errors='coerce')

    if PerGame is not False:
        # Columns that are copied under a new name instead of divided by games.
        rate_names = {'YBC/R':'YdsBC/Rec','YAC/R':'YdsAC/Rec','ADOT':'AvgDOT',
                      'Rec/Br':'Rec/BrTk','Drop%':'Drp%','Rat':'PassRat'}
        stat_columns = ['Tgt','Rec','Yds','TD','1D','YBC','YBC/R','YAC','YAC/R',
                        'ADOT','BrkTkl','Rec/Br','Drop','Drop%','Int','Rat']
        for column in stat_columns:
            if column in rate_names:
                df[rate_names[column]] = df[column]
            else:
                per_game = pd.to_numeric(df[column]) / pd.to_numeric(df['G'])
                df[column + '/G'] = per_game.round(decimals = 3)
        df = df.drop(columns = stat_columns)

    df['Year'] = year
    df['Player'] = df.Player.str.replace('[^a-zA-Z]', '', regex = True)
    # creates unique player ID to make stats for different years easier to see
    year_tag = str(year)
    df['PlID'] = df['Player'] + year_tag
    df['TmID'] = df['Tm'] + year_tag

    return df

In [5]:
def positions(year):
    """Scrape each player's fantasy position for one season.

    Parameters
    ----------
    year : int
        Season to scrape; used in the page URL and in the ``Year``, ``PlID``
        and ``TmID`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player``, ``Tm``, ``Pos``, ``Year``, ``PlID`` (player name +
        year) and ``TmID`` (team + year). Rows without a position, repeated
        header rows and rows with any other missing value are dropped.
    """
    import pandas as pd
    #Scrape and create dataframe from website
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    df = pd.read_html(url,header=1)[0]
    # Drop unnecessary columns and clean up the dataframe
    df = df[['Player','Tm','FantPos']]
    df = df[df['FantPos'].notna()]
    df = df[df['FantPos'] != 'FantPos']
    # Strip the '+' and '*' markers from every cell. Raw strings avoid the
    # invalid-escape warning that '\+' and '\*' produce in ordinary literals.
    for marker in (r'\+', r'\*'):
        df = df.replace(marker,'',regex=True).astype(object)
    # Strips extra whitespace from end of player column
    df['Player'] = df['Player'].str.rstrip()
    df = df.dropna()
    df['Year'] = year
    # Edits name for uniformity year over year
    df['Player'] = df.Player.str.replace('[^a-zA-Z]', '', regex = True)
    df = df.rename(columns={'FantPos':'Pos'})
    # Creates unique Player ID and Team ID to make stats for different years easier to see
    year_tag = str(year)
    df['PlID'] = df['Player'] + year_tag
    df['TmID'] = df['Tm'] + year_tag

    return df

In [6]:
def createFantDB(startYear,endYear,function):
    """Stack the output of a per-season scraper over a range of years.

    Parameters
    ----------
    startYear, endYear : int
        First and last season to fetch; both ends are included.
    function : callable
        Called once per season as ``function(year)``; each call is expected to
        return a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The per-season frames stacked in year order. Each season keeps its own
        row index, so index labels repeat across seasons. An empty frame is
        returned when ``startYear`` is greater than ``endYear``.
    """
    # Import dependencies
    import pandas as pd
    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop re-copies all
    # earlier rows on every iteration.
    seasons = [function(year) for year in range(startYear, endYear + 1)]
    # pd.concat raises on an empty list; the accumulator loop this replaces
    # returned its empty starting frame in that case.
    if not seasons:
        return pd.DataFrame()
    return pd.concat(seasons)

In [7]:
# Per-game advanced tables for the 2019-2021 seasons, one per stat family.
advPass = createAdvDB(startYear=2019, endYear=2021, function=scrapeAdvancedPass, PerGame=True)
advRush = createAdvDB(startYear=2019, endYear=2021, function=scrapeAdvancedRush, PerGame=True)
advRec = createAdvDB(startYear=2019, endYear=2021, function=scrapeAdvancedRec, PerGame=True)
# Fantasy positions for the 2012-2021 seasons.
pos = createFantDB(startYear=2012, endYear=2021, function=positions)
















In [8]:
# Display the combined advanced passing table.
advPass

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Cmp/G,Att/G,Yds/G,IAY/G,...,RPO_Yds/G,RPO_PassAtt/G,RPO_PassYds/G,RPO_RushAtt/G,RPO_RushYds/G,PA_PassAtt/G,PA_PassYds/G,Year,PlID,TmID
0,JameisWinston,TAM,QB,25,16,16,23.750,39.125,319.312,405.375,...,5.125,0.500,5.000,0.062,0.125,5.812,70.812,2019,JameisWinston2019,TAM2019
1,DakPrescott,DAL,QB,26,16,16,24.250,37.250,306.375,346.188,...,28.312,2.812,23.938,0.562,4.375,8.062,77.438,2019,DakPrescott2019,DAL2019
2,JaredGoff,LAR,QB,25,16,16,24.625,39.125,289.875,301.562,...,0.188,0.000,0.000,0.125,0.188,11.375,97.750,2019,JaredGoff2019,LAR2019
3,PhilipRivers,LAC,QB,38,16,16,24.375,36.938,288.438,314.062,...,25.188,2.250,25.188,0.000,0.000,5.625,53.812,2019,PhilipRivers2019,LAC2019
4,MattRyan,ATL,QB,34,15,15,27.200,41.067,297.733,333.933,...,16.867,2.067,16.867,0.000,0.000,7.200,55.133,2019,MattRyan2019,ATL2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,CordarrellePatterson,ATL,RB,30,16,13,0.000,0.062,0.000,1.125,...,1.875,0.000,0.000,0.500,1.875,0.000,0.000,2021,CordarrellePatterson2021,ATL2021
121,BrettRypien,DEN,0,25,1,0,0.000,2.000,0.000,11.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2021,BrettRypien2021,DEN2021
122,DAndreSwift,DET,RB,22,13,4,0.000,0.077,0.000,-0.923,...,12.769,0.000,0.000,1.000,12.769,0.000,0.000,2021,DAndreSwift2021,DET2021
123,AlbertWilson,MIA,WR,29,14,5,0.000,0.071,0.000,0.357,...,0.000,0.000,0.000,0.000,0.000,0.071,0.000,2021,AlbertWilson2021,MIA2021


In [9]:
# Count missing values in each column of the passing table.
advPass.isnull().sum()

Player           0
Tm               0
Pos              0
Age              0
G                0
GS               0
Cmp/G            0
Att/G            0
Yds/G            0
IAY/G            0
IAirYds/PassA    0
CAY/G            0
CAirYds/Cmp      0
CAirYds/PassA    0
YAC/G            0
YdsAC/Cmp        0
Bats/G           0
ThAwy/G          0
Spikes/G         0
Drops/G          0
Drp%             0
BadTh/G          0
BadTh%           0
OnTgt/G          0
OnTarg%          0
Sk/G             0
PockTime         0
Bltz/G           0
Hrry/G           0
Prss/G           0
Press%           0
Scrm/G           0
Yds/Scram        0
Plays/G          0
RPO_Yds/G        0
RPO_PassAtt/G    0
RPO_PassYds/G    0
RPO_RushAtt/G    0
RPO_RushYds/G    0
PA_PassAtt/G     0
PA_PassYds/G     0
Year             0
PlID             0
TmID             0
dtype: int64

In [10]:
# Show the dtype of each column of the passing table.
advPass.dtypes

Player            object
Tm                object
Pos               object
Age                int64
G                  int64
GS                 int64
Cmp/G            float64
Att/G            float64
Yds/G            float64
IAY/G            float64
IAirYds/PassA    float64
CAY/G            float64
CAirYds/Cmp      float64
CAirYds/PassA    float64
YAC/G            float64
YdsAC/Cmp        float64
Bats/G           float64
ThAwy/G          float64
Spikes/G         float64
Drops/G          float64
Drp%             float64
BadTh/G          float64
BadTh%           float64
OnTgt/G          float64
OnTarg%          float64
Sk/G             float64
PockTime         float64
Bltz/G           float64
Hrry/G           float64
Prss/G           float64
Press%           float64
Scrm/G           float64
Yds/Scram        float64
Plays/G          float64
RPO_Yds/G        float64
RPO_PassAtt/G    float64
RPO_PassYds/G    float64
RPO_RushAtt/G    float64
RPO_RushYds/G    float64
PA_PassAtt/G     float64


In [11]:
# Display the combined advanced rushing table.
advRush

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Att/G,Yds/G,1D/G,YBC/G,YBCont/A,YAC/G,YACont/A,BrkTkl/G,Att/BrTk,Year,PlID,TmID
0,DerrickHenry,TEN,RB,25,15,15,20.200,102.667,4.867,38.133,1.9,64.533,3.2,1.933,10.4,2019,DerrickHenry2019,TEN2019
1,EzekielElliott,DAL,RB,24,16,16,18.812,84.812,4.875,42.375,2.3,42.438,2.3,1.500,12.5,2019,EzekielElliott2019,DAL2019
2,NickChubb,CLE,RB,24,16,16,18.625,93.375,3.875,38.250,2.1,55.125,3.0,2.000,9.3,2019,NickChubb2019,CLE2019
3,ChristianMcCaffrey,CAR,RB,23,16,16,17.938,86.688,3.562,54.625,3.0,32.062,1.8,1.000,17.9,2019,ChristianMcCaffrey2019,CAR2019
4,ChrisCarson,SEA,RB,25,15,15,18.533,82.000,5.000,33.067,1.8,48.933,2.6,1.867,9.9,2019,ChrisCarson2019,SEA2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,QuezWatkins,PHI,WR,23,17,12,0.059,0.176,0.000,0.176,3.0,0.000,0.0,0.000,0.0,2021,QuezWatkins2021,PHI2021
379,DerekWatt,PIT,fb,29,17,4,0.059,0.059,0.059,0.000,0.0,0.059,1.0,0.000,0.0,2021,DerekWatt2021,PIT2021
380,PrestonWilliams,MIA,WR,24,8,3,0.125,0.875,0.000,0.375,3.0,0.500,4.0,0.125,1.0,2021,PrestonWilliams2021,MIA2021
381,AndrewWingard,JAX,S,25,15,15,0.067,0.267,0.067,0.200,3.0,0.067,1.0,0.000,0.0,2021,AndrewWingard2021,JAX2021


In [12]:
# Count missing values in each column of the rushing table.
advRush.isnull().sum()

Player      0
Tm          0
Pos         0
Age         0
G           0
GS          0
Att/G       0
Yds/G       0
1D/G        0
YBC/G       0
YBCont/A    0
YAC/G       0
YACont/A    0
BrkTkl/G    0
Att/BrTk    0
Year        0
PlID        0
TmID        0
dtype: int64

In [13]:
# Show the dtype of each column of the rushing table.
advRush.dtypes

Player       object
Tm           object
Pos          object
Age           int64
G             int64
GS            int64
Att/G       float64
Yds/G       float64
1D/G        float64
YBC/G       float64
YBCont/A    float64
YAC/G       float64
YACont/A    float64
BrkTkl/G    float64
Att/BrTk    float64
Year          int64
PlID         object
TmID         object
dtype: object

In [14]:
# Display the combined advanced receiving table.
advRec

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Tgt/G,Rec/G,Yds/G,TD/G,...,AvgDOT,BrkTkl/G,Rec/BrTk,Drop/G,Drp%,Int/G,PassRat,Year,PlID,TmID
0,MichaelThomas,NOR,WR,26,16,15,11.562,9.312,107.812,0.562,...,8.1,0.312,29.8,0.375,0.032,0.000,121.7,2019,MichaelThomas2019,NOR2019
1,ChristianMcCaffrey,CAR,RB,23,16,16,8.875,7.250,62.812,0.250,...,0.6,0.875,8.3,0.438,0.049,0.125,99.7,2019,ChristianMcCaffrey2019,CAR2019
2,KeenanAllen,LAC,WR,27,16,16,9.312,6.500,74.938,0.375,...,10.1,0.250,26.0,0.438,0.047,0.500,84.8,2019,KeenanAllen2019,LAC2019
3,DeAndreHopkins,HOU,WR,27,15,15,10.000,6.933,77.667,0.467,...,10.1,0.600,11.6,0.267,0.027,0.067,105.0,2019,DeAndreHopkins2019,HOU2019
4,JulianEdelman,NWE,WR,33,16,13,9.562,6.250,69.812,0.375,...,9.5,0.250,25.0,0.812,0.085,0.188,91.9,2019,JulianEdelman2019,NWE2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,ChristianWilkins,MIA,DT,26,17,17,0.059,0.059,0.059,0.059,...,1.0,0.000,0.0,0.000,0.000,0.000,118.7,2021,ChristianWilkins2021,MIA2021
523,SethWilliams,DEN,WR,21,2,1,0.500,0.500,17.000,0.000,...,30.0,0.000,0.0,0.000,0.000,0.000,118.7,2021,SethWilliams2021,DEN2021
524,TrayveonWilliams,CIN,0,24,5,0,0.400,0.200,0.800,0.000,...,-3.5,0.200,1.0,0.000,0.000,0.000,56.2,2021,TrayveonWilliams2021,CIN2021
526,EasopWinston,NOR,0,25,3,1,0.333,0.333,1.667,0.000,...,-4.0,0.000,0.0,0.000,0.000,0.000,87.5,2021,EasopWinston2021,NOR2021


In [15]:
# Count missing values in each column of the receiving table.
advRec.isnull().sum()

Player       0
Tm           0
Pos          0
Age          0
G            0
GS           0
Tgt/G        0
Rec/G        0
Yds/G        0
TD/G         0
1D/G         0
YBC/G        0
YdsBC/Rec    0
YAC/G        0
YdsAC/Rec    0
AvgDOT       0
BrkTkl/G     0
Rec/BrTk     0
Drop/G       0
Drp%         0
Int/G        0
PassRat      0
Year         0
PlID         0
TmID         0
dtype: int64

In [16]:
# Show the dtype of each column of the receiving table.
advRec.dtypes

Player        object
Tm            object
Pos           object
Age            int64
G              int64
GS             int64
Tgt/G        float64
Rec/G        float64
Yds/G        float64
TD/G         float64
1D/G         float64
YBC/G        float64
YdsBC/Rec    float64
YAC/G        float64
YdsAC/Rec    float64
AvgDOT       float64
BrkTkl/G     float64
Rec/BrTk     float64
Drop/G       float64
Drp%         float64
Int/G        float64
PassRat      float64
Year           int64
PlID          object
TmID          object
dtype: object

In [17]:
# Display the combined fantasy-position table.
pos

Unnamed: 0,Player,Tm,Pos,Year,PlID,TmID
0,AdrianPeterson,MIN,RB,2012,AdrianPeterson2012,MIN2012
1,DougMartin,TAM,RB,2012,DougMartin2012,TAM2012
2,ArianFoster,HOU,RB,2012,ArianFoster2012,HOU2012
3,MarshawnLynch,SEA,RB,2012,MarshawnLynch2012,SEA2012
4,AlfredMorris,WAS,RB,2012,AlfredMorris2012,WAS2012
...,...,...,...,...,...,...
687,LoganWoodside,TEN,QB,2021,LoganWoodside2021,TEN2021
689,TrentonCannon,2TM,RB,2021,TrentonCannon2021,2TM2021
690,JohnWolford,LAR,QB,2021,JohnWolford2021,LAR2021
691,JoshRosen,ATL,QB,2021,JoshRosen2021,ATL2021


In [18]:
# Row counts of the three advanced tables before positions are merged in.
PassLength, RushLength, RecLength = len(advPass), len(advRush), len(advRec)
print(f'Lengths are: Passing - {PassLength}, Rushing - {RushLength}, Receiving - {RecLength}')

Lengths are: Passing - 323, Rushing - 1073, Receiving - 1501


In [19]:
# pandas is bound at notebook level here; later cells call pd.merge directly.
import pandas as pd

# Join the position stored in `pos` onto the passing table by 'PlID'.
# The inner join keeps only rows whose 'PlID' appears in both frames, and
# because both frames carry a 'Pos' column the result holds Pos_x / Pos_y.
advPass = advPass.merge(
    pos[['PlID', 'Pos']],
    on='PlID',
    how='inner',
)

In [20]:
# Number of passing rows that remain after the inner join with `pos`.
advPass.shape[0]

303

In [21]:
# Inspect the merged column labels: the overlapping 'Pos' column shows up as
# 'Pos_x' (from advPass) and 'Pos_y' (from pos).
advPass.columns

Index(['Player', 'Tm', 'Pos_x', 'Age', 'G', 'GS', 'Cmp/G', 'Att/G', 'Yds/G',
       'IAY/G', 'IAirYds/PassA', 'CAY/G', 'CAirYds/Cmp', 'CAirYds/PassA',
       'YAC/G', 'YdsAC/Cmp', 'Bats/G', 'ThAwy/G', 'Spikes/G', 'Drops/G',
       'Drp%', 'BadTh/G', 'BadTh%', 'OnTgt/G', 'OnTarg%', 'Sk/G', 'PockTime',
       'Bltz/G', 'Hrry/G', 'Prss/G', 'Press%', 'Scrm/G', 'Yds/Scram',
       'Plays/G', 'RPO_Yds/G', 'RPO_PassAtt/G', 'RPO_PassYds/G',
       'RPO_RushAtt/G', 'RPO_RushYds/G', 'PA_PassAtt/G', 'PA_PassYds/G',
       'Year', 'PlID', 'TmID', 'Pos_y'],
      dtype='object')

In [22]:
# Final passing layout: the merged column order, with the position taken from
# `pos` ('Pos_y') placed in the slot of 'Pos_x', which is left out.
columns = [
    'Player', 'Tm', 'Pos_y', 'Age', 'G', 'GS',
    'Cmp/G', 'Att/G', 'Yds/G',
    'IAY/G', 'IAirYds/PassA', 'CAY/G', 'CAirYds/Cmp', 'CAirYds/PassA',
    'YAC/G', 'YdsAC/Cmp',
    'Bats/G', 'ThAwy/G', 'Spikes/G', 'Drops/G', 'Drp%',
    'BadTh/G', 'BadTh%', 'OnTgt/G', 'OnTarg%',
    'Sk/G', 'PockTime', 'Bltz/G', 'Hrry/G', 'Prss/G', 'Press%',
    'Scrm/G', 'Yds/Scram',
    'Plays/G', 'RPO_Yds/G', 'RPO_PassAtt/G', 'RPO_PassYds/G',
    'RPO_RushAtt/G', 'RPO_RushYds/G',
    'PA_PassAtt/G', 'PA_PassYds/G',
    'Year', 'PlID', 'TmID',
]

# Keep only those columns, then give the joined position its plain name back.
advPass = advPass.loc[:, columns].rename(columns={'Pos_y':'Pos'})
advPass

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Cmp/G,Att/G,Yds/G,IAY/G,...,RPO_Yds/G,RPO_PassAtt/G,RPO_PassYds/G,RPO_RushAtt/G,RPO_RushYds/G,PA_PassAtt/G,PA_PassYds/G,Year,PlID,TmID
0,JameisWinston,TAM,QB,25,16,16,23.750,39.125,319.312,405.375,...,5.125,0.500,5.000,0.062,0.125,5.812,70.812,2019,JameisWinston2019,TAM2019
1,DakPrescott,DAL,QB,26,16,16,24.250,37.250,306.375,346.188,...,28.312,2.812,23.938,0.562,4.375,8.062,77.438,2019,DakPrescott2019,DAL2019
2,JaredGoff,LAR,QB,25,16,16,24.625,39.125,289.875,301.562,...,0.188,0.000,0.000,0.125,0.188,11.375,97.750,2019,JaredGoff2019,LAR2019
3,PhilipRivers,LAC,QB,38,16,16,24.375,36.938,288.438,314.062,...,25.188,2.250,25.188,0.000,0.000,5.625,53.812,2019,PhilipRivers2019,LAC2019
4,MattRyan,ATL,QB,34,15,15,27.200,41.067,297.733,333.933,...,16.867,2.067,16.867,0.000,0.000,7.200,55.133,2019,MattRyan2019,ATL2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,CordarrellePatterson,ATL,RB,30,16,13,0.000,0.062,0.000,1.125,...,1.875,0.000,0.000,0.500,1.875,0.000,0.000,2021,CordarrellePatterson2021,ATL2021
299,BrettRypien,DEN,QB,25,1,0,0.000,2.000,0.000,11.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2021,BrettRypien2021,DEN2021
300,DAndreSwift,DET,RB,22,13,4,0.000,0.077,0.000,-0.923,...,12.769,0.000,0.000,1.000,12.769,0.000,0.000,2021,DAndreSwift2021,DET2021
301,AlbertWilson,MIA,WR,29,14,5,0.000,0.071,0.000,0.357,...,0.000,0.000,0.000,0.000,0.000,0.071,0.000,2021,AlbertWilson2021,MIA2021


In [23]:
# Join the position stored in `pos` onto the rushing table by 'PlID'; the
# inner join keeps only rows whose 'PlID' appears in both frames.
advRush = advRush.merge(
    pos[['PlID', 'Pos']],
    on='PlID',
    how='inner',
)

In [24]:
# Number of rushing rows that remain after the inner join with `pos`.
advRush.shape[0]

1040

In [25]:
# Inspect the merged column labels: the overlapping 'Pos' column shows up as
# 'Pos_x' (from advRush) and 'Pos_y' (from pos).
advRush.columns

Index(['Player', 'Tm', 'Pos_x', 'Age', 'G', 'GS', 'Att/G', 'Yds/G', '1D/G',
       'YBC/G', 'YBCont/A', 'YAC/G', 'YACont/A', 'BrkTkl/G', 'Att/BrTk',
       'Year', 'PlID', 'TmID', 'Pos_y'],
      dtype='object')

In [26]:
# Final rushing layout: the merged column order, with the position taken from
# `pos` ('Pos_y') placed in the slot of 'Pos_x', which is left out.
columns = [
    'Player', 'Tm', 'Pos_y', 'Age', 'G', 'GS',
    'Att/G', 'Yds/G', '1D/G',
    'YBC/G', 'YBCont/A', 'YAC/G', 'YACont/A',
    'BrkTkl/G', 'Att/BrTk',
    'Year', 'PlID', 'TmID',
]

# Keep only those columns, then give the joined position its plain name back.
advRush = advRush.loc[:, columns].rename(columns={'Pos_y':'Pos'})
advRush

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Att/G,Yds/G,1D/G,YBC/G,YBCont/A,YAC/G,YACont/A,BrkTkl/G,Att/BrTk,Year,PlID,TmID
0,DerrickHenry,TEN,RB,25,15,15,20.200,102.667,4.867,38.133,1.9,64.533,3.2,1.933,10.4,2019,DerrickHenry2019,TEN2019
1,EzekielElliott,DAL,RB,24,16,16,18.812,84.812,4.875,42.375,2.3,42.438,2.3,1.500,12.5,2019,EzekielElliott2019,DAL2019
2,NickChubb,CLE,RB,24,16,16,18.625,93.375,3.875,38.250,2.1,55.125,3.0,2.000,9.3,2019,NickChubb2019,CLE2019
3,ChristianMcCaffrey,CAR,RB,23,16,16,17.938,86.688,3.562,54.625,3.0,32.062,1.8,1.000,17.9,2019,ChristianMcCaffrey2019,CAR2019
4,ChrisCarson,SEA,RB,25,15,15,18.533,82.000,5.000,33.067,1.8,48.933,2.6,1.867,9.9,2019,ChrisCarson2019,SEA2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,AdamThielen,MIN,WR,31,13,13,0.077,0.154,0.000,0.000,0.0,0.154,2.0,0.000,0.0,2021,AdamThielen2021,MIN2021
1036,QuezWatkins,PHI,WR,23,17,12,0.059,0.176,0.000,0.176,3.0,0.000,0.0,0.000,0.0,2021,QuezWatkins2021,PHI2021
1037,DerekWatt,PIT,RB,29,17,4,0.059,0.059,0.059,0.000,0.0,0.059,1.0,0.000,0.0,2021,DerekWatt2021,PIT2021
1038,PrestonWilliams,MIA,WR,24,8,3,0.125,0.875,0.000,0.375,3.0,0.500,4.0,0.125,1.0,2021,PrestonWilliams2021,MIA2021


In [27]:
# How many rushing rows fall under each position label.
advRush.loc[:, 'Pos'].value_counts()

RB    463
WR    311
QB    224
TE     42
Name: Pos, dtype: int64

In [28]:
# Join the position stored in `pos` onto the receiving table by 'PlID'; the
# inner join keeps only rows whose 'PlID' appears in both frames.
advRec = advRec.merge(
    pos[['PlID', 'Pos']],
    on='PlID',
    how='inner',
)

In [29]:
# Number of receiving rows that remain after the inner join with `pos`.
advRec.shape[0]

1470

In [30]:
# Inspect the merged column labels: the overlapping 'Pos' column shows up as
# 'Pos_x' (from advRec) and 'Pos_y' (from pos).
advRec.columns

Index(['Player', 'Tm', 'Pos_x', 'Age', 'G', 'GS', 'Tgt/G', 'Rec/G', 'Yds/G',
       'TD/G', '1D/G', 'YBC/G', 'YdsBC/Rec', 'YAC/G', 'YdsAC/Rec', 'AvgDOT',
       'BrkTkl/G', 'Rec/BrTk', 'Drop/G', 'Drp%', 'Int/G', 'PassRat', 'Year',
       'PlID', 'TmID', 'Pos_y'],
      dtype='object')

In [31]:
# Final receiving layout: the merged column order, with the position taken
# from `pos` ('Pos_y') placed in the slot of 'Pos_x', which is left out.
columns = [
    'Player', 'Tm', 'Pos_y', 'Age', 'G', 'GS',
    'Tgt/G', 'Rec/G', 'Yds/G', 'TD/G', '1D/G',
    'YBC/G', 'YdsBC/Rec', 'YAC/G', 'YdsAC/Rec',
    'AvgDOT', 'BrkTkl/G', 'Rec/BrTk',
    'Drop/G', 'Drp%', 'Int/G', 'PassRat',
    'Year', 'PlID', 'TmID',
]

# Keep only those columns, then give the joined position its plain name back.
advRec = advRec.loc[:, columns].rename(columns={'Pos_y':'Pos'})
advRec

Unnamed: 0,Player,Tm,Pos,Age,G,GS,Tgt/G,Rec/G,Yds/G,TD/G,...,AvgDOT,BrkTkl/G,Rec/BrTk,Drop/G,Drp%,Int/G,PassRat,Year,PlID,TmID
0,MichaelThomas,NOR,WR,26,16,15,11.562,9.312,107.812,0.562,...,8.1,0.312,29.8,0.375,0.032,0.000,121.7,2019,MichaelThomas2019,NOR2019
1,ChristianMcCaffrey,CAR,RB,23,16,16,8.875,7.250,62.812,0.250,...,0.6,0.875,8.3,0.438,0.049,0.125,99.7,2019,ChristianMcCaffrey2019,CAR2019
2,KeenanAllen,LAC,WR,27,16,16,9.312,6.500,74.938,0.375,...,10.1,0.250,26.0,0.438,0.047,0.500,84.8,2019,KeenanAllen2019,LAC2019
3,DeAndreHopkins,HOU,WR,27,15,15,10.000,6.933,77.667,0.467,...,10.1,0.600,11.6,0.267,0.027,0.067,105.0,2019,DeAndreHopkins2019,HOU2019
4,JulianEdelman,NWE,WR,33,16,13,9.562,6.250,69.812,0.375,...,9.5,0.250,25.0,0.812,0.085,0.188,91.9,2019,JulianEdelman2019,NWE2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,KevinWhite,NOR,WR,29,6,1,0.833,0.167,6.333,0.000,...,24.4,0.000,0.0,0.167,0.200,0.000,58.7,2021,KevinWhite2021,NOR2021
1466,SethWilliams,DEN,WR,21,2,1,0.500,0.500,17.000,0.000,...,30.0,0.000,0.0,0.000,0.000,0.000,118.7,2021,SethWilliams2021,DEN2021
1467,TrayveonWilliams,CIN,RB,24,5,0,0.400,0.200,0.800,0.000,...,-3.5,0.200,1.0,0.000,0.000,0.000,56.2,2021,TrayveonWilliams2021,CIN2021
1468,EasopWinston,NOR,WR,25,3,1,0.333,0.333,1.667,0.000,...,-4.0,0.000,0.0,0.000,0.000,0.000,87.5,2021,EasopWinston2021,NOR2021


In [33]:
import os

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('Database_CSVs', exist_ok=True)

# Persist the three position-corrected tables; index=False leaves the row
# index out of the files.
advPass.to_csv('Database_CSVs/advancedPass.csv',index=False)
advRush.to_csv('Database_CSVs/advancedRush.csv',index=False)
advRec.to_csv('Database_CSVs/advancedRec.csv',index=False)