In [None]:
def teamScrape(year):
    """Scrape one season of team offensive totals from Pro-Football-Reference.

    Loads the season summary page in a headless Chrome session, reads the
    ``team_stats``, ``passing`` and ``rushing`` tables, outer-merges them on
    the team name (``Tm``) and keeps one row per franchise listed in the
    abbreviation map below.

    Parameters
    ----------
    year : int or str
        Season to scrape; interpolated into the page URL and stored in ``Year``.

    Returns
    -------
    pandas.DataFrame
        ``Tm``, ``Abbr``, the renamed stat columns (numeric, NaN filled with 0;
        columns whose name contains ``%`` are divided by 100), ``Year`` and
        ``TmID`` (abbreviation followed by the year).

    Raises
    ------
    KeyError
        If an expected column is missing from the merged tables.
    """
    # Dependencies
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd
    from functools import reduce

    # Full franchise name -> abbreviation. The keys double as the filter that
    # removes non-team rows from the scraped tables, so a misspelled key
    # silently drops that team (the Commanders entry used to read 'Wasington').
    abbr = {'Dallas Cowboys':'DAL','Tampa Bay Buccaneers':'TAM','Buffalo Bills':'BUF','Kansas City Chiefs':'KAN',
            'Los Angeles Chargers':'LAC','New England Patriots':'NWE','Cincinnati Bengals':'CIN',
            'Los Angeles Rams':'LAR','Indianapolis Colts':'IND','Green Bay Packers':'GNB','Arizona Cardinals':'ARI',
            'Philadelphia Eagles':'PHI','San Francisco 49ers':'SFO','Minnesota Vikings':'MIN',
            'Tennessee Titans':'TEN','Seattle Seahawks':'SEA','Baltimore Ravens':'BAL','Las Vegas Raiders':'LVR',
            'New Orleans Saints':'NOR','Cleveland Browns':'CLE','Pittsburgh Steelers':'PIT','Miami Dolphins':'MIA',
            'Denver Broncos':'DEN','Washington Football Team':'WAS','Detroit Lions':'DET','Atlanta Falcons':'ATL',
            'Chicago Bears':'CHI','New York Jets':'NYJ','Carolina Panthers':'CAR','Houston Texans':'HOU',
            'New York Giants':'NYG','Jacksonville Jaguars':'JAX','San Diego Chargers':'SDG','St. Louis Rams':'STL',
            'Oakland Raiders':'OAK','Washington Redskins':'WAS','Washington Commanders':'WAS'}

    # Columns kept from the merged frame, in output order. The _x/_y suffixes
    # are the ones pd.merge adds when both sides share a column name.
    columns = ['Tm','Abbr','G_x','PF','Yds_x','Ply','Y/P','TO','FL','1stD','Cmp_x',
               'Att_x','Cmp%','Yds.1_x', 'TD_x', 'TD%','Int_x', 'Int%','NY/A_x', '1stD.1',
               'Lng_x', 'Y/A_y', 'AY/A', 'Y/C', 'Y/G_x', 'Rate', 'Sk',
               'Yds.1_y', 'Sk%', 'ANY/A', '4QC', 'GWD', 'EXP_y',
               'Att.1','Yds.2', 'TD.1', 'Lng_y','Y/A_x','Y/G_y','1stD.2','Fmb','EXP',
               'Pen', 'Yds.3', '1stPy', 'Sc%','TO%', 'EXP_x']

    # Merged column name -> output column name; unlisted columns keep their name.
    # NOTE(review): 'PasAtt' looks like a typo for 'PassAtt' but is kept so
    # existing consumers of this frame do not break.
    rename_map = {'G_x':'G','Yds_x':'TotYds','Cmp_x':'Cmp','Att_x':'PasAtt','Yds.1_x':'PassYds',
                  'TD_x':'PassTD','Int_x':'Int','NY/A_x':'NY/A','1stD.1':'Pass1stD','Lng_x':'PassLng',
                  'Y/A_y':'Y/PassA','AY/A':'AY/PassA','Y/G_x':'PassY/G','Yds.1_y':'SackYds',
                  'ANY/A':'ANY/PassA','EXP_y':'EXPPass','Att.1':'RushAtt','Yds.2':'RushYds',
                  'TD.1':'RushTDs','Lng_y':'RushLng','Y/A_x':'Y/RushA','Y/G_y':'RushY/G',
                  '1stD.2':'Rush1stD','EXP':'EXPRush','Yds.3':'PenYds','EXP_x':'EXPTot'}

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the season page and parse the rendered HTML
        url = f'https://www.pro-football-reference.com/years/{year}/#all_team_stats'
        browser.visit(url)
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Always close Chrome, even when the visit or the parse fails
        browser.quit()

    # Find the tables and turn each into a dataframe
    df1 = pd.read_html(str(html_soup.find('table', {"id" : 'team_stats'})),header=1)[0]
    df2 = pd.read_html(str(html_soup.find('table', {"id" : 'passing'})),header=0)[0]
    df3 = pd.read_html(str(html_soup.find('table', {"id" : 'rushing'})),header=0)[0]

    # Merge dataframes (left to right, so the suffix pattern above is stable)
    data_frames = [df1, df2, df3]
    df = reduce(lambda left, right: pd.merge(left, right, on=['Tm'], how='outer'), data_frames)

    # Keep only rows for actual teams; copy so the column assignments below
    # act on an independent frame instead of a slice of the merge result.
    df = df[df['Tm'].isin(list(abbr))].copy()
    df['Abbr'] = df['Tm'].map(abbr)

    # Drop the rank columns, then keep and order the columns of interest
    df = df.drop(columns=['Rk','Rk_x','Rk_y'])
    df = df[columns]
    df = df.loc[:,~df.columns.duplicated()]
    df = df.rename(columns=rename_map)

    # Change dtypes of the stat columns (everything after Tm and Abbr) to numeric
    for col in df.columns[2:]:
        if '%' in col:
            # astype(str) first: the .str accessor raises on columns that were
            # already parsed as numbers.
            df[col] = df[col].astype(str).str.rstrip('%').astype('float') / 100.0
        else:
            df[col] = pd.to_numeric(df[col],errors='coerce')

    # Add year column and fill null values with 0
    df['Year'] = year
    df = df.fillna(0)
    # Create Team ID column
    df['TmID'] = df['Abbr'] + str(year)

    return df

In [1]:
def createFantDB(startYear,endYear,function):
    """Stack the frames returned by ``function(year)`` for a range of seasons.

    Parameters
    ----------
    startYear, endYear : int
        First and last season, both inclusive.
    function : callable
        Called once per season as ``function(year)``; must return a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The per-season frames concatenated in year order, each keeping its own
        index. An empty DataFrame when ``startYear > endYear``.
    """
    # Import dependencies
    import pandas as pd
    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the growing frame on every iteration.
    frames = [function(year) for year in range(startYear, endYear + 1)]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
def createAdvDB(startYear,endYear,function,PerGame=False):
    """Stack the frames returned by ``function(year, PerGame)`` for a range of seasons.

    Parameters
    ----------
    startYear, endYear : int
        First and last season, both inclusive.
    function : callable
        Called once per season as ``function(year, PerGame)``; must return a
        DataFrame.
    PerGame : bool, default False
        Forwarded unchanged as the second positional argument of ``function``.

    Returns
    -------
    pandas.DataFrame
        The per-season frames concatenated in year order, each keeping its own
        index. An empty DataFrame when ``startYear > endYear``.
    """
    # Import dependencies
    import pandas as pd
    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the growing frame on every iteration.
    frames = [function(year, PerGame) for year in range(startYear, endYear + 1)]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
def cleanffball(year):
    """Download and clean one season of the fantasy football table.

    Reads the first table on the season's ``fantasy.htm`` page, removes the
    repeated header rows and unused scoring columns, converts the stat columns
    to numbers and adds ranking, per-game and ID columns.

    Parameters
    ----------
    year : int or str
        Season to fetch; interpolated into the URL and stored in ``Year``.

    Returns
    -------
    pandas.DataFrame
        One row per player with no missing values left, plus ``PPR_OvRank``
        (rank by ``PPR``, highest first), ``Year``, ``PPG`` (``PPR / G`` to
        3 decimals), ``PlID`` (letters-only name + year) and ``TmID``
        (team + year).
    """
    # Import dependencies
    import pandas as pd
    # Scrape and create dataframe from website
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    df = pd.read_html(url,header=1)[0]
    # Drop unnecessary columns and the header rows repeated inside the table body
    df = df.drop(['Rk','VBD','FantPt','DKPt','FDPt'],axis=1)
    df = df[df['FantPos'].notna()]
    # copy(): the assignments below must act on an independent frame, not a slice
    df = df[df['FantPos'] != 'FantPos'].copy()
    # Blank cells in these columns become 0 so the dropna() further down does
    # not discard the row because of them.
    for col in ['Y/A','Y/R','2PM','2PP']:
        df[col] = df[col].fillna(0)
    # Remove the '+' and '*' markers from every text cell. Raw strings keep the
    # regex backslash from being read as an (invalid) Python escape sequence.
    df = df.replace(r'\+','',regex=True).astype(object)
    df = df.replace(r'\*','',regex=True).astype(object)
    # Strips extra whitespace from end of player column
    df['Player'] = df['Player'].str.rstrip()
    # Change dtypes of every column after the first three to numeric
    for col in df.columns[3:]:
        df[col] = pd.to_numeric(df[col],errors='coerce')
    # Create overall rank column (ties broken by order of appearance)
    df['PPR_OvRank'] = df['PPR'].rank(method='first',ascending=False)
    # Drop the site's own overall rank and any row that still has a null value
    df = df.drop(columns=['OvRank'])
    df = df.dropna()
    # Create year and PPG columns
    df['Year'] = year
    df['PPG'] = (df['PPR'] / df['G']).round(decimals = 3)
    # Edits name for uniformity year over year (letters only)
    df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex = True)
    # Creates unique Player ID and Team ID to make stats for different years easier to see
    df['PlID'] = df['Player'] + str(year)
    df['TmID'] = df['Tm'] + str(year)

    return df

In [None]:
def cleanDB(database):
    """Turn the stacked fantasy table into a per-game modelling frame.

    Works on a renamed copy, so the DataFrame passed in is left untouched and
    the function can be called repeatedly on the same input.

    Steps: give the ambiguous stat columns descriptive names, replace the
    counting stats with per-game values (``<stat>/G``, 3 decimals), flag
    starting quarterbacks, and attach each player's following-season
    ``PPR/G`` and ``Starter`` values.

    Parameters
    ----------
    database : pandas.DataFrame
        Must contain ``Player``, ``G``, ``GS``, ``FantPos`` (or ``Pos``) and
        every column named in ``listOfColumns`` below, before or after the
        renaming applied here.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``Player`` ascending and ``Yr`` descending with a fresh
        0..n-1 index. ``Next_Yr_PPG`` / ``Next_Yr_Starter`` are NaN for a
        player's latest row.
    """
    import pandas as pd

    rename_map = {'FantPos':'Pos','Yds':'PassYds','Yds.1':'RushYds','Yds.2':'RecYds',
                  'Att':'PassAtt','Att.1':'RushAtt','TD':'PassTD','TD.1':'RushTD',
                  'TD.2':'RecTD','TD.3':'TotTD'}
    # rename() returns a new frame, so the columns added below never reach
    # the caller's object.
    database = database.rename(columns=rename_map)

    listOfColumns = ['Cmp','PassAtt','PassYds','PassTD','Int','RushAtt','RushYds',
                     'Y/A','RushTD','Tgt','Rec','RecYds','Y/R','RecTD','Fmb',
                     'FL','TotTD','2PM','2PP','PPR','PosRank','PPR_OvRank','Year',
                     'PPG']

    # Columns that are already rates, ranks or labels: copied under a new name
    # instead of being divided by games played.
    copied_as = {'Y/A':'RushYds/Att','Y/R':'Yds/Rec','PPR':'FPts','PosRank':'PosRk',
                 'PPR_OvRank':'OvRank','Year':'Yr','PPG':'PPR/G'}

    # Creates columns for stats per game
    games = pd.to_numeric(database['G'])
    for column in listOfColumns:
        if column in copied_as:
            database[copied_as[column]] = database[column]
        else:
            per_game = pd.to_numeric(database[column]) / games
            database[column + '/G'] = per_game.round(decimals = 3)

    database = database.drop(columns = listOfColumns)

    # A starter is a QB who started at least 75% of his games and at least 5
    # games; every other row is 0. Stored as float, as before.
    is_starting_qb = ((database['Pos'] == 'QB')
                      & ((database['GS'] / database['G']) >= .75)
                      & (database['GS'] >= 5))
    database['Starter'] = is_starting_qb.astype(float)

    # Newest season first within each player, so the row above a season (same
    # player) is that player's next recorded season. The gap is not checked to
    # be exactly one year.
    database = database.sort_values(by = ['Player','Yr'], ascending = [True,False])
    # Reset before shifting: stacked per-season frames carry repeated index
    # labels, and a unique index keeps the column assignments unambiguous.
    database = database.reset_index(drop = True)
    by_player = database.groupby('Player', sort = False)
    database['Next_Yr_PPG'] = pd.to_numeric(by_player['PPR/G'].shift(1),errors='coerce')
    database['Next_Yr_Starter'] = pd.to_numeric(by_player['Starter'].shift(1),
                                                errors='coerce',downcast='integer')
    return database

In [None]:
def scrapeAdvancedPass(year,PerGame=False):
    """Scrape one season of advanced passing stats from Pro-Football-Reference.

    Loads ``passing_advanced.htm`` in a headless Chrome session and
    outer-merges the air-yards, accuracy, pressure and play-type tables on
    ``Player``. If building that four-table frame fails for any reason, the
    frame is rebuilt from the first three tables only, without the
    ``Bats`` / ``OnTgt`` / ``OnTgt%`` and RPO / play-action columns.

    Changes from the earlier version of this function:

    * The player ID column is named ``PlID`` on both paths (the four-table
      path used to call it ``ID``), matching the other scrapers in this
      notebook.
    * ``PerGame=True`` no longer raises ``KeyError`` on the three-table path;
      only the columns actually present are converted.
    * Missing values in ``%`` columns end up as 0.0 like every other column
      (the 0 written by ``fillna`` used to be turned back into NaN).

    Parameters
    ----------
    year : int or str
        Season to scrape; interpolated into the URL and stored in ``Year``.
    PerGame : bool, default False
        When not ``False``, counting stats are replaced by ``<stat>/G``
        columns (3 decimals) and rate columns are copied under longer names.

    Returns
    -------
    pandas.DataFrame
        One row per player (kickers ``k`` and punters ``p`` removed) with
        ``Year``, ``PlID`` (letters-only name + year) and ``TmID``
        (team + year) added.
    """
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd
    from functools import reduce

    base_tables = ['advanced_air_yards', 'advanced_accuracy', 'advanced_pressure']
    # Columns kept from the four-table merge, in output order. The _x suffix
    # is the one pd.merge adds to the left side of a shared column name.
    full_columns = ['Player','Tm_x','Pos_x','Age_x','G_x','GS_x','Cmp_x','Att_x','Yds_x','IAY',
                    'IAY/PA','CAY','CAY/Cmp','CAY/PA','YAC','YAC/Cmp','Bats','ThAwy',
                    'Spikes','Drops','Drop%','BadTh','Bad%','OnTgt','OnTgt%','Sk',
                    'PktTime','Bltz','Hrry','Prss','Prss%','Scrm','Yds/Scr','Plays',
                    'Yds.1','PassAtt','PassYds','RushAtt','RushYds', 'PassAtt.1', 'PassYds.1']
    # Columns that only the four-table frame is expected to provide
    full_only = {'Bats','OnTgt','OnTgt%','Plays','Yds.1','PassAtt','PassYds',
                 'RushAtt','RushYds','PassAtt.1','PassYds.1'}
    rename_map = {'Tm_x':'Tm','Pos_x':'Pos','Age_x':'Age','G_x':'G','GS_x':'GS',
                  'Cmp_x':'Cmp','Att_x':'Att','Yds_x':'Yds','Yds.1':'RPO_Yds',
                  'PassAtt':'RPO_PassAtt','PassYds':'RPO_PassYds','RushAtt':'RPO_RushAtt',
                  'RushYds':'RPO_RushYds','PassAtt.1':'PA_PassAtt','PassYds.1':'PA_PassYds'}
    # Stat columns replaced by the per-game view, in output order
    per_game_columns = ['Cmp','Att','Yds','IAY','IAY/PA','CAY','CAY/Cmp',
                        'CAY/PA','YAC','YAC/Cmp','Bats','ThAwy','Spikes','Drops',
                        'Drop%','BadTh','Bad%','OnTgt','OnTgt%','Sk','PktTime',
                        'Bltz','Hrry','Prss','Prss%','Scrm','Yds/Scr','Plays',
                        'RPO_Yds','RPO_PassAtt','RPO_PassYds','RPO_RushAtt',
                        'RPO_RushYds', 'PA_PassAtt','PA_PassYds']
    # Rates and averages: copied under a new name instead of divided by games
    copied_as = {'IAY/PA':'IAirYds/PassA','CAY/Cmp':'CAirYds/Cmp','CAY/PA':'CAirYds/PassA',
                 'YAC/Cmp':'YdsAC/Cmp','Drop%':'Drp%','Bad%':'BadTh%','OnTgt%':'OnTarg%',
                 'PktTime':'PockTime','Prss%':'Press%','Yds/Scr':'Yds/Scram'}
    # Regex clean-ups applied in this order to every text cell. Raw strings
    # keep the backslashes from being read as invalid Python escapes.
    cleanup_steps = [(r'\/',''),('^qb','QB'),('^rb','RB'),('^wr','WR'),('^te','TE'),
                     (r'\+',''),(r'\*','')]

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the advanced passing page and parse the rendered HTML
        url = f'https://www.pro-football-reference.com/years/{year}/passing_advanced.htm'
        browser.visit(url)
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Always close Chrome, even when the visit or the parse fails
        browser.quit()

    def build(table_ids, keep_columns):
        # Merge the named tables and run the shared cleaning pipeline.
        frames = [pd.read_html(str(html_soup.find('table', {"id" : table_id})),header=1)[0]
                  for table_id in table_ids]
        df = reduce(lambda left, right: pd.merge(left, right, on=['Player'], how='outer'),
                    frames)
        # Drop the header rows repeated inside the table body
        df = df[df['Player'] != 'Player']
        df = df[df['Player'].notna()]
        for pattern, replacement in cleanup_steps:
            df = df.replace(pattern,replacement,regex=True).astype(object)

        df = df[keep_columns]
        df = df.loc[:,~df.columns.duplicated()]
        df = df.rename(columns=rename_map)
        df = df.fillna(0)
        df = df[df['Pos'] != 'k']
        # copy(): the assignments below must act on an independent frame
        df = df[df['Pos'] != 'p'].copy()

        # Everything after Player/Tm/Pos becomes numeric
        for col in df.columns[3:]:
            if '%' in col:
                # astype(str) keeps the 0 written by fillna as 0.0; the .str
                # accessor returns NaN for non-string cells.
                df[col] = df[col].astype(str).str.rstrip('%').astype('float') / 100.0
            else:
                df[col] = pd.to_numeric(df[col],errors='coerce')

        if PerGame is not False:
            # Only convert what this frame has: the three-table frame lacks
            # Bats / OnTgt / OnTgt% and the play-type columns.
            present = [column for column in per_game_columns if column in df.columns]
            games = pd.to_numeric(df['G'])
            for column in present:
                if column in copied_as:
                    df[copied_as[column]] = df[column]
                else:
                    df[column + '/G'] = (pd.to_numeric(df[column]) / games).round(decimals = 3)
            df = df.drop(columns = present)

        df['Year'] = year
        df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex = True)
        # Unique player and team IDs make stats for different years easier to see
        df['PlID'] = df['Player'] + str(year)
        df['TmID'] = df['Tm'] + str(year)
        return df

    try:
        return build(base_tables + ['advanced_play_type'], full_columns)
    except Exception:
        # Same fallback as before, but KeyboardInterrupt / SystemExit are no
        # longer swallowed the way the bare `except:` did.
        return build(base_tables, [c for c in full_columns if c not in full_only])

In [None]:
def scrapeAdvancedRush(year,PerGame=False):
    """Scrape one season of advanced rushing stats from Pro-Football-Reference.

    Loads ``rushing_advanced.htm`` in a headless Chrome session, reads the
    ``advanced_rushing`` table and cleans it.

    Parameters
    ----------
    year : int or str
        Season to scrape; interpolated into the URL and stored in ``Year``.
    PerGame : bool, default False
        When not ``False``, counting stats are replaced by ``<stat>/G``
        columns (3 decimals) and the per-attempt columns are copied under
        longer names.

    Returns
    -------
    pandas.DataFrame
        One row per player (kickers ``k`` and punters ``p`` removed) with
        ``Year``, ``PlID`` (letters-only name + year) and ``TmID``
        (team + year) added.
    """
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd

    columns = ['Player','Tm','Pos','Age','G','GS','Att','Yds','1D',
               'YBC','YBC/Att','YAC','YAC/Att','BrkTkl','Att/Br']
    # Stat columns replaced by the per-game view, in output order
    per_game_columns = ['Att','Yds','1D','YBC','YBC/Att','YAC',
                        'YAC/Att','BrkTkl','Att/Br']
    # Averages: copied under a new name instead of divided by games
    copied_as = {'YBC/Att':'YBCont/A','YAC/Att':'YACont/A','Att/Br':'Att/BrTk'}
    # Regex clean-ups applied in this order to every text cell. Raw strings
    # keep the backslashes from being read as invalid Python escapes.
    cleanup_steps = [(r'\/',''),('^qb','QB'),('^rb','RB'),('^wr','WR'),('^te','TE'),
                     (r'\+',''),(r'\*','')]

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the advanced rushing page and parse the rendered HTML
        url = f'https://www.pro-football-reference.com/years/{year}/rushing_advanced.htm'
        browser.visit(url)
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Close Chrome exactly once (it used to be quit a second time at the
        # end of the function), even when the visit or the parse fails.
        browser.quit()

    df = pd.read_html(str(html_soup.find('table', {"id" : 'advanced_rushing'})),header=1)[0]

    # Drop the header rows repeated inside the table body
    df = df[df['Player'] != 'Player']
    df = df[df['Player'].notna()]
    for pattern, replacement in cleanup_steps:
        df = df.replace(pattern,replacement,regex=True).astype(object)

    df = df[columns]
    df = df.fillna(0)
    df = df[df['Pos'] != 'k']
    # copy(): the assignments below must act on an independent frame
    df = df[df['Pos'] != 'p'].copy()

    # Everything after Player/Tm/Pos becomes numeric
    for col in df.columns[3:]:
        df[col] = pd.to_numeric(df[col],errors='coerce')

    if PerGame is not False:
        games = pd.to_numeric(df['G'])
        for column in per_game_columns:
            if column in copied_as:
                df[copied_as[column]] = df[column]
            else:
                df[column + '/G'] = (pd.to_numeric(df[column]) / games).round(decimals = 3)
        df = df.drop(columns = per_game_columns)

    df['Year'] = year
    df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex = True)
    # Unique player and team IDs make stats for different years easier to see
    df['PlID'] = df['Player'] + str(year)
    df['TmID'] = df['Tm'] + str(year)

    return df

In [None]:
def scrapeAdvancedRec(year,PerGame=False):
    """Scrape one season of advanced receiving stats from Pro-Football-Reference.

    Loads ``receiving_advanced.htm`` in a headless Chrome session, reads the
    ``advanced_receiving`` table and cleans it. ``Drop%`` is converted from a
    percentage string to a fraction; missing values become 0.0 there like in
    every other column (the 0 written by ``fillna`` used to be turned back
    into NaN by the string conversion).

    Parameters
    ----------
    year : int or str
        Season to scrape; interpolated into the URL and stored in ``Year``.
    PerGame : bool, default False
        When not ``False``, counting stats are replaced by ``<stat>/G``
        columns (3 decimals) and rate columns are copied under longer names.

    Returns
    -------
    pandas.DataFrame
        One row per player (kickers ``k`` and punters ``p`` removed) with
        ``Year``, ``PlID`` (letters-only name + year) and ``TmID``
        (team + year) added.
    """
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    from webdriver_manager.chrome import ChromeDriverManager
    import pandas as pd

    columns = ['Player','Tm','Pos','Age','G','GS','Tgt','Rec','Yds','TD','1D','YBC','YBC/R',
               'YAC','YAC/R','ADOT','BrkTkl','Rec/Br','Drop','Drop%','Int','Rat']
    # Stat columns replaced by the per-game view, in output order
    per_game_columns = ['Tgt','Rec','Yds','TD','1D','YBC','YBC/R','YAC','YAC/R',
                        'ADOT','BrkTkl','Rec/Br','Drop','Drop%','Int','Rat']
    # Rates and averages: copied under a new name instead of divided by games
    copied_as = {'YBC/R':'YdsBC/Rec','YAC/R':'YdsAC/Rec','ADOT':'AvgDOT',
                 'Rec/Br':'Rec/BrTk','Drop%':'Drp%','Rat':'PassRat'}
    # Regex clean-ups applied in this order to every text cell. Raw strings
    # keep the backslashes from being read as invalid Python escapes.
    cleanup_steps = [(r'\/',''),('^qb','QB'),('^rb','RB'),('^wr','WR'),('^te','TE'),
                     (r'\+',''),(r'\*','')]

    # Set up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)
    try:
        # Visit the advanced receiving page and parse the rendered HTML
        url = f'https://www.pro-football-reference.com/years/{year}/receiving_advanced.htm'
        browser.visit(url)
        html_soup = soup(browser.html, 'html.parser')
    finally:
        # Always close Chrome, even when the visit or the parse fails
        browser.quit()

    df = pd.read_html(str(html_soup.find('table', {"id" : 'advanced_receiving'})),header=0)[0]

    # Drop the header rows repeated inside the table body
    df = df[df['Player'] != 'Player']
    df = df[df['Player'].notna()]
    for pattern, replacement in cleanup_steps:
        df = df.replace(pattern,replacement,regex=True).astype(object)

    df = df[columns]
    df = df.fillna(0)
    df = df[df['Pos'] != 'k']
    # copy(): the assignments below must act on an independent frame
    df = df[df['Pos'] != 'p'].copy()

    # Everything after Player/Tm/Pos becomes numeric
    for col in df.columns[3:]:
        if '%' in col:
            # astype(str) keeps the 0 written by fillna as 0.0; the .str
            # accessor returns NaN for non-string cells.
            df[col] = df[col].astype(str).str.rstrip('%').astype('float') / 100.0
        else:
            df[col] = pd.to_numeric(df[col],errors='coerce')

    if PerGame is not False:
        games = pd.to_numeric(df['G'])
        for column in per_game_columns:
            if column in copied_as:
                df[copied_as[column]] = df[column]
            else:
                df[column + '/G'] = (pd.to_numeric(df[column]) / games).round(decimals = 3)
        df = df.drop(columns = per_game_columns)

    df['Year'] = year
    df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex = True)
    # Unique player and team IDs make stats for different years easier to see
    df['PlID'] = df['Player'] + str(year)
    df['TmID'] = df['Tm'] + str(year)

    return df

In [2]:
def positions(year):
    """Return player, team and fantasy position for one season.

    Reads the first table on the season's ``fantasy.htm`` page and keeps only
    the ``Player``, ``Tm`` and ``FantPos`` columns.

    Parameters
    ----------
    year : int or str
        Season to fetch; interpolated into the URL and stored in ``Year``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player`` (letters only), ``Tm``, ``FantPos``, ``Year``,
        ``PlID`` (player + year) and ``TmID`` (team + year); rows with a
        missing value are removed.
    """
    import pandas as pd
    # Scrape and create dataframe from website
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    df = pd.read_html(url,header=1)[0]
    # Keep the identifying columns and drop the header rows repeated in the body
    df = df[['Player','Tm','FantPos']]
    df = df[df['FantPos'].notna()]
    df = df[df['FantPos'] != 'FantPos']
    # Remove the '+' and '*' markers from every text cell. Raw strings keep the
    # regex backslash from being read as an (invalid) Python escape sequence.
    df = df.replace(r'\+','',regex=True).astype(object)
    df = df.replace(r'\*','',regex=True).astype(object)
    # Strips extra whitespace from end of player column
    df['Player'] = df['Player'].str.rstrip()
    # copy(): the assignments below must act on an independent frame
    df = df.dropna().copy()
    # Create year column
    df['Year'] = year
    # Edits name for uniformity year over year (letters only)
    df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex = True)
    # Creates unique Player ID and Team ID to make stats for different years easier to see
    df['PlID'] = df['Player'] + str(year)
    df['TmID'] = df['Tm'] + str(year)

    return df

In [None]:
# Scrape the 2019-2021 advanced passing, rushing and receiving tables twice each:
# once as season totals and once with PerGame=True. Every call opens a headless
# Chrome session per season, so this cell is slow to run.
advPass = createAdvDB(2019,2021,scrapeAdvancedPass)
advPassGame = createAdvDB(2019,2021,scrapeAdvancedPass,PerGame=True)
advRush = createAdvDB(2019,2021,scrapeAdvancedRush)
advRushGame = createAdvDB(2019,2021,scrapeAdvancedRush,PerGame=True)
advRec = createAdvDB(2019,2021,scrapeAdvancedRec)
advRecGame = createAdvDB(2019,2021,scrapeAdvancedRec,PerGame=True)

# NOTE(review): `createDB` is not defined in the visible notebook; `createFantDB`
# takes these three arguments.
#fantasy = cleanDB(createDB(2012,2021,cleanffball))

In [3]:
import pandas as pd
# None removes the display limit, so DataFrames are rendered with every column
# and every row (large frames produce very long outputs).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Player/team/position lookup for 2012-2021. Stored under its own name:
# assigning the result back to `positions` replaced the scraper function with
# a DataFrame, so this cell failed with a TypeError when run a second time.
positionsDB = createFantDB(2012,2021,positions)
# Preview only; the display options above would otherwise print every row.
positionsDB.head(10)

Unnamed: 0,Player,Tm,FantPos,Year,PlID,TmID
0,AdrianPeterson,MIN,RB,2012,AdrianPeterson2012,MIN2012
1,DougMartin,TAM,RB,2012,DougMartin2012,TAM2012
2,ArianFoster,HOU,RB,2012,ArianFoster2012,HOU2012
3,MarshawnLynch,SEA,RB,2012,MarshawnLynch2012,SEA2012
4,AlfredMorris,WAS,RB,2012,AlfredMorris2012,WAS2012
5,RayRice,BAL,RB,2012,RayRice2012,BAL2012
6,CalvinJohnson,DET,WR,2012,CalvinJohnson2012,DET2012
7,BrandonMarshall,CHI,WR,2012,BrandonMarshall2012,CHI2012
8,CJSpiller,BUF,RB,2012,CJSpiller2012,BUF2012
9,DrewBrees,NOR,QB,2012,DrewBrees2012,NOR2012


In [None]:
# Season-total advanced passing stats (2019-2021) built with createAdvDB above.
advPass

In [None]:
# Advanced passing stats for 2019-2021 scraped with PerGame=True.
advPassGame

In [None]:
# Season-total advanced rushing stats (2019-2021) built with createAdvDB above.
advRush

In [None]:
# Advanced rushing stats for 2019-2021 scraped with PerGame=True.
advRushGame

In [None]:
# Season-total advanced receiving stats (2019-2021) built with createAdvDB above.
advRec

In [None]:
# Advanced receiving stats for 2019-2021 scraped with PerGame=True.
advRecGame

In [None]:
# Scratch accumulators used to check how a boolean flag is routed.
list1, list2 = [], []


def addto(year, trigger=False):
    """Append ``year`` to ``list1`` when ``trigger`` is exactly ``False``, else to ``list2``."""
    # Identity test on purpose: falsy values such as 0 or None still go to list2.
    target = list1 if trigger is False else list2
    target.append(year)

In [None]:
def create(startYear,endYear,function,PPG=False):
    """Call ``function`` once with the list of years and the ``PPG`` flag.

    The whole list ``[startYear, ..., endYear]`` (both ends inclusive) is
    passed as the first argument in a single call; ``function`` is not called
    per year and its result is discarded, so this returns ``None``.
    """
    season_list = list(range(startYear, endYear + 1))
    function(season_list, PPG)

In [None]:
# Scratch check of the flag plumbing: `create` passes the whole year list to
# `addto` in one call, so a single list (not three years) gets appended.
create(2017,2019,addto,PPG=False)

In [None]:
# list1 receives values when addto is called with trigger=False.
print(list1)

In [None]:
# list2 receives values when addto is called with any other trigger.
print(list2)

In [None]:
# Re-display the season-total advanced rushing frame before merging it.
advRush

In [None]:
# Re-display the season-total advanced receiving frame.
advRec

In [None]:
# Row counts of the three season-total frames, reported in one line.
PassLen, RushLen, RecLen = (len(frame) for frame in (advPass, advRush, advRec))
print(f'Current sizes of the DFs are: Passing - {PassLen}, Rushing - {RushLen}, Receiving - {RecLen}.')

In [None]:
# NOTE(review): `teams` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it is the stacked output of
# teamScrape (e.g. via createFantDB) -- confirm and define it before this cell.
teams

In [None]:
# Column dtypes of the team frame (see the note on where `teams` must come from).
teams.dtypes

In [None]:
# Attach team-level stats to every rushing row, matching on team ID and season.
# The outer join keeps rows that exist on only one side.
merged = pd.merge(
    advRush,
    teams,
    how='outer',
    left_on=['TmID', 'Year'],
    right_on=['TmID', 'Year'],
)
merged

In [None]:
#merged = merged.drop(columns=['Tm_y','G_y','Abbr'])
# Restore the plain names of the rushing-side columns, then drop the rows that
# have no player (rows with a missing `Player` value).
merged = (
    merged
    .rename(columns={"Tm_x": "Tm","G_x":"G"})
    .dropna(subset=['Player'])
)
merged

In [None]:
# Column dtypes of the merged rushing/team frame.
merged.dtypes

In [None]:
# Display the merged rushing/team frame after the rename and row filter above.
merged

In [None]:
#fantDB['Tm'].value_counts()

In [None]:
#fantDB[(fantDB['Tm'].isin(['4TM','3TM','2TM'])) & (fantDB['FPts'] >= 80)]