In [1]:
def cleanffball(year):
    """Scrape one season of the Pro Football Reference fantasy table and tidy it.

    Parameters
    ----------
    year : int or str
        Season to fetch. It is interpolated into the page URL, written to the
        ``Year`` column and appended (via ``str``) to the ``PlID`` / ``TmID`` keys.

    Returns
    -------
    pandas.DataFrame
        The first table found on the page, with

        * ``Rk``, ``VBD``, ``FantPt``, ``DKPt``, ``FDPt`` and ``OvRank`` removed,
        * rows without a ``FantPos`` and repeated header rows removed,
        * every column from the fourth onward converted to numeric,
        * any row that still contains a missing value removed,
        * ``PPR_OvRank``, ``Year``, ``PPG``, ``PlID`` and ``TmID`` appended
          (in that order).

    Notes
    -----
    The page is downloaded with ``pandas.read_html`` on every call.
    ``PPR_OvRank`` is computed *before* incomplete rows are dropped, so the
    ranks in the returned frame may contain gaps.
    """
    # Import dependencies
    import pandas as pd

    # Scrape and create dataframe from website
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    df = pd.read_html(url, header=1)[0]

    # Drop unnecessary columns
    df = df.drop(columns=['Rk', 'VBD', 'FantPt', 'DKPt', 'FDPt'])

    # Keep rows that have a position and are not a repeat of the header row.
    # astype() hands back a fresh object-dtype frame, so the assignments below
    # never write into a slice of the scraped table.
    is_player_row = df['FantPos'].notna() & (df['FantPos'] != 'FantPos')
    df = df[is_player_row].astype(object)

    # Blank cells in these four columns count as zero instead of getting the
    # row discarded by dropna() further down.
    df = df.fillna({'Y/A': 0, 'Y/R': 0, '2PM': 0, '2PP': 0})

    # Remove '+' and '*' from every string cell (presumably award markers on
    # player names -- confirm against the site's legend).
    df = df.replace(r'[+*]', '', regex=True).astype(object)

    # Edits name for uniformity year over year: letters only. This also removes
    # the trailing whitespace the scraped names can carry.
    df['Player'] = df['Player'].str.replace('[^a-zA-Z]', '', regex=True)

    # Change dtypes of columns to numeric (everything after Player/Tm/FantPos);
    # unparseable cells become NaN and are dropped below.
    for col in df.columns[3:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Create overall rank column (1 = highest PPR; ties broken by row order)
    df['PPR_OvRank'] = df['PPR'].rank(method='first', ascending=False)

    # Drop the site's own overall rank and any row with a missing value
    df = df.drop(columns=['OvRank'])
    df = df.dropna()

    # Create year and PPG columns
    df['Year'] = year
    df['PPG'] = (df['PPR'] / df['G']).round(decimals=3)

    # Creates unique Player ID and Team ID to make stats for different years easier to see
    year_suffix = str(year)
    df['PlID'] = df['Player'] + year_suffix
    df['TmID'] = df['Tm'] + year_suffix

    return df

In [2]:
def createFantDB(startYear,endYear,function):
    """Build one DataFrame by stacking the per-season frames from ``function``.

    Parameters
    ----------
    startYear, endYear : int
        First and last season to include; both ends are inclusive.
    function : callable
        Called once per season as ``function(year)``, in ascending year order.
        Each call must return a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The per-season frames concatenated row-wise in year order. Each
        frame's own index labels are kept, so the result's index may contain
        duplicates. An empty DataFrame is returned when
        ``startYear > endYear``.
    """
    # Import dependencies
    import pandas as pd

    # Collect every season first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop copies all
    # previously collected rows on each iteration.
    seasons = [function(year) for year in range(startYear, endYear + 1)]
    if not seasons:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(seasons)

In [3]:
def cleanDB(database):
    """Reshape the stacked season table into per-game stats with next-season targets.

    Parameters
    ----------
    database : pandas.DataFrame
        Frame carrying the columns produced by ``cleanffball`` (``Player``,
        ``FantPos``, ``G``, ``GS``, the raw stat columns, ``PPR``,
        ``PosRank``, ``PPR_OvRank``, ``Year``, ``PPG``, ...). It is not
        modified; all work happens on a renamed copy.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``Player`` ascending then ``Yr`` descending, with a fresh
        0..n-1 index, containing

        * the untouched identifying columns (``FantPos`` renamed to ``Pos``),
        * one ``<stat>/G`` column per counting stat (stat divided by ``G``,
          rounded to 3 decimals),
        * ``RushYds/Att``, ``Yds/Rec``, ``FPts``, ``PosRk``, ``OvRank``,
          ``Yr`` and ``PPR/G`` copied over unchanged from ``Y/A``, ``Y/R``,
          ``PPR``, ``PosRank``, ``PPR_OvRank``, ``Year`` and ``PPG``,
        * ``Starter``: 1.0 for a QB with ``GS / G >= 0.75`` and ``GS >= 5``,
          otherwise 0.0 (every non-QB is 0.0),
        * ``Next_Yr_PPG`` / ``Next_Yr_Starter``: the ``PPR/G`` / ``Starter``
          value from the same player's preceding row in the sorted frame,
          NaN when there is none.

    Notes
    -----
    Players are matched on the ``Player`` string alone, and "next year" means
    the next season *present in the data* for that name; there is no check
    that it is exactly ``Yr + 1``.
    """
    import pandas as pd

    # Disambiguate the repeated Yds/Att/TD headers. rename() returns a new
    # frame, so the caller's DataFrame keeps its original columns.
    renamed = {
        'FantPos': 'Pos',
        'Yds': 'PassYds', 'Yds.1': 'RushYds', 'Yds.2': 'RecYds',
        'Att': 'PassAtt', 'Att.1': 'RushAtt',
        'TD': 'PassTD', 'TD.1': 'RushTD', 'TD.2': 'RecTD', 'TD.3': 'TotTD',
    }
    database = database.rename(columns=renamed)

    listOfColumns = ['Cmp','PassAtt','PassYds','PassTD','Int','RushAtt','RushYds',
                     'Y/A','RushTD','Tgt','Rec','RecYds','Y/R','RecTD','Fmb',
                     'FL','TotTD','2PM','2PP','PPR','PosRank','PPR_OvRank','Year',
                     'PPG']

    # Columns that are already rates, ranks or labels: copied under a new
    # name rather than divided by games played.
    carried_over = {
        'Y/A': 'RushYds/Att',
        'Y/R': 'Yds/Rec',
        'PPR': 'FPts',
        'PosRank': 'PosRk',
        'PPR_OvRank': 'OvRank',
        'Year': 'Yr',
        'PPG': 'PPR/G',
    }

    # Creates columns for stats per game. Iterating listOfColumns keeps the
    # new columns in the same left-to-right order as before.
    games = pd.to_numeric(database['G'])
    for column in listOfColumns:
        if column in carried_over:
            database[carried_over[column]] = database[column]
        else:
            per_game = pd.to_numeric(database[column]) / games
            database[column + '/G'] = per_game.round(decimals = 3)

    database = database.drop(columns = listOfColumns)

    # Starter flag: only QBs who started at least 75% of their games and at
    # least 5 games get 1; everyone else is 0. Stored as float (1.0 / 0.0).
    qb_starter = (
        (database['Pos'] == 'QB')
        & ((database['GS'] / database['G']) >= .75)
        & (database['GS'] >= 5)
    )
    database['Starter'] = qb_starter.astype(float)

    # Newest season first within each player, so the row directly above a
    # season is that player's following season (if any).
    database = (
        database
        .sort_values(by = ['Player','Yr'], ascending = [True,False])
        .reset_index(drop = True)
    )

    # True where the row above belongs to the same player; the first row of
    # every player is False and therefore gets NaN targets.
    same_player_above = database['Player'].eq(database['Player'].shift())

    database['Next_Yr_PPG'] = pd.to_numeric(
        database['PPR/G'].shift().where(same_player_above), errors='coerce')
    database['Next_Yr_Starter'] = pd.to_numeric(
        database['Starter'].shift().where(same_player_above),
        errors='coerce', downcast='integer')
    return database

In [4]:
# Build the working table: cleanffball is called once per season from 2012
# through 2021 (inclusive), createFantDB stacks the results, and cleanDB turns
# them into per-game stats with next-season columns.
fantasy_db = cleanDB(createFantDB(2012,2021,cleanffball))

In [5]:
# Display the assembled frame (the notebook renders a truncated preview).
fantasy_db

Unnamed: 0,Player,Tm,Pos,Age,G,GS,PlID,TmID,Cmp/G,PassAtt/G,...,2PM/G,2PP/G,FPts,PosRk,OvRank,Yr,PPR/G,Starter,Next_Yr_PPG,Next_Yr_Starter
0,AJBrown,TEN,WR,24,13,13,AJBrown2021,TEN2021,0.0,0.154,...,0.000,0.0,180.9,32,85.0,2021,13.915,0.0,,
1,AJBrown,TEN,WR,23,14,12,AJBrown2020,TEN2020,0.0,0.000,...,0.000,0.0,247.5,9,38.0,2020,17.679,0.0,13.915,0.0
2,AJBrown,TEN,WR,22,16,11,AJBrown2019,TEN2019,0.0,0.000,...,0.000,0.0,217.1,9,62.0,2019,13.569,0.0,17.679,0.0
3,AJDerby,MIA,TE,27,4,0,AJDerby2018,MIA2018,0.0,0.000,...,0.000,0.0,13.8,76,400.0,2018,3.450,0.0,,
4,AJDerby,2TM,TE,26,11,1,AJDerby2017,2TM2017,0.0,0.000,...,0.000,0.0,55.4,41,257.0,2017,5.036,0.0,3.450,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5462,ZayJones,2TM,WR,24,15,9,ZayJones2019,2TM2019,0.0,0.000,...,0.000,0.0,48.9,134,269.0,2019,3.260,0.0,2.288,0.0
5463,ZayJones,BUF,WR,23,16,15,ZayJones2018,BUF2018,0.0,0.062,...,0.062,0.0,165.2,34,93.0,2018,10.325,0.0,3.260,0.0
5464,ZayJones,BUF,WR,22,15,10,ZayJones2017,BUF2017,0.0,0.000,...,0.000,0.0,70.6,90,218.0,2017,4.707,0.0,10.325,0.0
5465,ZurlonTipton,IND,RB,25,10,0,ZurlonTipton2015,IND2015,0.0,0.000,...,0.000,0.0,12.7,124,419.0,2015,1.270,0.0,,


In [7]:
# Cast OvRank and Starter to integers; Next_Yr_Starter is not cast here.
fantasy_db = fantasy_db.astype({"OvRank":int,"Starter": int})

In [8]:
# Inspect the column dtypes after the integer cast.
fantasy_db.dtypes

Player              object
Tm                  object
Pos                 object
Age                  int64
G                    int64
GS                   int64
PlID                object
TmID                object
Cmp/G              float64
PassAtt/G          float64
PassYds/G          float64
PassTD/G           float64
Int/G              float64
RushAtt/G          float64
RushYds/G          float64
RushYds/Att        float64
RushTD/G           float64
Tgt/G              float64
Rec/G              float64
RecYds/G           float64
Yds/Rec            float64
RecTD/G            float64
Fmb/G              float64
FL/G               float64
TotTD/G            float64
2PM/G              float64
2PP/G              float64
FPts               float64
PosRk                int64
OvRank               int64
Yr                   int64
PPR/G              float64
Starter              int64
Next_Yr_PPG        float64
Next_Yr_Starter    float64
dtype: object

In [9]:
# Save the table as CSV without the index column.
# NOTE(review): the relative Database_CSVs/ directory is assumed to exist
# already in the working directory -- confirm before a fresh run.
fantasy_db.to_csv('Database_CSVs/fantasyDB.csv',index=False)

In [10]:
# Count missing values per column.
fantasy_db.isnull().sum()

Player                0
Tm                    0
Pos                   0
Age                   0
G                     0
GS                    0
PlID                  0
TmID                  0
Cmp/G                 0
PassAtt/G             0
PassYds/G             0
PassTD/G              0
Int/G                 0
RushAtt/G             0
RushYds/G             0
RushYds/Att           0
RushTD/G              0
Tgt/G                 0
Rec/G                 0
RecYds/G              0
Yds/Rec               0
RecTD/G               0
Fmb/G                 0
FL/G                  0
TotTD/G               0
2PM/G                 0
2PP/G                 0
FPts                  0
PosRk                 0
OvRank                0
Yr                    0
PPR/G                 0
Starter               0
Next_Yr_PPG        1681
Next_Yr_Starter    1681
dtype: int64

In [None]:
# Column dtypes of fantasy_db (this cell has no execution count or output).
fantasy_db.dtypes