In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Lift the column cap so wide frames are displayed with every column visible.
pd.options.display.max_columns = None

In [3]:
# Season labels as strings, 2009 through 2022 inclusive (range's stop is exclusive).
years = list(map(str, range(2009, 2023)))

In [4]:
# URL template for a PGA Tour stat page; the two slots are (stat id, season year).
STAT_URL = 'https://www.pgatour.com/stats/stat.{}.y{}.html'

# Scoring average page. ROUNDS is kept from this page as it most accurately
# reflects the total rounds a player completed in the season.
SCORING_STAT_ID = '120'

# (stat id, column read from the page, column name in the output), listed in the
# order the columns appear in the merged table after the scoring columns.
STAT_PAGES = [
    ('101', 'AVG.', 'Drive_Distance'),       # Driving Distance
    ('130', '%', 'Scrambling_%'),            # Scrambling
    ('02674', 'AVERAGE', 'SG_TTG'),          # SG: Tee To Green
    ('02567', 'AVERAGE', 'SG_OTT'),          # SG: Off The Tee
    ('02568', 'AVERAGE', 'SG_APP'),          # SG: Approach Shots
    ('02569', 'AVERAGE', 'SG_ATG'),          # SG: Around The Green
    ('02564', 'AVERAGE', 'SG_PUTT'),         # SG: Putting
    ('102', '%', 'Drive_Accuracy'),          # Driving Accuracy Percentage
    ('02401', 'AVG.', 'CHS (MPH)'),          # Club Head Speed
    ('103', '%', 'GIR_%'),                   # Greens In Regulation Percentage
    ('419', '%', 'GFTG_%'),                  # Going For The Green
    ('413', '%', 'One_Putt_%'),              # One-Putt Percentage
    ('484', '% MADE', 'Inside_10ft_%'),      # Putting - Inside 10'
    ('426', '%', '3_Putt_%'),                # 3-Putt Percentage
    ('142', 'AVG', 'Par_3_Avg.'),            # Par 3 Scoring Average
    ('143', 'AVG', 'Par_4_Avg.'),            # Par 4 Scoring Average
    ('144', 'AVG', 'Par_5_Avg.'),            # Par 5 Scoring Average
]


def fetch_stat(stat_id, year, renames, extra_columns=()):
    """Download one stat page for one season and return the selected columns.

    Parameters
    ----------
    stat_id : str
        Stat identifier inserted into ``STAT_URL``.
    year : str
        Season inserted into ``STAT_URL``.
    renames : dict
        Maps each page column to keep onto its name in the returned frame.
    extra_columns : sequence of str, optional
        Page columns to keep under their original names; they are placed
        between 'PLAYER NAME' and the renamed columns.

    Returns
    -------
    pandas.DataFrame
        'PLAYER NAME', then ``extra_columns``, then the renamed columns.

    Raises
    ------
    Whatever ``pd.read_html`` raises when the page cannot be fetched or parsed,
    ``IndexError`` if the page yields fewer than two tables, and ``KeyError``
    if a requested column is absent.
    """
    wanted = ['PLAYER NAME', *extra_columns, *renames]
    # NOTE(review): index 1 takes the second table parsed from the page; this
    # presumably is the stats table — confirm if the page layout changes.
    table = pd.read_html(STAT_URL.format(stat_id, year))[1]
    return table[wanted].rename(columns=renames)


# One merged frame per season; concatenated once after the loop instead of
# growing df_total on every iteration.
season_frames = []
for year in years:
    print(year)

    # Start from the scoring page, then attach every other stat by player name.
    df_merged = fetch_stat(SCORING_STAT_ID, year, {'AVG': 'Scoring_Avg.'},
                           extra_columns=['ROUNDS'])

    for stat_id, source_column, output_column in STAT_PAGES:
        stat = fetch_stat(stat_id, year, {source_column: output_column})
        # pd.merge defaults to an inner join, so a player missing from any one
        # stat page is dropped for that season.
        df_merged = pd.merge(df_merged, stat, on='PLAYER NAME')

    # Only keep players whose scoring average isn't null, and tag the season.
    # assign() returns a new frame, so no value is set on a filtered slice.
    df_merged = df_merged.loc[df_merged['Scoring_Avg.'].notna()].assign(Year=year)
    season_frames.append(df_merged)

# Stack all seasons; each season keeps its own row index, as before.
df_total = pd.concat(season_frames, axis=0)

2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [5]:
# Write the combined table to disk. The parent directory is created first so a
# missing data/ folder does not fail the save after the whole scrape has run.
output_path = Path('data/PGA_TOUR_Stats.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_total.to_csv(output_path)