In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
from bs4 import BeautifulSoup
import requests

In [2]:
# Stats to capture from pgatour.com, as (display text, stat id) pairs.
# The ids are kept as strings; several carry a leading zero (e.g. '02567').
# Pair order is the order in which the stats will be iterated.
_stat_pairs = [
    ('DRIVING DISTANCE', '101'),
    ('DRIVING ACCURACY', '102'),
    ('SG: OFF THE TEE', '02567'),
    ('SG: APP THE GREEN', '02568'),
    ('GIR %', '103'),
    ('PROXIMITY TO HOLE', '331'),
    ('SG: AROUND THE GREEN', '02569'),
    ('SAND SAVE %', '111'),
    ('SCRAMBLING', '130'),
    ('SG: PUTTING', '02564'),
    ('3-PUTT AVOIDANCE', '426'),
    ('BIRDIE OR BETTER CONVERSION %', '115'),
]
stats = dict(_stat_pairs)

# Seasons to scrape: the six seasons 2015 through 2020 inclusive, kept as
# strings so they can be dropped straight into URLs and file names.
years = [str(season) for season in range(2015, 2021)]

In [3]:
# Local directory where the scraped CSV files are written.
# Set the PGA_DATA_DIR environment variable to point somewhere else without
# editing the notebook; when it is unset the original location is used.
data_dir = os.environ.get(
    'PGA_DATA_DIR',
    'C:/Users/johnd/Desktop/Continuing Education/Udacity/Machine Learning Nanodegree Program/PGA Golf Predictor Idea/HTML Data',
)

In [4]:
# Scrape one pgatour.com page per (stat id, year) combination taken from
# `stats` and `years`, and save each page's table as a CSV under `data_dir`.

# Create the output directory up front so the first to_csv call cannot fail
# just because the folder does not exist yet.
os.makedirs(data_dir, exist_ok=True)

for stat in stats.values():
    for year in years:

        url = 'https://www.pgatour.com/content/pgatour/stats/stat.%s.y%s.html' % (stat, year)

        # Make a GET request to fetch the raw HTML content. The timeout keeps a
        # stalled connection from hanging the notebook, and raise_for_status
        # surfaces HTTP errors here instead of letting an error page be parsed.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the html content
        soup = BeautifulSoup(response.text, "lxml")

        # Locate the stats table; soup.find returns None when nothing matches,
        # so report the page and move on rather than crash on `None.thead`.
        table = soup.find("table", attrs={"class": "table-styled"})
        if table is None or table.thead is None or table.tbody is None:
            print('No stats table found at %s - skipping' % url)
            continue

        # Column headings come from the first header row
        # (RANK THIS WEEK, RANK LAST WEEK, PLAYER NAME, ...);
        # newlines become spaces and surrounding whitespace is trimmed.
        header_row = table.thead.find_all("tr")[0]
        headings = [th.text.replace('\n', ' ').strip() for th in header_row.find_all("th")]

        # One list of cleaned cell texts per row of the table body
        table_data = [
            [td.text.replace('\n', '').strip() for td in tr.find_all('td')]
            for tr in table.tbody.find_all('tr')
        ]

        # put list of data into dataframe, tagged with the year and stat id,
        # and indexed by the table's first column
        df = pd.DataFrame(table_data, columns=headings)
        df['Year'] = year
        df['Stat ID'] = stat
        df = df.set_index(df.columns[0])

        # save data to local directory (so we don't have to keep scraping).
        # The file name is the page title minus its last 17 characters
        # (presumably a fixed site suffix - TODO confirm), with ':' removed.
        page_name = soup.title.text[:-17].replace(':', '')
        df.to_csv(os.path.join(data_dir, '%s_%s.csv' % (year, page_name)))