# PGA Tour Player Performance: Web Scrape

In [1]:
#import modules
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import time

In [2]:
#players index page; the per-player links are scraped from it further down
url = "https://www.pgatour.com/players.html"

In [3]:
#Retrieve page with the requests module
#a timeout keeps a stalled connection from hanging the notebook indefinitely
response = requests.get(url, timeout=30)
#fail loudly on a 4xx/5xx reply instead of parsing an error page as the player list
response.raise_for_status()

In [4]:
#parse the downloaded markup with the 'html.parser' backend
soup = BeautifulSoup(response.text, features='html.parser')

### Step 1: Scrape Link to Each Player Stat Page

In [5]:
# #execute chromedriver
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)
# browser.visit(url)

In [6]:
#get html code via beautifulsoup
# html = browser.html
# soup = BeautifulSoup(html, 'html.parser')
#soup.prettify()

In [7]:
#collect every <span class="name"> element on the page;
#the loop in the next cell reads the <a> inside each one
players = soup.find_all('span', attrs={'class': 'name'})

In [8]:
#parallel lists: player_names[i] is the link text and player_url[i] its href
player_names = []
player_url = []

#walk each name span and pull the display name and stat-page link from its <a>
for player in players:
    link = player.a
    href = link.get('href') if link is not None else None
    #skip spans with no anchor or no href so the two lists stay index-aligned
    if href is None:
        continue
    #get name of player
    player_names.append(link.text)
    #get url for player performance page
    player_url.append(href)

### Step 2: Scrape PGA Performance Data for Each Individual Player

In [9]:
#urljoin also copes with hrefs that are already absolute or lack a leading
#slash, which plain string concatenation would mangle
from urllib.parse import urljoin

#create url for player
base_url = 'https://www.pgatour.com'
#index 107 is the sample player picked for this test run
test_url = player_url[107]
scrape_url = urljoin(base_url, test_url)
#scrape_url = "https://www.pgatour.com/players/player.01006.john-adams.html"

#go to url page; time out rather than hang, and stop on an HTTP error status
response = requests.get(scrape_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

#echo the resolved url as the cell output
scrape_url

'https://www.pgatour.com/players/player.52372.cameron-champ.html'

### Splinter Method

In [11]:
#start a visible (non-headless) Chrome session driven by chromedriver.exe
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [12]:
#open the player's page in the automated browser (scrape_url was built in the earlier cell)
browser.visit(scrape_url)

In [13]:
#click the link whose text contains "Performance" -- presumably this switches to the tab holding the stats scraped below
#NOTE(review): newer splinter releases expose this as browser.links.find_by_partial_text(...) -- confirm against the installed version
browser.click_link_by_partial_text("Performance")
#fixed one-second pause before the html is read in the next cell
#NOTE(review): a fixed sleep may be too short on a slow connection -- consider an explicit wait
time.sleep(1)

In [14]:
#snapshot the browser's current page source and re-parse it
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [103]:
#test to see if javascript rendered data was scraped properly
# test = soup.find('div', class_='wrap').find('div', class_='tabbable').find('div', class_="performance").find('div',class_='tab-content') \
# .find('div', class_='tab-pane')
# test

In [359]:
#scrape every <div class="item"> element on the page
info = soup.find_all('div', class_='item')

#player info stats: one {'caption', 'value'} dict per item that has both parts
info_stats = []

for item in info:
    #stat name lives in the 'denotation' div, stat value in the 'value' div
    caption_div = item.find('div', class_='denotation')
    value_div = item.find('div', class_='value')
    #items missing either div are not stat entries -- skip them
    if caption_div is None or value_div is None:
        continue
    #clean up text: drop non-breaking spaces and newlines from the value
    value = value_div.text.replace('\xa0', '').replace('\n', '')
    #save to dictionary and append to list
    info_stats.append({'caption': caption_div.text, 'value': value})

In [360]:
#scrape every <div class="stat"> element on the page
hstats = soup.find_all('div', class_='stat')

#headline stats: one {'caption', 'value'} dict per stat div that has both parts
h_stats = []
for hstat in hstats:
    #stat name lives in the 'caption' div, stat value in the 'value' div
    caption_div = hstat.find('div', class_='caption')
    value_div = hstat.find('div', class_='value')
    #stat divs missing either part carry no usable data -- skip them
    if caption_div is None or value_div is None:
        continue
    #save to dictionary and append to list
    h_stats.append({'caption': caption_div.text, 'value': value_div.text})

In [361]:
#scrape every table row on the page
astats = soup.find_all('tr')

#attribute stats: one {'caption', 'value'} dict per row that has both cells
a_stats = []

for astat in astats:
    #stat name lives in the 'caption' cell, stat value in the 'value' cell
    caption_cell = astat.find('td', class_='caption')
    value_cell = astat.find('td', class_='value')
    #rows missing either cell are not stat rows -- skip them
    if caption_cell is None or value_cell is None:
        continue
    #save to dictionary and append to list
    a_stats.append({'caption': caption_cell.text, 'value': value_cell.text})

In [242]:
#scrape for additional needed info: every table cell on the page
extrastats = soup.find_all('td')

#raw text of each cell, in document order; the cell that searches for
#sub-strings picks the entries it needs out of this list
extra_stats = [cell.text for cell in extrastats]

In [415]:
#these attributes are unique with no captions/values-All string format
#search sub_strings of desired variables for values
sub_strings = ['Total Left rough', 'Total Right rough', 'Possible Fwys', 'Distance Rank', 'Accuracy Rank',
      'Total Club Head Speed', 'Total Attempts']

extra_stats_var = []
for sub in sub_strings:
    #first scraped cell whose text mentions this stat, or None when the page has none
    match = next((s for s in extra_stats if sub in s), None)
    if match is None:
        continue
    #cell text has the form "<caption>:<value>"; split on the first colon only
    caption, sep, value = match.partition(':')
    #no colon means there is no value to record -- skip instead of raising IndexError
    if not sep:
        continue
    #drop every space from the value
    extra_stats_var.append({'caption': caption, 'value': value.replace(' ', '')})

In [347]:
def get_vars(stats, vars_wanted):
    """Return the de-duplicated rows of ``stats`` whose name is wanted.

    Args:
        stats: Iterable of dicts. The first value of each dict (in
            insertion order) is treated as the stat name and compared
            against ``vars_wanted``.
        vars_wanted: Container of stat names to keep.

    Returns:
        A list with one entry per kept dict, each entry being the list of
        that dict's values, in first-seen order. Exact duplicate rows are
        dropped; rows sharing a name but differing in value are all kept.
    """
    stats_vars = []
    #iterate through scraped data to find desired variables
    for list_item in stats:
        row = list(list_item.values())
        #an empty dict has no name to compare; skip it rather than raise IndexError
        if not row:
            continue
        #keep wanted rows only, and get rid of duplicates
        if row[0] in vars_wanted and row not in stats_vars:
            stats_vars.append(row)
    return stats_vars

In [348]:
#captions to keep from the player info stats
info_var = [
    'Height',
    'Weight',
    'AGE',
    'Turned Pro',
    'College',
    'Birthplace',
    'FEDEXCUP Rank',
    'FEDEXCUP Points',
    'Scoring Average',
]
#captions to keep from the headline stats
h_var = [
    'Total Distance',
    'Total Drives',
    '# of Drives',
    'Fairways Hit',
    'Possible Fairways',
    'Measured Rounds',
]
#captions to keep from the attribute (table row) stats
a_var = [
    'Driving Distance',
    'Driving Accuracy Percentage',
    'Total Driving',
    'Club Head Speed',
    'Distance from Edge of Fairway',
    'Left Rough Tendency',
    'Right Rough Tendency',
    'Total Driving Efficiency',
]
#captions to keep from the caption-less cells; same entries as sub_strings
extra_var = [
    'Total Left rough',
    'Total Right rough',
    'Possible Fwys',
    'Distance Rank',
    'Accuracy Rank',
    'Total Club Head Speed',
    'Total Attempts',
]

In [417]:
#filter each scraped collection down to the captions wanted from it
info_stats_vars = get_vars(stats=info_stats, vars_wanted=info_var)
h_stats_vars = get_vars(stats=h_stats, vars_wanted=h_var)
a_stats_vars = get_vars(stats=a_stats, vars_wanted=a_var)
extra_stats_vars = get_vars(stats=extra_stats_var, vars_wanted=sub_strings)

In [421]:
#single flat list of every [caption, value] row kept for analysis/visualization
all_var = [
    *info_stats_vars,
    *h_stats_vars,
    *a_stats_vars,
    *extra_stats_vars,
]
all_var

[['Height', '6 ft, 0 in'],
 ['Weight', '175lbs'],
 ['AGE', '23'],
 ['Turned Pro', '2017'],
 ['College', 'Texas A&M University '],
 ['Birthplace', 'Sacramento, California'],
 ['FEDEXCUP Rank', '39'],
 ['FEDEXCUP Points', '619'],
 ['Scoring Average', '70.916'],
 ['Measured Rounds', '38'],
 ['Total Distance', '29,047'],
 ['Total Drives', '92'],
 ['Total Distance', '163,169'],
 ['# of Drives', '536'],
 ['Fairways Hit', '354'],
 ['Possible Fairways', '647'],
 ['Driving Distance', '315.7'],
 ['Driving Accuracy Percentage', '54.71%'],
 ['Distance from Edge of Fairway', '30\' 4"'],
 ['Left Rough Tendency', '15.94%'],
 ['Right Rough Tendency', '18.60%'],
 ['Total Driving', '188'],
 ['Club Head Speed', '129.35'],
 ['Total Driving Efficiency', '334'],
 ['Total Left rough', '84'],
 ['Total Right rough', '98'],
 ['Possible Fwys', '527'],
 ['Distance Rank', '1'],
 ['Accuracy Rank', '187'],
 ['Total Club Head Speed', '4,397.91'],
 ['Total Attempts', '34']]

In [430]:
#photo: src of the first <img class="photo">, or None when the page has no such image
photo_tag = soup.find('img', class_='photo')
#guard the lookup so a page without a photo yields None instead of a TypeError
photo_url = photo_tag.get('src') if photo_tag is not None else None

'https://pga-tour-res.cloudinary.com/image/upload/c_fill,d_headshots_default.png,f_auto,g_face:center,h_350,q_auto,w_280/headshots_52372.png'

In [432]:
#list every <td class="date"> cell; the recorded output shows the date sequence repeating
soup.find_all('td', class_='date')

[<td class="date">10/07/18</td>,
 <td class="date">10/28/18</td>,
 <td class="date">11/04/18</td>,
 <td class="date">11/11/18</td>,
 <td class="date">11/18/18</td>,
 <td class="date">12/09/18</td>,
 <td class="date">1/06/19</td>,
 <td class="date">1/13/19</td>,
 <td class="date">1/27/19</td>,
 <td class="date">2/03/19</td>,
 <td class="date">2/10/19</td>,
 <td class="date">2/17/19</td>,
 <td class="date">3/10/19</td>,
 <td class="date">3/17/19</td>,
 <td class="date">4/21/19</td>,
 <td class="date">4/28/19</td>,
 <td class="date">5/05/19</td>,
 <td class="date">10/07/18</td>,
 <td class="date">10/28/18</td>,
 <td class="date">11/04/18</td>,
 <td class="date">11/11/18</td>,
 <td class="date">11/18/18</td>,
 <td class="date">12/09/18</td>,
 <td class="date">1/06/19</td>,
 <td class="date">1/13/19</td>,
 <td class="date">1/27/19</td>,
 <td class="date">2/03/19</td>,
 <td class="date">2/10/19</td>,
 <td class="date">2/17/19</td>,
 <td class="date">3/10/19</td>,
 <td class="date">3/17/19</t

In [470]:
#list every <b> element on the page to see what they contain
soup.find_all('b')

[<b class="sharing popover-visible" data-share-url=""><span class="icon" data-original-title="" style="display: none;" title=""></span><div class="popover fade bottom in" style="top: 0px; left: 0px; display: block; position: relative;"><div class="arrow"></div><h3 class="popover-title"></h3><div class="popover-content"><div class="share-popup"><div><h3>SHARE ON</h3><div class="link-box"><span class="facebook"></span><span class="twitter"></span><span class="pinterest"></span><span class="tumblr"></span><span class="mailto"></span></div></div></div></div></div></b>,
 <b class="caret icon-chevron-down"></b>,
 <b class="icon-chevron-down"></b>,
 <b>FULL</b>,
 <b class="sharing" data-share-url="https://www.pgatour.com/players/player.52372.cameron-champ.html"><span class="icon" data-original-title="" title=""></span></b>,
 <b>Achievements</b>,
 <b>Latest</b>,
 <b>Scoring</b>,
 <b>Hole 9</b>,
 <b><!-- react-text: 230 -->Legend<!-- /react-text --><!-- react-text: 231 --> <!-- /react-text --><

In [486]:
#every <div class="holder"> and every <td class="date"> on the page
bla = soup.find_all('div', class_='holder')
dates = soup.find_all('td', class_='date')

#start empty so both names exist even when no holder with a table body is found
tourneys = []
tourneys2 = []

for holder in bla:
    table_body = holder.find('tbody')
    #a holder without a <tbody> has nothing to read; skip it instead of raising AttributeError
    if table_body is None:
        continue
    #NOTE(review): each pass overwrites the previous one, so only the last
    #holder's matches are kept -- confirm whether accumulating was intended
    tourneys = table_body.find_all('b')
    tourneys2 = table_body.find_all('p')
tourneys2

[<p><b><!-- react-text: 148 -->Safeway Open<!-- /react-text --><!-- react-text: 149 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 166 -->Sanderson Farms Championship<!-- /react-text --><!-- react-text: 167 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 184 -->Shriners Hospitals for Children Open<!-- /react-text --><!-- react-text: 185 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 202 -->Mayakoba Golf Classic<!-- /react-text --><!-- react-text: 203 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 220 -->The RSM Classic<!-- /react-text --><!-- react-text: 221 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 238 -->QBE Shootout<!-- /react-text --><!-- react-text: 239 --> *<!-- /react-text --></b></p>,
 <p><b><!-- react-text: 256 -->Sentry Tournament of Champions<!-- /react-text --><!-- react-text: 257 --><!-- /react-text --></b></p>,
 <p><b><!-- react-text: 274 -->Sony Open in Hawaii<!-- /react-text --><!-- react-text: 275 --><!-- /react-tex

In [485]:
#display the <td class="date"> cells collected in the cell that builds `dates`
dates

[<td class="date">10/07/18</td>,
 <td class="date">10/28/18</td>,
 <td class="date">11/04/18</td>,
 <td class="date">11/11/18</td>,
 <td class="date">11/18/18</td>,
 <td class="date">12/09/18</td>,
 <td class="date">1/06/19</td>,
 <td class="date">1/13/19</td>,
 <td class="date">1/27/19</td>,
 <td class="date">2/03/19</td>,
 <td class="date">2/10/19</td>,
 <td class="date">2/17/19</td>,
 <td class="date">3/10/19</td>,
 <td class="date">3/17/19</td>,
 <td class="date">4/21/19</td>,
 <td class="date">4/28/19</td>,
 <td class="date">5/05/19</td>,
 <td class="date">10/07/18</td>,
 <td class="date">10/28/18</td>,
 <td class="date">11/04/18</td>,
 <td class="date">11/11/18</td>,
 <td class="date">11/18/18</td>,
 <td class="date">12/09/18</td>,
 <td class="date">1/06/19</td>,
 <td class="date">1/13/19</td>,
 <td class="date">1/27/19</td>,
 <td class="date">2/03/19</td>,
 <td class="date">2/10/19</td>,
 <td class="date">2/17/19</td>,
 <td class="date">3/10/19</td>,
 <td class="date">3/17/19</t