# PGA Tour Player Performance: Web Scrape

In [187]:
#import modules
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import time

In [188]:
#URL of page to be scraped
url = 'https://www.pgatour.com/players.html'

In [189]:
#Retrieve page with the requests module
response = requests.get(url)

In [190]:
#create BeautifulSoup object; parse w 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

### Step 1: Scrape Link to Each Player Stat Page

In [191]:
# #execute chromedriver
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)
# browser.visit(url)

In [192]:
#get html code via beautifulsoup
# html = browser.html
# soup = BeautifulSoup(html, 'html.parser')
#soup.prettify()

In [193]:
#get names of all the player links
#retrieve the parent divs for all links
players = soup.find_all('span',class_="name")

In [194]:
#create empty list to accept data
player_names = []
player_url = []

#loop through each parent div and grab the link to the player stat page
for player in players:
    #get name of player
    player_names.append(player.a.text)
    #get url for player performance page
    player_url.append(player.a['href'])

### Step 2: Scrape PGA Performance Data for Each Individual Player

In [214]:
#create url for player
base_url = 'https://www.pgatour.com'
test_url = player_url[671]
scrape_url = base_url + test_url
#scrape_url = "https://www.pgatour.com/players/player.01006.john-adams.html"

#go to url page
response = requests.get(scrape_url)
soup = BeautifulSoup(response.text, 'html.parser')

scrape_url

'https://www.pgatour.com/players/player.29221.webb-simpson.html'

### Splinter Method

In [215]:
#execute chromedriver
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [216]:
browser.visit(scrape_url)

In [217]:
browser.click_link_by_partial_text("Performance")
time.sleep(1)

In [218]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [219]:
#test to see if javascript rendered data was scraped properly
# test = soup.find('div', class_='wrap').find('div', class_='tabbable').find('div', class_="performance").find('div',class_='tab-content') \
# .find('div', class_='tab-pane')
# test

In [220]:
#photo of player
try:
    photo_url = soup.find('img', class_='photo')['src']
    #player_intro = [{'Player Name':}]
except TypeError:
    photo_url = 'none'

In [221]:
#scrape all html hstat code
info = soup.findAll('div', class_='item')

#headline stats
info_stats = []

for i in info:
    try:
        #get stat name
        caption = i.find('div',class_='denotation').text
        #get stat value
        value = i.find('div', class_='value').text
        #clean up text
        value = value.replace('\xa0','')
        value = value.replace('\n','')
        #save to dictionary
        #post = {'caption':caption, 'value': value}
        post = {caption: value}
        #append to list
        info_stats.append(post)
    except AttributeError:
        nothing = 0

In [222]:
#scrape all html hstat code
hstats = soup.findAll('div', class_='stat')

#headline stats
h_stats = []
for hstat in hstats:
    try:
        #get stat name
        caption = hstat.find('div',class_='caption').text
        #get stat value
        value = hstat.find('div',class_='value').text
        
        #save to dictionary
        #post = {'caption':caption, 'value': value}
        post = {caption : value}

        #append to list
        h_stats.append(post)
    except AttributeError:
        nothing = 0

In [223]:
#scrape all html astat code
astats = soup.findAll('tr')

#attribute stats
a_stats = []

for astat in astats:
    try:
        #get the stat name
        caption = astat.find('td',class_='caption').text
        #get stat value
        value = astat.find('td',class_='value').text
        
        #save to dictionary
        post = {caption : value}
        
        #append to list
        a_stats.append(post)
    except AttributeError:
        nothing = 0

In [224]:
#scrape for additional needed info
extrastats = soup.findAll('td')

#attribute stats
extra_stats = []

for extra in extrastats:
    try:
        #get the stat name
        text = extra.text
        #append to list
        extra_stats.append(text)
    except AttributeError:
        nothing=0

In [225]:
#these attributes are unique with no captions/values-All string format
#search sub_strings of desired variables for values
sub_strings = ['Total Left rough', 'Total Right rough', 'Possible Fwys', 'Distance Rank', 'Accuracy Rank',
      'Total Club Head Speed', 'Total Attempts']

extra_stats_var= []
for sub in sub_strings:
    x = [s for s in extra_stats if sub in s]
    if x:
        x = x[0].split(':')
        x[1] = x[1].replace(' ','')

        #post = {'caption': x[0], 'value': x[1]}
        post = {x[0] : x[1]}
        extra_stats_var.append(post)
    else:
        nothing=0

In [226]:
#function that takes desired variables with your list of dictionaries scraped and returns a clean list of these variables
def get_vars(stats, vars_wanted):
    stats_vars = []
    items=[]
    #iterate through scraped data to find desired variables
    for list_item in stats:
        dict_item = [key for key,value in list_item.items()]
        if dict_item[0] in vars_wanted:
            #check for duplicates
            if dict_item not in items:
                items.append(dict_item)
                stats_vars.append(list_item)
    return stats_vars

In [227]:
#variables wanted from info stats
info_var = ['Height', 'Weight', 'AGE', 'Turned Pro', 'College', 'Birthplace' ,'FEDEXCUP Rank', 'FEDEXCUP Points', 'Scoring Average']
#variables wanted from headline stats
h_var = ['Total Distance', 'Total Drives', '# of Drives', 'Fairways Hit', 'Possible Fairways', 'Measured Rounds']
#variables wanted from additional stats
a_var = ['Driving Distance','Driving Accuracy Percentage','Total Driving','Club Head Speed',
         'Distance from Edge of Fairway','Left Rough Tendency','Right Rough Tendency','Total Driving Efficiency']
extra_var = ['Total Left rough', 'Total Right rough', 'Possible Fwys', 'Distance Rank', 'Accuracy Rank',
      'Total Club Head Speed', 'Total Attempts']

In [228]:
info_stats_vars = get_vars(info_stats, info_var)
h_stats_vars = get_vars(h_stats, h_var)
a_stats_vars = get_vars(a_stats, a_var)
extra_stats_vars = get_vars(extra_stats_var, sub_strings)

In [229]:
#combine all stat variables
# old method : all_stat_var = info_stats_vars + h_stats_vars + a_stats_vars + extra_stats_vars
all_stat_var = {}

a_stats_vars = {key: value for a_stat in a_stats_vars for key, value in a_stat.items()}
all_stat_var.update(a_stats_vars)

h_stats_vars = {key: value for h_stat in h_stats_vars for key, value in h_stat.items()}
all_stat_var.update(h_stats_vars)

extra_stats_vars = {key: value for extra_stat in extra_stats_vars for key, value in extra_stat.items()}
all_stat_var.update(extra_stats_vars)

info_stats_vars = {key: value for info_stat in info_stats_vars for key, value in info_stat.items()}
all_stat_var.update(info_stats_vars)

In [230]:
bla = soup.findAll('div', class_ = 'holder')
dates_ = soup.findAll('td', class_ = 'date')
rounds = soup.findAll('td', class_='round')

tourney_name = []
all_text = []
scores = []
to_par = []
pos = []
dates = []

for i in bla:
    x = i.find('tbody')
    #tourney info
    tourneys = x.findAll('p')
    #need this for all text
    tds = x.findAll('td')
    #get all text to use later for pos
    [all_text.append(td.text) for td in tds]
    
    #tournament names
    [tourney_name.append(j.text) for j in tourneys]

#clean dates
[dates.append(d.text) for d in dates_]
#scores of each round in increments of 4 ('--' means no score)
[scores.append(r.text) for r in rounds]
#now append tournament position results by getting list item after tournament name
[pos.append(all_text[all_text.index(tourney)+1]) for tourney in tourney_name]
#now append tournament position results by getting list item after tournament name
[to_par.append(all_text[all_text.index(tourney)+8]) for tourney in tourney_name]
#delete first one
print('done')

done


In [231]:
#create final dictionary of tournaments for the past year
tournament_history = []
for date,tourney,score,rank in zip(dates,tourney_name,to_par,pos):
    try:
        #create dictionary with all info
        post = {'Date':date, 
                'Tournament Name':tourney, 
                'Total Score':score, 
                'POS':rank}
        #append to final list
        tournament_history.append(post)
    except AttributeError:
        nothing=0

In [232]:
#output list
player_final = {}
player_final['player_intro'] = photo_url
player_final['all_stat_var'] = all_stat_var
player_final['tournament_hist'] = tournament_history
print('done')

done


## Titleist Site Web Scrape

In [233]:
#execute chromedriver
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [234]:
url = "https://www.titleist.com/tour/pga/players"
base_url = "https://www.titleist.com"
browser.visit(url)
time.sleep(2)

In [235]:
#close pop-up
try:
    browser.find_by_css("Button")[11].click()
except:
    print("No Pop Up")

In [236]:
#info we will scrape
done = 0
titleist_players = []

In [237]:
while done != 1:
    try:
#         browser.click_link_by_partial_text("Next")
#         time.sleep(1)
        
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        
        #scrape all html hstat code
        test = soup.findAll('li', class_='m-results-item')
        
        for t in test:

            #name of player
            n = t.find('h2', class_='m-results-label').text
            #format name to match other data scraped
            n = n.replace(' ', '')
            n = n.replace('\n', '')
            isupper = [letter.isupper() for letter in n]
            n = n + ", "
            #index of last name starting
            last_name_index = [i for i, x in enumerate(isupper) if x][1]
            #break up
            first_name = n[0:last_name_index]
            last_name = n[last_name_index:]
            n = last_name + first_name
            name = n

            #equipment using
            e = t.find('em').text
            #name.append(n)
            equipment = e

            if e == 'Brand Ambassador':
                player_page = t.find('h2', class_='m-results-label').a['href']
                club_url = base_url + player_page
                browser.visit(club_url)
                html = browser.html
                soup = BeautifulSoup(html, 'html.parser')
                try:
                    driver = soup.find('div', class_ = "m-category-listing-content").a.text
                    driver = driver.replace(" ", "")
                    driver = driver.replace("\n", "")
                    type_equipment = driver
                    browser.back()
                except:
                    golf_ball = t.findAll('p')
                    ball = [b.text for b in golf_ball]
                    ball = ball[1]
                    ball = ball.replace("Brand Ambassador", "")
                    type_equipment = ball
                    browser.back()
            else:
                golf_ball = t.findAll('p')
                ball = [b.text for b in golf_ball]
                ball = ball[1]
                ball = ball.replace("Golf Ball Player", "")
                type_equipment = ball
            
            #put needed info to dictionary
            post = {'Player Name': n, 
            'Type': equipment, 
            'Equipment': type_equipment, 
            }
            
            #append each player to list
            if (post['Type'] == "Brand Ambassador"):
                titleist_players.append(post)
                    
        browser.click_link_by_partial_text("Next")
        time.sleep(1)
    except:
        done=1    

In [244]:
for i in titleist_players:
    print(i['Player Name'])

Thomas, Justin
Cantlay, Patrick
Simpson, Webb
Scott, Adam
Poulter, Ian
Smith, Cameron
CabreraBello, Rafa
Spieth, Jordan
Bjerregaard, Lucas
HowellIII, Charles
HunAn, Byeong-
JaeIm, Sung-
Piercy, Scott
Hoffman, Charley
Harman, Brian
Hadley, Chesson
Kizzire, Patton
Homa, Max
Walker, Jimmy
Uihlein, Peter
Stallings, Scott
Henley, Russell
Bhullar, Gaganjeet
Dunne, Paul
T.Poston, J.
Burgoon, Bronson
Jones, Matt
Watney, Nick
Cauley, Bud
HoonKang, Sung-
Werenski, Richy
Horsfield, Sam
Ormsby, Wade
Coetzee, George
O'Hair, Sean
Malnati, Peter
Sloan, Roger
Johnston, Andrew
Streb, Robert
Albertson, Anders
Blaum, Ryan
Haas, Bill
Wagner, Johnson
McLachlin, Parker
Hoge, Thomas
Bozzelli, Dominic
Kim, Michael
Duncan, Tyler
Wiesberger, Bernd
Lebioda, Hank
Jaidee, Thongchai
Svensson, Adam
Fathauer, Derek
Karlsson, Robert
Martin, Ben
VanAswegen, Tyrone
Bourdy, Gregory
Hoffmann, Morgan
DuToit, Jared
Loupe, Andrew
Ogilvy, Geoff
Pastore, David
Thornberry, Braden
Holman, Nathan
DeLaet, Graham
Faxon, Brad
Haas, 

DEBUGGING

In [127]:
#execute chromedriver
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [161]:
url = "https://www.titleist.com/tour/pga/players?page=11"
base_url = "https://www.titleist.com"
browser.visit(url)
time.sleep(2)

In [129]:
#close pop-up
try:
    browser.find_by_css("Button")[11].click()
except:
    print("No Pop Up")

In [165]:
name = []
equipment = []
type_equipment = []

In [166]:
# for x in range(5):
#         browser.click_link_by_partial_text("Next")
#         time.sleep(1)
    
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
        
#scrape all html hstat code
test = soup.findAll('li', class_='m-results-item')
        
for t in test:
    
    #name of player
    n = t.find('h2', class_='m-results-label').text
    #format name to match other data scraped
    n = n.replace(' ', '')
    n = n.replace('\n', '')
    isupper = [letter.isupper() for letter in n]
    n = n + ", "
    #index of last name starting
    last_name_index = [i for i, x in enumerate(isupper) if x][1]
    #break up
    first_name = n[0:last_name_index]
    last_name = n[last_name_index:]
    n = last_name + first_name
    name.append(n)
            
    #equipment using
    e = t.find('em').text
    #name.append(n)
    equipment.append(e)
            
    if e == 'Brand Ambassador':
        player_page = t.find('h2', class_='m-results-label').a['href']
        club_url = base_url + player_page
        browser.visit(club_url)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        try:
            driver = soup.find('div', class_ = "m-category-listing-content").a.text
            driver = driver.replace(" ", "")
            driver = driver.replace("\n", "")
            type_equipment.append(driver)
            browser.back()
        except:
            golf_ball = t.findAll('p')
            ball = [b.text for b in golf_ball]
            ball = ball[1]
            ball = ball.replace("Brand Ambassador", "")
            print(ball)
            type_equipment.append(ball)
            browser.back()  
    else:
        golf_ball = t.findAll('p')
        ball = [b.text for b in golf_ball]
        ball = ball[1]
        ball = ball.replace("Golf Ball Player", "")
        type_equipment.append(ball)   

Titleist Pro V1 


In [37]:
name

['\n\n                                Lucas Bjerregaard\n                                    \n',
 '\n\n                                Branden Grace\n                                    \n']

In [316]:
test = name[0]
test = test.replace(' ', '')
test = test.replace('\n', '')

In [341]:
test = test + ", "

In [343]:
first_name = test[0:last_name_index]

In [345]:
last_name = test[last_name_index:]

In [347]:
final_name = last_name + first_name

In [348]:
final_name

'Thomas, Justin'

In [346]:
last_name

'Thomas, '

In [344]:
first_name

'Justin'

In [327]:
test[2].islower()

True

In [354]:
isupper = [letter.isupper() for letter in test]

In [355]:
last_name_index = [i for i, x in enumerate(isupper) if x][1]
last_name_index

6

In [331]:
first = []
last = []
for letter in test:
    if letter.isupper():
        first.append(letter)
        letter = letter.islower()
    while(letter.islower()):
        first.append(letter)

AttributeError: 'bool' object has no attribute 'islower'

In [315]:
upper = []
lower = []
for letter in test:
    print(letter.islower())

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
False
False
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [13]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [14]:
#scrape all html hstat code
test = soup.findAll('li', class_='m-results-item')
link = []
name = []
equipment = []

for t in test:
    l = t.find('h2', class_='m-results-label').a['href']
    n = t.find('h2', class_='m-results-label').text
    e = t.find('em').text
    
    link.append(l)
    name.append(n)
    equipment.append(e)

In [43]:
t

<li class="m-results-item">
<div class="m-results-media">
<a href="/tour/2778/branden-grace">
<picture>
<source srcset="//acushnet.scene7.com/is/image/titleist/player_100x100?$player=titleist/2016_branden_grace_thumbnail, //acushnet.scene7.com/is/image/titleist/player_200x200?$player=titleist/2016_branden_grace_thumbnail 2x"/>
<img alt="Branden Grace" src="//acushnet.scene7.com/is/image/titleist/player_100x100?$player=titleist/2016_branden_grace_thumbnail"/>
</picture>
</a>
</div><!--/.m-results-media-->
<div class="m-results-content">
<p class="titleist-tag">
<a href="/tour/all-players?tour=PGA">PGA</a>
</p>
<h2 class="m-results-label">
<a href="/tour/2778/branden-grace">
                                Branden Grace
                                    </a>
</h2>
<p><em>Golf Ball Player<br/></em>Titleist Pro V1x </p>
<p>World Rank: 46</p>
</div><!--/.m-results-content-->
</li>

In [196]:
type_[0] == 'Brand Ambassador'

True

In [284]:
bla = t.findAll('p')
ball = []
ball = [b.text for b in bla]

In [285]:
ball = ball[1]
ball = ball.replace("Golf Ball Player", "")
ball

'Titleist Pro V1x '

In [218]:
newstr = exstring.replace("Golf Ball Player", "")

In [219]:
newstr

'Titleist Pro V1x '

In [40]:
browser.visit("https://www.titleist.com/tour/2778/branden-grace")
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [41]:
bla = soup.find('div', class_ = "m-category-listing-content").a.text
bla

AttributeError: 'NoneType' object has no attribute 'text'

In [42]:
bla

NameError: name 'bla' is not defined

In [263]:
bla = bla.replace(" ", "")
bla = bla.replace("\n", "")

In [264]:
bla

'917D2Driver'