In [1]:
import pandas as pd

import logging 
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)

In [17]:
import re

In [2]:
cd ..

/Users/amlvt225/Code/GitHub/donkey_golf


In [3]:
import bs4

In [4]:
import requests

In [18]:
class PullPlayerInfo():
    
    def scrape_page(self):
        url = 'https://www.espn.com/golf/leaderboard'
        response = requests.get(url)

        self.soup = bs4.BeautifulSoup(response.text, "html.parser")

    def extract_main_table(self):
        # Grab the main leaderboard table
        table = self.soup.find('table', {'class': 'Table2__table-scroller Table2__right-aligned Table2__table'})

        # Grab all of the rows in a list
        self.rows = table.find_all('tr')
        
    def extract_data(self):
    
        # Dict to store all results
        self.output = {}

        # Loop thru all rows in table and pull data points
        # Don't want first row
        for player in self.rows[1:]:
            try:
                # Create clean dict to store results for each row
                new_dict = {}

                # Extract ESPN player ID from player URL
                url = player.find_all('a', href=True)[0]['href']
                player_id = re.sub("[^0-9]", "", url)   


                new_dict['player_page'] = url

                # Grab the URL for the country flag
                new_dict['country'] = player.find_all('img')[0]['src']    

                # Grab the player name
                new_dict['player_name'] = player.find_all('td')[2].text

                # Add results to dictionary
                self.output[player_id] = new_dict
            except Exception as e:
                logger.info("BAD ROW in BS4 ESPN table scrape - could just be the cut row")
                logger.info(e)
                
    def generate_output_data(self):
        self.data = pd.DataFrame.from_dict(self.output, orient='index',
                        columns=['player_name','player_page', 'country']).reset_index()
        
        self.data = self.data.rename(columns={'index':'player_id'})
        
    def run(self):
        self.scrape_page()
        self.extract_main_table()
        self.extract_data()
        self.generate_output_data()

In [19]:
yay = PullPlayerInfo()

In [20]:
yay.run()

In [13]:
yay.rows

[<tr class="Table2__header-row Table2__tr Table2__even"><th class="pos tl Table2__th" title=""><a class="clr-gray-04" data-sort-key="pos" href="/">POS</a></th><th class="name tl Table2__th" title=""><a class="clr-gray-04" data-sort-key="name" href="/">PLAYER</a></th><th class="toPar Table2__th" title=""><a class="clr-gray-04" data-sort-key="toPar" href="/">TO PAR</a><span class="dib arrow-icon_cont" style="width:10px;height:10px"><svg class="w-70 icon__svg" viewbox="0 0 24 24"><use xlink:href="#icon__caret__down"></use></svg></span></th><th class="today Table2__th" title=""><a class="clr-gray-04" data-sort-key="today" href="/">TODAY</a></th><th class="thru Table2__th" title=""><a class="clr-gray-04" data-sort-key="thru" href="/">THRU</a></th><th class="r1 Table2__th" title=""><a class="clr-gray-04" data-sort-key="r1" href="/">R1</a></th><th class="r2 Table2__th" title=""><a class="clr-gray-04" data-sort-key="r2" href="/">R2</a></th><th class="r3 Table2__th" title=""><a class="clr-gray-

In [31]:
new_dict = {}
for player in yay.rows[1:2]:

    # Grab the URL for the country flag
    new_dict['country'] = player.find_all('img')[0]['src']    

    # Grab the player name
    new_dict['player_name'] = player.find_all('td')[2].text
    
    print(player.find_all('td')[0].text)
    print(player.find_all('td')[1].text)
    print(player.find_all('td')[2].text)    

1
Webb Simpson
-5


In [21]:
yay.data

Unnamed: 0,player_id,player_name,player_page,country
0,10017,+4,http://www.espn.com/golf/player/_/id/10017/aus...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
1,10046,+1,http://www.espn.com/golf/player/_/id/10046/bry...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
2,10099,-2,http://www.espn.com/golf/player/_/id/10099/rom...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
3,10140,+1,http://www.espn.com/golf/player/_/id/10140/xan...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
4,10336,E,http://www.espn.com/golf/player/_/id/10336/jak...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
5,10364,E,http://www.espn.com/golf/player/_/id/10364/kur...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
6,10548,E,http://www.espn.com/golf/player/_/id/10548/mat...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
7,10577,E,http://www.espn.com/golf/player/_/id/10577/aar...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
8,1059,-1,http://www.espn.com/golf/player/_/id/1059/luke...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
9,10630,+6,http://www.espn.com/golf/player/_/id/10630/shu...,https://a.espncdn.com/combiner/i?img=/i/teamlo...


In [9]:
yay.scrape_page()

In [10]:
yay.extract_main_table()

In [11]:
yay.extract_data()

2019-07-18 07:39:56,823 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,824 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,829 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,831 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,834 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,836 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,837 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,839 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,846 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:56,848 - __main__ - INFO - BAD ROW in 

2019-07-18 07:39:57,042 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,043 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,044 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,045 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,046 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,047 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,048 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,049 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,049 - __main__ - INFO - BAD ROW in BS4 ESPN table scrape - could just be the cut row
2019-07-18 07:39:57,052 - __main__ - INFO - BAD ROW in 

Unnamed: 0,index,player_name,player_page,country
0,10046,Bryson DeChambeau,http://www.espn.com/golf/player/_/id/10046/bry...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
1,10048,Nick Hardy,http://www.espn.com/golf/player/_/id/10048/nic...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
2,10049,Sam Horsfield,http://www.espn.com/golf/player/_/id/10049/sam...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
3,10140,Xander Schauffele,http://www.espn.com/golf/player/_/id/10140/xan...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
4,10230,Clement Sordet,http://www.espn.com/golf/player/_/id/10230/cle...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
5,1030,Jhonattan Vegas,http://www.espn.com/golf/player/_/id/1030/jhon...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
6,1037,Scott Piercy,http://www.espn.com/golf/player/_/id/1037/scot...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
7,10442,Charlie Danielson,http://www.espn.com/golf/player/_/id/10442/cha...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
8,1049,Brian Davis,http://www.espn.com/golf/player/_/id/1049/bria...,https://a.espncdn.com/combiner/i?img=/i/teamlo...
9,10548,Matt Wallace,http://www.espn.com/golf/player/_/id/10548/mat...,https://a.espncdn.com/combiner/i?img=/i/teamlo...


In [107]:
for player in sample_rows:
    url = player.find_all('a', href=True)[0]['href']
    print(re.sub("[^0-9]", "", url))

3550
569
1293
10577
3470
769
5704
257
6798
10548
576
388
1037
9780
686
301
158
10140
153
1483
9037
8961
5532
3448
4355673
5408
1600
5467
110
5934
308
10442
208
72
1614
4364873
5285
462
4425897
5882
601
4304
5860
9261
377
9025
1612
1651
3351
2552
10046
1680
3702
9131
4827
10592
3599
9496
4362864
4425904
4321
1222
6007
5502
3322
4317
9364
10230
11350


In [96]:
gary.find_all('td')[2].text

'Gary Woodland'

In [53]:
gary = rows[1]

In [76]:
gary.find_all('img')[0]['src']

'https://a.espncdn.com/combiner/i?img=/i/teamlogos/countries/500/usa.png&w=40&h=40&scale=crop'

In [62]:
urls = gary.find_all('a', href=True)

In [65]:
urls

[<a class="leaderboard_player_name" href="http://www.espn.com/golf/player/_/id/3550/gary-woodland" to="http://www.espn.com/golf/player/_/id/3550/gary-woodland">Gary Woodland</a>]

In [98]:
cmon = urls[0]['href']

In [99]:
cmon

'http://www.espn.com/golf/player/_/id/3550/gary-woodland'

In [100]:
import re

In [101]:
re.findall('/id/', cmon)

['/id/']

In [102]:
re.sub("[^0-9]", "", cmon)

'3550'

In [36]:
body = table.find_all('td')

In [45]:
body[2]

<td class="tl Table2__td"><img class="flag mr2" src="https://a.espncdn.com/combiner/i?img=/i/teamlogos/countries/500/usa.png&amp;w=40&amp;h=40&amp;scale=crop"/><a class="leaderboard_player_name" href="http://www.espn.com/golf/player/_/id/3550/gary-woodland" to="http://www.espn.com/golf/player/_/id/3550/gary-woodland">Gary Woodland</a></td>

In [44]:
body[2].find_all('a')

[<a class="leaderboard_player_name" href="http://www.espn.com/golf/player/_/id/3550/gary-woodland" to="http://www.espn.com/golf/player/_/id/3550/gary-woodland">Gary Woodland</a>]

In [42]:
for i in range(15):
    print(f"I: {i}")
    print(body[i].text)

I: 0
1
I: 1
7
I: 2
Gary Woodland
I: 3
-9
I: 4
-6
I: 5
F
I: 6
68
I: 7
65
I: 8
--
I: 9
--
I: 10
133
I: 11
2
I: 12
1
I: 13
Justin Rose
I: 14
-7


In [38]:
body[3]

<td class="Table2__td">-9</td>