In [1]:
#Imports
import requests # Request module
import pandas as pd # Data Wrangling
import numpy as np # Data Wrangling
from bs4 import BeautifulSoup #Web sraping module

# 1st web-scraping

## Pull Headers from web

In [2]:

def get_headers(soup):
    """Collect the column titles used to label the scraped stats table.

    The first title is the text of the first element whose class is
    "rounds hidden-small hidden-medium". The remaining titles are the
    texts, in document order, of every element whose class is
    "col-stat hidden-small hidden-medium".

    Returns a list of strings. Raises IndexError when the page has no
    "rounds ..." element.
    """
    rounds_cells = soup.find_all(class_="rounds hidden-small hidden-medium")
    column_titles = [rounds_cells[0].get_text()]

    stat_cells = soup.find_all(class_="col-stat hidden-small hidden-medium")
    column_titles.extend(cell.get_text() for cell in stat_cells)

    return column_titles


## Pull players from web

In [3]:
#Get Players
def get_players(soup):
    """Return the text of every 'td a' link on the stats page except the first.

    The first match of the selector is dropped because the first line of
    every table is not useful; the remaining link texts are treated as the
    player names, in document order.
    """
    name_links = soup.select('td a')[1:]
    return [link.get_text() for link in name_links]

## Pull statistics from page

In [4]:
#Get Stats
def get_stats(soup, categories):
    """Group the page's stat cells into one list of texts per table row.

    Every element whose class is "hidden-small hidden-medium" is collected.
    The first one is skipped, and the rest are read in consecutive runs of
    `categories` cells. A trailing run that is shorter than `categories`
    is ignored.

    Returns a list of lists of strings.
    """
    cells = soup.find_all(class_="hidden-small hidden-medium")
    row_starts = range(1, len(cells) - categories + 1, categories)
    return [
        [cells[start + offset].get_text() for offset in range(categories)]
        for start in row_starts
    ]


## Create data dictionary for page

In [5]:
def stats_dict(players, stats):
    """Pair each player with the stats row found at the same position.

    Returns a dictionary whose keys are the entries of `players` and whose
    values are the matching entries of `stats` (each a list). A repeated
    player name keeps the row of its last occurrence. Raises IndexError
    when `stats` has fewer entries than `players`.
    """
    return {name: stats[position] for position, name in enumerate(players)}

## Use functions 1-4 to create a dataframe for a statistic: `make_dataframe`

In [6]:

##Mega function
def make_dataframe(url, categories):
    """Download one stats page and return it as a DataFrame of players.

    Parameters
    ----------
    url : str
        Address of the stats page to download.
    categories : int
        Number of stat cells that make up one player's row; forwarded to
        get_stats.

    Returns
    -------
    pandas.DataFrame
        A 'NAME' column holding the player names, followed by one column
        per title returned by get_headers.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        Raised by pandas when a player's stats row does not have exactly
        one value per header.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed (it would only
    # fail later with an unrelated-looking IndexError in get_headers).
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning, so the same page
    # could be parsed differently on another machine.
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. Column titles
    headers = get_headers(soup)

    # 2. Player names
    players = get_players(soup)

    # 3. One list of stat texts per player
    stats = get_stats(soup, categories)

    # 4. {player: stats row}
    stats_dictionary = stats_dict(players, stats)

    # The dict builds a frame with one column per player and one row per
    # header; transpose so that each player becomes a row.
    frame = pd.DataFrame(stats_dictionary, index=headers).T

    # Move the player names out of the index into a regular column.
    frame = frame.reset_index()

    # Name that column 'NAME' (and turn the new row labels into strings).
    frame = frame.rename(index=str, columns={'index': 'NAME'})

    return frame

## Loop through years 2010-2021 to create a dataframe from years 2010-2017


In [7]:
# Season year: passed to .format() on each stats URL and stored in the YEAR column.
year = 2014

In [8]:
# FedEx Cup table (stat 02671, 5 stat cells per player). The "{}" in the URL
# is the slot .format(year) fills, so the page follows the `year` variable.
fedex = make_dataframe("https://www.pgatour.com/content/pgatour/stats/stat.02671.y{}.html".format(year), 5)

In [9]:
# Display the scraped FedEx Cup frame.
fedex

Unnamed: 0,NAME,EVENTS,# OF WINS,# OF TOP-10S,POINTS BEHIND LEAD,RESET POINTS
0,Billy Horschel,27,2,5,,
1,Chris Kirk,28,2,5,1650,
2,Rory McIlroy,17,3,12,1700,
3,Jim Furyk,21,0,11,2300,
4,Bubba Watson,21,2,8,2465,
...,...,...,...,...,...,...
253,Paul Stankowski,3,0,,6804,
254,Scott Verplank,12,0,,6804,
255,Chris DiMarco,7,0,,6804,
256,Brett Quigley,1,0,,6804,


# 2nd Web Scraping

In [10]:
def get_headers2(soup):
    """Collect the stat column titles for the data frame.

    Returns the texts, in document order, of every element whose class is
    "col-stat hidden-small hidden-medium". No leading "rounds" title is
    added here.
    """
    stat_cells = soup.find_all(class_="col-stat hidden-small hidden-medium")
    return [cell.get_text() for cell in stat_cells]

In [11]:
#Get Players
def get_players2(soup):
    """Gather the player names from the stats page held in `soup`.

    Every 'td a' link is selected and the first one is discarded (the first
    line of every table is not useful). The texts of the remaining links
    are returned as a list.
    """
    links = soup.select('td a')
    return [anchor.get_text() for anchor in links[1:]]

In [12]:
#Get Stats
def get_stats2(soup, categories):
    """Split the page's stat cells into rows of `categories` texts each.

    All elements with class "hidden-small hidden-medium" are gathered. The
    cell at position 0 is skipped and the following cells are cut into
    consecutive chunks of `categories`. Leftover cells that do not fill a
    whole chunk are dropped.

    Returns a list of lists of strings.
    """
    cells = soup.find_all(class_="hidden-small hidden-medium")

    rows = []
    for start in range(1, len(cells) - categories + 1, categories):
        chunk = cells[start:start + categories]
        rows.append([cell.get_text() for cell in chunk])

    return rows


In [13]:
def stats_dict2(players, stats):
    """Build a {player: stats row} dictionary from two parallel lists.

    The i-th entry of `players` becomes a key whose value is the i-th entry
    of `stats` (a list). If a name appears more than once, its last row
    wins. Raises IndexError when `stats` runs out before `players` does.
    """
    return dict((name, stats[idx]) for idx, name in enumerate(players))

In [14]:
##Mega function
def make_dataframe2(url, categories):
    """Download one stats page and return it as a DataFrame of players.

    Uses the "2" helper family throughout (get_headers2, get_players2,
    get_stats2, stats_dict2), so no leading "rounds" column title is added.

    Parameters
    ----------
    url : str
        Address of the stats page to download.
    categories : int
        Number of stat cells that make up one player's row; forwarded to
        get_stats2.

    Returns
    -------
    pandas.DataFrame
        A 'NAME' column holding the player names, followed by one column
        per title returned by get_headers2.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        Raised by pandas when a player's stats row does not have exactly
        one value per header.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed as if it were
    # the stats table.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. Column titles
    headers = get_headers2(soup)

    # 2. Player names
    players = get_players2(soup)

    # 3. One list of stat texts per player (the section's own helper, in
    #    line with the other "2" helpers used here)
    stats = get_stats2(soup, categories)

    # 4. {player: stats row}
    stats_dictionary = stats_dict2(players, stats)

    # The dict builds a frame with one column per player and one row per
    # header; transpose so that each player becomes a row.
    frame = pd.DataFrame(stats_dictionary, index=headers).T

    # Move the player names out of the index into a regular column.
    frame = frame.reset_index()

    # Name that column 'NAME' (and turn the new row labels into strings).
    frame = frame.rename(index=str, columns={'index': 'NAME'})

    return frame

In [15]:
# Finish counts (stat 138, 3 stat cells per player), keeping only the name and
# the 1ST/2ND/3RD columns. The "{}" in the URL is the slot .format(year)
# fills, so the page follows the `year` variable.
top10 = make_dataframe2("https://www.pgatour.com/content/pgatour/stats/stat.138.y{}.html".format(year), 3)[['NAME', '1ST','2ND', '3RD']]

In [16]:
# Display the scraped 1ST/2ND/3RD frame.
top10

Unnamed: 0,NAME,1ST,2ND,3RD
0,Rory McIlroy,3,2,
1,Jim Furyk,,4,
2,Matt Kuchar,1,1,
3,Adam Scott,1,,1
4,Sergio Garcia,,3,2
...,...,...,...,...
170,Brice Garnett,,,
171,Danny Lee,,1,
172,Kyle Stanley,,,
173,Scott Stallings,1,,


# 3rd Web Scraping

In [17]:

def get_headers3(soup):
    """Return the column names to use for the data frame.

    The list starts with the text of the first
    "rounds hidden-small hidden-medium" element and continues with the text
    of each "col-stat hidden-small hidden-medium" element, in page order.
    An IndexError is raised if the page has no "rounds ..." element.
    """
    first_rounds_cell = soup.find_all(class_="rounds hidden-small hidden-medium")[0]
    titles = [first_rounds_cell.get_text()]

    titles += [
        stat_cell.get_text()
        for stat_cell in soup.find_all(class_="col-stat hidden-small hidden-medium")
    ]
    return titles


In [18]:
#Get Players
def get_players3(soup):
    """Read the player names off the stats page held in `soup`.

    Selects all 'td a' links, skips the first (the first line of every
    table is not useful) and returns the text of each remaining link.
    """
    return [tag.get_text() for tag in soup.select('td a')[1:]]

In [19]:
#Get Stats
def get_stats3(soup, categories):
    """Turn the page's stat cells into per-row lists of `categories` texts.

    Cells are the elements with class "hidden-small hidden-medium". The
    first cell is skipped; the rest are consumed `categories` at a time,
    and any incomplete group at the end is left out.

    Returns a list of lists of strings.
    """
    cells = soup.find_all(class_="hidden-small hidden-medium")

    rows = []
    for row_start in range(1, len(cells) - categories + 1, categories):
        row_end = row_start + categories
        rows.append([cells[k].get_text() for k in range(row_start, row_end)])

    return rows

In [20]:
def stats_dict3(players, stats):
    """Map every player to the stats list sitting at the same index.

    Walks `players` in order and stores stats[i] under the i-th name, so a
    name seen again replaces its earlier entry. Raises IndexError if
    `stats` is shorter than `players`.
    """
    by_player = {}
    by_player.update(
        (name, stats[row]) for row, name in enumerate(players)
    )
    return by_player

In [21]:
##Mega function
def make_dataframe3(url, categories):
    """Download one stats page and return it as a DataFrame of players.

    Parameters
    ----------
    url : str
        Address of the stats page to download.
    categories : int
        Number of stat cells that make up one player's row; forwarded to
        get_stats3.

    Returns
    -------
    pandas.DataFrame
        A 'NAME' column holding the player names, followed by one column
        per title returned by get_headers3.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        Raised by pandas when a player's stats row does not have exactly
        one value per header.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed (it would only
    # fail later with an unrelated-looking IndexError in get_headers3).
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. Column titles
    headers = get_headers3(soup)

    # 2. Player names
    players = get_players3(soup)

    # 3. One list of stat texts per player
    stats = get_stats3(soup, categories)

    # 4. {player: stats row}
    stats_dictionary = stats_dict3(players, stats)

    # The dict builds a frame with one column per player and one row per
    # header; transpose so that each player becomes a row.
    frame = pd.DataFrame(stats_dictionary, index=headers).T

    # Move the player names out of the index into a regular column.
    frame = frame.reset_index()

    # Name that column 'NAME' (and turn the new row labels into strings).
    frame = frame.rename(index=str, columns={'index': 'NAME'})

    return frame

In [22]:

    # Each page below is fetched for the season held in `year`: the "{}" in
    # every URL is the slot that .format(year) fills.

    # Scoring statistics. Keep rounds from this page, as it most accurately
    # reflects the total rounds a player completed in the season.
    scoring = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.120.y{}.html".format(year), 4)[['NAME', 'TOTAL ROUNDS', 'TOTAL STROKES', 'TOTAL ADJUSTMENT']]

    # Driving distance
    drivedistance = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.101.y{}.html".format(year), 3)[['NAME', 'TOTAL DISTANCE', 'TOTAL DRIVES']]

    # Driving accuracy
    driveacc = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.102.y{}.html".format(year), 3)[['NAME', 'FAIRWAYS HIT', 'POSSIBLE FAIRWAYS']]

    # Greens in regulation
    gir = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.103.y{}.html".format(year), 4)[['NAME', 'GREENS HIT', '# HOLES', 'RELATIVE/PAR']]

    # Strokes gained: putting
    sg_putting = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.02564.y{}.html".format(year), 3)[['NAME', 'TOTAL SG:PUTTING', 'MEASURED ROUNDS']]

    # Strokes gained: tee to green
    sg_teetogreen = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.02674.y{}.html".format(year), 5)[['NAME', 'SG:OTT', 'SG:APR', 'SG:ARG']]

    # Strokes gained: total
    sg_total = make_dataframe3("https://www.pgatour.com/content/pgatour/stats/stat.02675.y{}.html".format(year), 5)[['NAME', 'TOTAL SG:T', 'MEASURED ROUNDS']]

    # Frames to join onto the scoring frame, in order. sg_putting and
    # sg_total both carry a 'MEASURED ROUNDS' column, so the merge below
    # emits them with pandas' default _x/_y suffixes.
    data_frames = [drivedistance, driveacc, gir, sg_putting, sg_teetogreen, sg_total]

    # Merge everything on the player name. pd.merge defaults to an inner
    # join, so only players present on every page survive.
    df_merge = scoring
    for df in data_frames:
        df_merge = pd.merge(df_merge, df, on='NAME')

In [23]:
#merge fex ex cup points
df_merged = pd.merge(df_merge, fedex, how='outer', on='NAME')


#Merge top 10's
df_merged = pd.merge(df_merged, top10, how='outer', on='NAME')

        
#Add year column
df_merged['YEAR'] = year

In [24]:
# Drop the rows that have no 'TOTAL ROUNDS' value.
df_merged = df_merged.loc[~df_merged['TOTAL ROUNDS'].isna()]

In [25]:
# Save the names in the first 10 rows of the merged frame.
# NOTE(review): the row order here is whatever the preceding outer merges
# produced (pandas documents outer joins as sorting the join keys
# lexicographically) -- confirm the first 10 rows really are the intended
# winners before relying on this file.
actual_2014_winners = df_merged['NAME'].head(10)
# The file name follows `year` so it stays in step with the season scraped.
actual_2014_winners.to_csv('./data/winners{}.csv'.format(year))

In [26]:
# Write the merged season table; the file name follows `year` so it stays in
# step with the season that was scraped.
df_merged.to_csv('./data/{}.csv'.format(year))