In [22]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import re

In [76]:
def scrape_data():
    
    url = 'https://www.pgatour.com/tournaments/2024/masters-tournament/R2024014/past-results'
    response = requests.get(url)

    print("HTTP Response Status Code:", response.status_code)

    soup = BeautifulSoup(response.content, 'html.parser')

    print("HTML Content (first 200 characters):", soup.prettify()[:200])

    table = soup.find('table', class_='chakra-table')
    # Check if the table was found
    if table:
        print("Table found!")

    data = []
    headers = []

    for row in table.find_all('tr'):
        cols = row.find_all(['th', 'td'])
        if not headers:
            headers = [header.text.strip() for header in cols]
        else:
            row_data = [col.text.strip() for col in cols]
            data.append(row_data)

    # Print the headers and a sample row to verify the data
    print("Headers:", headers)
    if data:
        print("Sample Row:", data[0])

    df = pd.DataFrame(data, columns=headers)

    return df

In [77]:
df = scrape_data()

HTTP Response Status Code: 200
HTML Content (first 200 characters): <!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <title>
   Masters Tournament 2024 Golf Leaderboard - PGA TOUR - Past Result
Table found!
Headers: ['Pos', 'Player', 'R1', 'R2', 'R3', 'R4', 'To Par', 'FedExCup Pts', 'Official Money']
Sample Row: ['1', 'Jon Rahm', '-7', '-3', '+1', '-3', '-12', '600.00', '$3,240,000']


In [78]:
df.head(50)

Unnamed: 0,Pos,Player,R1,R2,R3,R4,To Par,FedExCup Pts,Official Money
0,1,Jon Rahm,-7,-3,+1,-3,-12,600.0,"$3,240,000"
1,T2,Phil Mickelson,-1,-3,+3,-7,-8,0.0,"$1,584,000"
2,T2,Brooks Koepka,-7,-5,+1,+3,-8,0.0,"$1,584,000"
3,T4,Jordan Spieth,-3,-2,+4,-6,-7,126.667,"$744,000"
4,T4,Patrick Reed,-1,-2,E,-4,-7,0.0,"$744,000"
5,,,,,,,,,
6,T4,Russell Henley,+1,-5,-1,-2,-7,126.667,"$744,000"
7,T7,Cameron Young,-5,E,+3,-4,-6,97.0,"$580,500"
8,T7,Viktor Hovland,-7,+1,-2,+2,-6,97.0,"$580,500"
9,9,Sahith Theegala,+1,-2,+1,-5,-5,88.0,"$522,000"


In [79]:
# cleaning 
df.drop(columns=['FedExCup Pts','Official Money'], inplace=True)
df['Index'] = df.index
df.set_index('Index', inplace=True)
df['Player'] = df['Player'].str.replace(' \(a\)', '', regex=True)

def is_non_numeric_or_blank(s):
    return not bool(re.match(r'^\+?-?\d*\.?\d*$', str(s)))

columns_to_check = ['R1', 'R2', 'R3', 'R4', 'To Par']

# Convert non-numeric or blank values to '0'
for col in columns_to_check:
    df[col] = df[col].apply(lambda x: '0' if pd.isna(x) or is_non_numeric_or_blank(x) else x)

# Convert string representations of columns to integers if they represent numbers
scraped_data = df
for index, row in scraped_data.iterrows():
    for col in columns_to_check:
        val = row[col]
        if val.startswith('+'):
            val = int(val[1:])  # Convert to positive integer
        elif val.startswith('-'):
            val = -int(val[1:])  # Convert to negative integer
        else:
            val = 0  # Set to 0 for non-numeric or blank values
            
        # Update the value back to the DataFrame
        df.at[index, col] = val

ValueError: invalid literal for int() with base 10: ''

In [82]:
df.head(50)

Unnamed: 0_level_0,Pos,Player,R1,R2,R3,R4,To Par
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,Jon Rahm,-7,-3,1,-3,-12
1,T2,Phil Mickelson,-1,-3,3,-7,-8
2,T2,Brooks Koepka,-7,-5,1,3,-8
3,T4,Jordan Spieth,-3,-2,4,-6,-7
4,T4,Patrick Reed,-1,-2,0,-4,-7
5,,,0,0,0,0,0
6,T4,Russell Henley,1,-5,-1,-2,-7
7,T7,Cameron Young,-5,0,3,-4,-6
8,T7,Viktor Hovland,-7,1,-2,2,-6
9,9,Sahith Theegala,1,-2,1,-5,-5


In [34]:
df.head(100)

Unnamed: 0_level_0,Pos,Player,R1,R2,R3,R4,To Par
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,Jon Rahm,-7,-3,0,-3,-12
1,T2,Phil Mickelson,-1,-3,0,-7,-8
2,T2,Brooks Koepka,-7,-5,0,0,-8
3,T4,Jordan Spieth,-3,-2,0,-6,-7
4,T4,Patrick Reed,-1,-2,0,-4,-7
...,...,...,...,...,...,...,...
85,CUT,Sandy Lyle,0,0,0,0,0
86,W/D,Kevin Na,0,0,0,0,0
87,W/D,Will Zalatoris,0,0,0,0,0
88,W/D,Louis Oosthuizen,0,0,0,0,0


In [28]:
##df.replace('', np.nan, inplace=True)
#df.dropna(axis=0, how='any', inplace=True)
df['Player'] = df['Player'].str.replace(' \(a\)', '', regex=True)
#df = df[~df['Pos'].str.contains('CUT')]
columns_to_replace = ['R1', 'R2', 'R3', 'R4', 'To Par']
df[columns_to_replace] = df[columns_to_replace].replace('E', '0')
columns_to_replace = ['R1', 'R2', 'R3', 'R4', 'To Par']
df[columns_to_replace] = df[columns_to_replace].fillna('0')

Unnamed: 0_level_0,Pos,Player,R1,R2,R3,R4,To Par
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,Jon Rahm,-7,-3,+1,-3,-12
1,T2,Phil Mickelson,-1,-3,+3,-7,-8
2,T2,Brooks Koepka,-7,-5,+1,+3,-8
3,T4,Jordan Spieth,-3,-2,+4,-6,-7
4,T4,Patrick Reed,-1,-2,0,-4,-7
...,...,...,...,...,...,...,...
85,CUT,Sandy Lyle,+9,+11,0,0,+20
86,W/D,Kevin Na,-,0,0,0,
87,W/D,Will Zalatoris,-,0,0,0,
88,W/D,Louis Oosthuizen,+4,0,0,0,+4


In [30]:
players = df
players['id'] = players['Player'].str.replace(' ', '').str.upper()
df.drop(columns=['Pos','R1','R2','R3','R4','To Par'], inplace=True)
players.head()
players.to_csv("players.csv")

SyntaxError: EOF while scanning triple-quoted string literal (390595961.py, line 6)

In [4]:
players = pd.read_csv('app/players_tiered.csv')
players.head()

Unnamed: 0.1,Unnamed: 0,Golfer,Rank,Odds,Tier
0,2,Scottie Scheffler,1,7/1,1
1,3,Rory McIlroy,2,7/1,1
2,5,Viktor Hovland,4,35/1,1
3,6,Patrick Cantlay,5,18/1,1
4,14,Jordan Spieth,14,16/1,1


In [22]:
# Sort the DataFrame by the 'Rank' column in ascending order
df = df.sort_values(by='Rank')

# Calculate the number of players in each tier
num_players = len(df)
players_per_tier = num_players // 6

# Create a Tier column
df['Tier'] = (df.index // players_per_tier) + 1

# Reset the index to start from 1
df.index = df.index + 1

df = df.sort_values(by='Tier', ascending=True)

# Display the DataFrame
df.head(30)

Unnamed: 0,Golfer,Rank,Odds,Tier
2,Scottie Scheffler,1,7/1,1
3,Rory McIlroy,2,7/1,1
5,Viktor Hovland,4,35/1,1
6,Patrick Cantlay,5,18/1,1
14,Jordan Spieth,14,16/1,1
7,Xander Schauffele,6,22/1,1
4,Jon Rahm,3,9/1,1
12,Collin Morikawa,12,22/1,1
10,Brian Harman,9,130/1,1
13,Tyrrell Hatton,13,50/1,1


In [19]:


# Reset the index if you want to reindex the DataFrame
#df = df.reset_index(drop=True)

In [20]:
df.head(100)

Unnamed: 0,Golfer,Rank,Odds,Tier
1,Scottie Scheffler,1,7/1,1
3,Jon Rahm,3,9/1,1
2,Rory McIlroy,2,7/1,1
5,Patrick Cantlay,5,18/1,1
14,Tommy Fleetwood,15,50/1,1
...,...,...,...,...
82,Larry Mize,>200,5000/1,6
74,Aldrich Potgieter,>200,2500/1,6
72,Gordon Sargent,>200,1000/1,6
73,Sam Bennett,>200,2000/1,6


In [24]:
df.to_csv("app/static/players_tiered.csv")