In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: bradklassen
"""

from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.error import URLError
from tqdm import tqdm
import re
import json

In [17]:
def process_tournament_liv(tournament_id):
    tournament_id = str(tournament_id)
    url = "https://web-common.livgolf.com/api/leaderboard/players/" + tournament_id

    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    
    # Extract the text from the <p> tag containing the JSON-like content
    json_text = soup.find('p').get_text()

    # Convert the JSON text to a Python dictionary
    data = json.loads(json_text)

    player_results = []

    for player_dict in data["players"]:
        # List of keys to keep
        keys_to_keep = ['id', 'name', 'position', 'totalScore']

        # Create a new dictionary with only the keys you want
        new_d = {key: player_dict[key] for key in keys_to_keep if key in player_dict}

        new_d['player_id_liv'] = new_d.pop('id')

        # Add round scores as separate keys in the new dictionary
        for idx, round_info in enumerate(player_dict['rounds'], start=1):
            new_d[f'R{idx}_topar'] = round_info['score']

        player_results.append(new_d)

    tournament_results = pd.DataFrame(player_results)
    
    tournament_results["tournament_id_liv"] = tournament_id
    
    # gather tournament par information
    url = "https://web-common.livgolf.com/api/leaderboard/courses/" + tournament_id

    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')

    # Extract the text from the <p> tag containing the JSON-like content
    json_text = soup.find('p').get_text()

    # Convert the JSON text to a Python dictionary
    data = json.loads(json_text)

    round_par = int(data["rounds"][0]["totalPar"])
    
    for key in ["R1", "R2", "R3"]:
        tournament_results[f"{key}_topar"] = tournament_results[f"{key}_topar"].replace('E', '0')
        tournament_results[f"{key}_topar"] = np.where(tournament_results[f"{key}_topar"] == '',
                                                            np.nan,
                                                            tournament_results[f"{key}_topar"])

        tournament_results[f"{key}_topar"] = tournament_results[f"{key}_topar"].astype(float)
        tournament_results[key] = tournament_results[f"{key}_topar"] + round_par

    tournament_results["AGG"] = tournament_results["R1"] + tournament_results["R2"] + tournament_results["R3"]
    return tournament_results

In [8]:
def process_tournament_owgr(tournament_id):
    tournament_id = str(tournament_id)
    url = "https://www.owgr.com/events/" + tournament_id

    try:
        # get the Scores:

        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        
        # Find the table
        table = soup.find('table')

        # Get all rows from the table
        rows = table.find_all('tr')

        # Initialize a list to store table data
        table_data = []

        # Iterate through each row
        for row in rows:
            # Get all cells (both headers and regular data)
            cells = row.find_all(['th', 'td'])
            
            # Initialize a list to store row data
            row_data = []
            
            # Iterate through each cell
            for cell in cells:
                # Check if the cell contains a link (for player profile)
                a_tag = cell.find('a', href=True)
                if a_tag:
                    # Extract the href attribute (player profile link)
                    player_id = a_tag['href'].split('-')[-1]  # Extract player ID (e.g., 'greg-dalziel-28451')
                    row_data.append(player_id)    
                       
                # adding data
                text = cell.get_text(strip=True)

                # this is special case for first row, want to add in player ids, which we do above by searching a tags
                if text == "NAME":
                    row_data.append("player_id")

                # another special case when bonus points are awarded
                if "Bonus Points" in text:
                    row_data.append("")

                # always append the text
                row_data.append(text)
            
            # Add non-empty row data to the table_data list
            if row_data:
                table_data.append(row_data)

        # Get metadata:
        
        # Dictionary to store the event information
        event_info = {}

        # Find the div containing the week info (span with the specific classes)
        week_spans = soup.find_all('span', class_='boldedString_content__bolded__QAKW8')
        if week_spans and len(week_spans) >= 2:
            week_number = week_spans[0].get_text(strip=True)
            week_value = week_spans[1].get_text(strip=True)
            event_info['Week'] = f"{week_number} {week_value}"

        # Find the exact event date by targeting spans with specific text pattern
        date_spans = soup.find_all('span', class_='boldedString_content__normal__NreGx')
        if date_spans and len(date_spans) >= 2:
            # Ensuring that only the relevant spans are captured (month/day/year)
            event_date = ' '.join([span.get_text(strip=True) for span in date_spans[:3]])
            event_info['Event Date'] = event_date

        # Find the div with event title (event name)
        event_title = soup.find('div', class_='eventTitleComponent_name__C2ZKJ')
        if event_title:
            event_info['Event Title'] = event_title.get_text(strip=True)

        # Find the div with the field rating
        field_rating = soup.find('div', class_='eventTitleComponent_rating__4UcQg')
        if field_rating:
            event_info['Field Rating'] = re.search(r'\d+\.\d+', field_rating.get_text(strip=True)).group()

        event_info["tournament_id"] = tournament_id
        # tour information

        # Find the script tag with the specific id and type
        script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})

        # Extract the content inside the tag (which is a JSON string)
        json_data = script_tag.string

        # Parse the JSON string into a Python dictionary
        data = json.loads(json_data)

        tour_info = data["props"]["pageProps"]["eventDetailsData"]["eventDetails"]["tours"][0]
        
        merged_dict = dict(event_info, **tour_info)

        scores_df = pd.DataFrame(table_data[1:], columns=table_data[0])
        scores_df["tournament_id"] = tournament_id

        return scores_df, pd.DataFrame([merged_dict])
    
    except URLError as e:
        print(f"Error opening URL for tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on error
    
    except Exception as e:
        print(f"Error processing tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on any other error

In [18]:
df_liv = process_tournament_liv(6)
df_liv

Unnamed: 0,name,position,totalScore,player_id_liv,R1_topar,R2_topar,R3_topar,tournament_id_liv,R1,R2,R3,AGG
0,Eugenio Chacarra,1.0,-19,57,-7.0,-9.0,-3.0,6,65.0,63.0,69.0,197.0
1,Patrick Reed,2.0,-16,54,-4.0,-7.0,-5.0,6,68.0,65.0,67.0,200.0
2,Paul Casey,3.0,-15,58,-1.0,-7.0,-7.0,6,71.0,65.0,65.0,201.0
3,Richard Bland,4.0,-15,2,-7.0,-4.0,-4.0,6,65.0,68.0,68.0,201.0
4,Sihwan Kim,5.0,-15,17,-5.0,-6.0,-4.0,6,67.0,66.0,68.0,201.0
5,James Piot,6.0,-14,31,-3.0,-7.0,-4.0,6,69.0,65.0,68.0,202.0
6,Harold Varner III,7.0,-14,65,-5.0,-6.0,-3.0,6,67.0,66.0,69.0,202.0
7,Charles Howell III,8.0,-13,59,-3.0,-5.0,-5.0,6,69.0,67.0,67.0,203.0
8,Brooks Koepka,9.0,-13,52,-5.0,-5.0,-3.0,6,67.0,67.0,69.0,203.0
9,Abraham Ancer,10.0,-11,49,-3.0,-4.0,-4.0,6,69.0,68.0,68.0,205.0


In [9]:
df_pga, _ = process_tournament_owgr(8187)
df_pga

Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,R1,R2,R3,R4,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id
0,1,,22085,Collin Morikawa,67,64,68,66,265,100.00000,4,3,8187
1,2,,14636,Jordan Spieth,65,67,69,66,267,60.00000,23,14,8187
2,T3,,7672,Louis Oosthuizen,64,65,69,71,269,35.00000,13,9,8187
3,T3,,19195,Jon Rahm,71,64,68,66,269,35.00000,2,1,8187
4,5,,11955,Dylan Frittelli,66,67,70,68,271,24.00000,106,78,8187
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,MC,,1547,Phil Mickelson,80,72,-,-,152,0.00000,32,32,8187
152,MC,,28616,Daniel Croft,76,78,-,-,154,0.00000,-,1892,8187
153,MC,,16165,Yuki Inamori,75,81,-,-,156,0.00000,139,147,8187
154,MC,,15885,Deyen Lawson,80,77,-,-,157,0.00000,633,637,8187


In [22]:
df_merged = df_liv.merge(df_pga, how="inner", left_on="name", right_on="NAME")[["name", "player_id_liv", "player_id"]]
df_merged

Unnamed: 0,name,player_id_liv,player_id
0,Patrick Reed,54,14838
1,Paul Casey,58,7108
2,Richard Bland,2,6516
3,Harold Varner III,65,16602
4,Brooks Koepka,52,16243
5,Abraham Ancer,49,18238
6,Ian Poulter,32,1435
7,Bryson DeChambeau,50,19841
8,Matt Jones,13,8605
9,Phil Mickelson,43,1547


In [41]:
def get_tournament_info_liv(year):
    year = 2024
    year = str(year)
    url = "https://web-common.livgolf.com/api/events/" + year

    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    
    # Extract the text from the <p> tag containing the JSON-like content
    json_text = soup.find('p').get_text()

    # Convert the JSON text to a Python dictionary
    data = json.loads(json_text)

    tournaments = []

    for tournament_dict in data["events"]:
        # List of keys to keep
        keys_to_keep = ['eventId', 'eventLivId', 'eventTitle', 'endDate']

        # Create a new dictionary with only the keys you want
        new_d = {key: tournament_dict[key] for key in keys_to_keep if key in tournament_dict}

        tournaments.append(new_d)

    tournament_info = pd.DataFrame(tournaments)
    
    return tournament_info

In [42]:
get_tournament_info_liv(2024)

Unnamed: 0,eventId,eventLivId,eventTitle,endDate
0,24,10022,Mayakoba,2024-02-04T18:00:00.000-05:00
1,25,10021,Las Vegas,2024-02-10T15:40:00.000-08:00
2,26,10024,Jeddah,2024-03-03T16:00:00.000+03:00
3,27,10019,Hong Kong,2024-03-10T17:30:00.000+08:00
4,28,10028,Miami,2024-04-07T18:00:00.000-04:00
5,29,10016,Adelaide,2024-04-28T16:30:00.000+09:30
6,30,10025,Singapore,2024-05-05T13:15:00.000+08:00
7,31,10020,Houston,2024-06-09T17:00:00.000-06:00
8,32,10023,Nashville,2024-06-23T17:30:00.000-05:00
9,33,10017,Andalucía,2024-07-14T18:15:00.000+02:00


In [24]:
def process_tournaments_liv(tournament_ids):
    all_table_data = []

    # Using ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_tournament_liv, tid): tid for tid in tournament_ids}

        # Use tqdm to show progress
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing tournaments"):
            try:
                table_data = future.result()
                all_table_data.append(table_data)
            except Exception as e:
                print(f"Error processing tournament {}: {e}")

    # Concatenate all DataFrames collected
    concatenated_table_data = pd.concat(all_table_data, ignore_index=True) if all_table_data else pd.DataFrame()

    return concatenated_table_data

In [25]:
# gathering all liv tournaments
tournament_ids = list(range(1, 37))
tournament_ids.remove(8)

liv_results = process_tournaments_liv(tournament_ids)
liv_results

Processing tournaments:  40%|████      | 14/35 [00:02<00:03,  5.33it/s]

Error processing tournament: 'R2_topar'


Processing tournaments: 100%|██████████| 35/35 [00:06<00:00,  5.25it/s]


Unnamed: 0,name,position,totalScore,player_id_liv,R1_topar,R2_topar,R3_topar,tournament_id_liv,R1,R2,R3,AGG,R4_topar
0,Branden Grace,1.0,-13,9,-3.0,-3.0,-7.0,2,69.0,69.0,65.0,203.0,
1,Carlos Ortiz,2.0,-11,56,-5.0,-3.0,-3.0,2,67.0,69.0,69.0,205.0,
2,Patrick Reed,3.0,-9,54,0.0,-4.0,-5.0,2,72.0,68.0,67.0,207.0,
3,Dustin Johnson,4.0,-9,12,-4.0,-4.0,-1.0,2,68.0,68.0,71.0,207.0,
4,Louis Oosthuizen,5.0,-7,27,-1.0,-3.0,-3.0,2,71.0,69.0,69.0,209.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,Branden Grace,52.0,+8,9,4.0,2.0,2.0,31,76.0,74.0,74.0,224.0,
1687,Ben Campbell,53.0,,137,-5.0,,,31,67.0,,,,
1688,Wade Ormsby,54.0,,28,1.0,-1.0,,31,73.0,71.0,,,
1689,Louis Oosthuizen,,-3,27,-3.0,,,31,69.0,,,,


In [28]:
liv_names = liv_results[["name", "player_id_liv"]].drop_duplicates("name")
liv_names

Unnamed: 0,name,player_id_liv
0,Branden Grace,9
1,Carlos Ortiz,56
2,Patrick Reed,54
3,Dustin Johnson,12
4,Louis Oosthuizen,27
...,...,...
862,Lucas Herbert,150
886,Adrian Meronk,152
994,Anthony Kim,151
1260,Ben Campbell,137


In [26]:
liv_results.tournament_id_liv.unique()

array(['2', '3', '7', '9', '1', '5', '6', '14', '11', '15', '12', '4',
       '13', '20', '19', '18', '16', '23', '24', '10', '26', '25', '27',
       '30', '17', '28', '33', '29', '32', '22', '34', '36', '35', '31'],
      dtype=object)

In [28]:
tournament_id = 30
tournament_id = str(tournament_id)
url = "https://web-common.livgolf.com/api/leaderboard/players/" + tournament_id

html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
soup

<html><body><p>{"timeStamp":1728660189484,"players":[{"id":"52","name":"Brooks Koepka","firstName":"Brooks","lastName":"Koepka","media":{"bynderMobileImage":null,"mobileImage":{"url":"https://images.ctfassets.net/24ldqbofbj92/awUPgl59AJbNaP0xwHL3Z/9a0c030ea09b3a86f180aed16c86eb7d/Koepka_Brooks_1.png"}},"playerImages":[],"position":1,"rank":"1","status":"Playing","team":{"colour":"#1e00e9","media":{"bynderMobileImage":null,"mobileImage":{"url":"https://images.ctfassets.net/24ldqbofbj92/7AOCF2AOb1yKMaqMxucpuP/4b0d41f45bdd859ddda3a2bce1e58cbd/Logos_Smash_New_png.png"}},"name":"Smash GC","isCaptain":true,"isCoCaptain":false},"rounds":[{"id":"1","score":"-5"},{"id":"2","score":"-7"},{"id":"3","score":"-3"}],"totalScore":"-15"},{"id":"63","name":"Cameron Smith","firstName":"Cameron","lastName":"Smith","media":{"bynderMobileImage":null,"mobileImage":{"url":"https://images.ctfassets.net/24ldqbofbj92/5s8AFuRWPTfzGZSuXw5gxZ/8c9d561e45267a1686e22b277d54f47a/Smith_Cameron_05.PNG"}},"playerImages":

In [29]:
# Extract the text from the <p> tag containing the JSON-like content
json_text = soup.find('p').get_text()

# Convert the JSON text to a Python dictionary
data = json.loads(json_text)

player_results = []

for player_dict in data["players"]:
    # List of keys to keep
    keys_to_keep = ['id', 'name', 'position', 'totalScore']

    # Create a new dictionary with only the keys you want
    new_d = {key: player_dict[key] for key in keys_to_keep if key in player_dict}

    new_d['player_id'] = new_d.pop('id')

    # Add round scores as separate keys in the new dictionary
    for idx, round_info in enumerate(player_dict['rounds'], start=1):
        new_d[f'R{idx}_topar'] = round_info['score']

    player_results.append(new_d)

tournament_results = pd.DataFrame(player_results)
tournament_results

Unnamed: 0,name,position,totalScore,player_id,R1_topar,R2_topar,R3_topar
0,Brooks Koepka,1,-15,52,-5,-7,-3
1,Cameron Smith,2,-13,63,E,-6,-7
2,Marc Leishman,3,-13,66,-4,-4,-5
3,Talor Gooch,4,-12,8,-3,-5,-4
4,Tyrrell Hatton,5,-11,148,-3,-3,-5
5,Thomas Pieters,6,-11,78,-5,-4,-2
6,Dustin Johnson,7,-10,12,-4,-1,-5
7,Joaquin Niemann,8,-10,64,-4,-2,-4
8,Kevin Na,9,-10,24,-5,-1,-4
9,Jon Rahm,10,-9,147,-2,-3,-4


In [20]:
year = 2024
year = str(year)
url = "https://web-common.livgolf.com/api/events/" + year

html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
soup

<html><body><p>{"events":[{"day1Itinerary":null,"day2Itinerary":null,"day3Itinerary":null,"day4Itinerary":null,"eventId":"24","eventLivId":"10022","eventTitle":"Mayakoba","bynderEventLogoNextEvent":null,"bynderEventLogoListing":null,"eventLogoNextEvent":null,"eventLogoListing":null,"eventLogoListingMobile":null,"eventType":"League","startDate":"2024-02-02T13:15:00.000-05:00","endDate":"2024-02-04T18:00:00.000-05:00","sponsor":null,"bynderMegaNavImage":null,"bynderEventListingImage":null,"day1TeeTime":"2024-02-02T13:15:00.000-05:00","day2TeeTime":"2024-02-03T12:45:00.000-05:00","day3TeeTime":"2024-02-04T13:05:00.000-05:00","day4TeeTime":null,"day1FeaturedPairings":null,"day2FeaturedPairings":null,"day3FeaturedPairings":null,"day4FeaturedPairings":null,"isChampionship":null,"megaNavImage":{"title":"MYK - El Cama4","url":"https://images.ctfassets.net/24ldqbofbj92/PYXP7a0NMSZrgnEW09NKh/0d374a0f5f2285f0b1c4bb2001e2739b/MYK_-_El_Camaleo__n_4.jpg","width":2048,"height":1323},"eventListingImag

In [21]:
# Extract the text from the <p> tag containing the JSON-like content
json_text = soup.find('p').get_text()

# Convert the JSON text to a Python dictionary
data = json.loads(json_text)

tournaments = []

for tournament_dict in data["events"]:
    # List of keys to keep
    keys_to_keep = ['eventId', 'eventLivId', 'eventTitle', 'endDate']

    # Create a new dictionary with only the keys you want
    new_d = {key: tournament_dict[key] for key in keys_to_keep if key in tournament_dict}

    tournaments.append(new_d)

tournament_info = pd.DataFrame(tournaments)
tournament_info

Unnamed: 0,eventId,eventLivId,eventTitle,endDate
0,24,10022,Mayakoba,2024-02-04T18:00:00.000-05:00
1,25,10021,Las Vegas,2024-02-10T15:40:00.000-08:00
2,26,10024,Jeddah,2024-03-03T16:00:00.000+03:00
3,27,10019,Hong Kong,2024-03-10T17:30:00.000+08:00
4,28,10028,Miami,2024-04-07T18:00:00.000-04:00
5,29,10016,Adelaide,2024-04-28T16:30:00.000+09:30
6,30,10025,Singapore,2024-05-05T13:15:00.000+08:00
7,31,10020,Houston,2024-06-09T17:00:00.000-06:00
8,32,10023,Nashville,2024-06-23T17:30:00.000-05:00
9,33,10017,Andalucía,2024-07-14T18:15:00.000+02:00


In [30]:
tournament_id = 30
tournament_id = str(tournament_id)
url = "https://web-common.livgolf.com/api/leaderboard/courses/" + tournament_id

html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
soup

<html><body><p>{"holes":[{"id":"1","par":"4","yardage":"427"},{"id":"2","par":"3","yardage":"168"},{"id":"3","par":"4","yardage":"495"},{"id":"4","par":"5","yardage":"587"},{"id":"5","par":"4","yardage":"486"},{"id":"6","par":"4","yardage":"444"},{"id":"7","par":"5","yardage":"587"},{"id":"8","par":"3","yardage":"216"},{"id":"9","par":"4","yardage":"452"},{"id":"10","par":"4","yardage":"402"},{"id":"11","par":"4","yardage":"412"},{"id":"12","par":"4","yardage":"493"},{"id":"13","par":"4","yardage":"450"},{"id":"14","par":"3","yardage":"202"},{"id":"15","par":"4","yardage":"428"},{"id":"16","par":"4","yardage":"412"},{"id":"17","par":"3","yardage":"182"},{"id":"18","par":"5","yardage":"563"}],"rounds":[{"id":"1","totalPar":"71","totalYardage":"7426","holes":[{"id":"1","par":"4","yardage":"427"},{"id":"2","par":"3","yardage":"176"},{"id":"3","par":"4","yardage":"495"},{"id":"4","par":"5","yardage":"587"},{"id":"5","par":"4","yardage":"486"},{"id":"6","par":"4","yardage":"444"},{"id":"7",

In [34]:
# Extract the text from the <p> tag containing the JSON-like content
json_text = soup.find('p').get_text()

# Convert the JSON text to a Python dictionary
data = json.loads(json_text)

round_pars = {}

for tournament_dict in data["rounds"]:
    
    # Create a new dictionary with only the par
    round_pars[f"R{tournament_dict['id']}"] = int(tournament_dict['totalPar'])

round_pars

{'R1': 71, 'R2': 71, 'R3': 71}

In [35]:
for key in round_pars.keys():
    tournament_results[f"{key}_topar"] = tournament_results[f"{key}_topar"].replace('E', '0')
    tournament_results[key] = tournament_results[f"{key}_topar"].astype(int) + round_pars[key]

tournament_results

Unnamed: 0,name,position,totalScore,player_id,R1_topar,R2_topar,R3_topar,R1,R2,R3
0,Brooks Koepka,1,-15,52,-5,-7,-3,66,64,68
1,Cameron Smith,2,-13,63,0,-6,-7,71,65,64
2,Marc Leishman,3,-13,66,-4,-4,-5,67,67,66
3,Talor Gooch,4,-12,8,-3,-5,-4,68,66,67
4,Tyrrell Hatton,5,-11,148,-3,-3,-5,68,68,66
5,Thomas Pieters,6,-11,78,-5,-4,-2,66,67,69
6,Dustin Johnson,7,-10,12,-4,-1,-5,67,70,66
7,Joaquin Niemann,8,-10,64,-4,-2,-4,67,69,67
8,Kevin Na,9,-10,24,-5,-1,-4,66,70,67
9,Jon Rahm,10,-9,147,-2,-3,-4,69,68,67
