In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: bradklassen
"""

from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.error import URLError
from tqdm import tqdm
import re
import json

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [28]:
def process_tournament(tournament_id):
    tournament_id = str(tournament_id)
    url = "https://www.owgr.com/events/" + tournament_id

    try:
        # get the Scores:

        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        
        # Find the table
        table = soup.find('table')

        # Get all rows from the table
        rows = table.find_all('tr')

        # Initialize a list to store table data
        table_data = []

        # Iterate through each row
        for row in rows:
            # Get all cells (both headers and regular data)
            cells = row.find_all(['th', 'td'])
            
            # Initialize a list to store row data
            row_data = []
            
            # Iterate through each cell
            for cell in cells:
                # Check if the cell contains a link (for player profile)
                a_tag = cell.find('a', href=True)
                if a_tag:
                    # Extract the href attribute (player profile link)
                    player_id = a_tag['href'].split('-')[-1]  # Extract player ID (e.g., 'greg-dalziel-28451')
                    row_data.append(player_id)    
                       
                # adding data
                text = cell.get_text(strip=True)

                # this is special case for first row, want to add in player ids, which we do above by searching a tags
                if text == "NAME":
                    row_data.append("player_id")

                # another special case when bonus points are awarded
                if "Bonus Points" in text:
                    row_data.append("")

                # always append the text
                row_data.append(text)
            
            # Add non-empty row data to the table_data list
            if row_data:
                table_data.append(row_data)

        # Get metadata:
        
        # Dictionary to store the event information
        event_info = {}

        # Find the div containing the week info (span with the specific classes)
        week_spans = soup.find_all('span', class_='boldedString_content__bolded__QAKW8')
        if week_spans and len(week_spans) >= 2:
            week_number = week_spans[0].get_text(strip=True)
            week_value = week_spans[1].get_text(strip=True)
            event_info['Week'] = f"{week_number} {week_value}"

        # Find the exact event date by targeting spans with specific text pattern
        date_spans = soup.find_all('span', class_='boldedString_content__normal__NreGx')
        if date_spans and len(date_spans) >= 2:
            # Ensuring that only the relevant spans are captured (month/day/year)
            event_date = ' '.join([span.get_text(strip=True) for span in date_spans[:3]])
            event_info['Event Date'] = event_date

        # Find the div with event title (event name)
        event_title = soup.find('div', class_='eventTitleComponent_name__C2ZKJ')
        if event_title:
            event_info['Event Title'] = event_title.get_text(strip=True)

        # Find the div with the field rating
        field_rating = soup.find('div', class_='eventTitleComponent_rating__4UcQg')
        if field_rating:
            event_info['Field Rating'] = re.search(r'\d+\.\d+', field_rating.get_text(strip=True)).group()

        event_info["tournament_id"] = tournament_id
        # tour information

        # Find the script tag with the specific id and type
        script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})

        # Extract the content inside the tag (which is a JSON string)
        json_data = script_tag.string

        # Parse the JSON string into a Python dictionary
        data = json.loads(json_data)

        tour_info = data["props"]["pageProps"]["eventDetailsData"]["eventDetails"]["tours"][0]
        
        merged_dict = dict(event_info, **tour_info)

        scores_df = pd.DataFrame(table_data[1:], columns=table_data[0])
        scores_df["tournament_id"] = tournament_id

        return scores_df, pd.DataFrame([merged_dict])
    
    except URLError as e:
        print(f"Error opening URL for tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on error
    
    except Exception as e:
        print(f"Error processing tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on any other error

def process_tournaments(tournament_ids):
    all_table_data = []
    all_merged_data = []

    # Using ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_tournament, tid): tid for tid in tournament_ids}

        # Use tqdm to show progress
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing tournaments"):
            try:
                table_data, merged_data = future.result()
                all_table_data.append(table_data)
                all_merged_data.append(merged_data)
            except Exception as e:
                print(f"Error processing tournament: {e}")

    # Concatenate all DataFrames collected
    concatenated_table_data = pd.concat(all_table_data, ignore_index=True) if all_table_data else pd.DataFrame()
    concatenated_merged_data = pd.concat(all_merged_data, ignore_index=True) if all_merged_data else pd.DataFrame()

    return concatenated_table_data, concatenated_merged_data

In [33]:
x, y = process_tournament(1)
y

Unnamed: 0,Week,Event Date,Event Title,Field Rating,tournament_id,tourId,currencyId,name,fullName,code,isEligible
0,WEEK 2,12TH JANUARY 1986,Tourn. of Champ.,116.0,1,23,0,PGA Tour,PGA Tour,PGAT,True


In [34]:
x

Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,POINTS WON,RANK FROM,RANK TO,tournament_id
0,1,,160,Calvin Peete,52.0,-,-,1
1,2,,154,Mark O'Meara,32.0,-,-,1
2,3,,13,Phil Blackmar,20.0,-,-,1
3,T4,,57,Danny Edwards,12.0,-,-,1
4,T4,,220,Scott Verplank,12.0,-,-,1
5,T4,,290,Bernhard Langer,12.0,-,-,1
6,T7,,190,Tim Simpson,8.0,-,-,1
7,T7,,205,Hal Sutton,8.0,-,-,1
8,T7,,211,Jim Thorpe,8.0,-,-,1
9,10,,114,Tom Kite,8.0,-,-,1


In [29]:
tournament_ids = list(range(10251, 10300))  # Example tournament IDs

tournament_scores, metadata = process_tournaments(tournament_ids)
tournament_scores

Processing tournaments: 100%|██████████| 49/49 [00:16<00:00,  3.06it/s]


Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,R1,R2,R3,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id,R4
0,1,,26608,Reece McKain,68,67,70,205,0.38773,3383,2072,10256,
1,2,,27415,Samuel Simpson,64,71,71,206,0.23264,1920,1722,10256,
2,3,,22178,Philip Geerts,64,68,75,207,0.15509,2474,2163,10256,
3,T4,,3913,Andrew McLardy,71,65,72,208,0.10468,2563,2329,10256,
4,T4,,27314,Conner Mackenzie,68,66,74,208,0.10468,1696,1635,10256,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6099,MC,,17543,Michael Kim,74,75,-,149,0.00000,109,110,10297,-
6100,MC,,7959,J.B. Holmes,68,81,-,149,0.00000,1822,1838,10297,-
6101,MC,,25002,Dylan Wu,74,75,-,149,0.00000,172,179,10297,-
6102,MC,,14181,Thomas Detry,73,80,-,153,0.00000,70,74,10297,-


In [30]:
tournament_scores[tournament_scores["Finish Pos."] == ""]

Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,R1,R2,R3,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id,R4
100,,,,Bonus Points 70%,-,-,-,-,0.56807,-,-,10257,
167,,,,Bonus Points 60%,-,-,-,-,4.0,-,-,10251,-
500,,,,Bonus Points 70%,-,-,-,-,4.0,-,-,10258,-
945,,,,Bonus Points 70%,-,-,-,-,4.0,-,-,10252,-
1246,,,,Bonus Points 60%,-,-,-,-,4.0,-,-,10264,-
1403,,,,Bonus Points 60%,-,-,-,-,4.0,-,-,10263,-
1536,,,,Bonus Points 60%,-,-,-,-,1.48947,-,-,10267,-
1872,,,,Bonus Points 60%,-,-,-,-,1.47391,-,-,10268,-
2016,,,,Bonus Points 70%,-,-,-,-,4.0,-,-,10271,
2409,,,,Bonus Points 60%,-,-,-,-,1.84201,-,-,10274,-


In [24]:
df = tournament_scores.astype({
    "tournament_id": "int"
})
df

Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,R1,R2,R3,R4,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id
0,1,,12423,Chris Kirk,67,65,66,65,263,60.00488,52,21,10251
1,,,Bonus Points 60%,-,-,-,-,-,4.00000,-,-,,10251
2,2,,23014,Sahith Theegala,64,69,68,63,264,34.80283,32,20,10251
3,3,,14636,Jordan Spieth,66,67,67,65,265,22.80185,15,13,10251
4,4,,14459,Byeong Hun An,68,64,68,66,266,16.80136,60,52,10251
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6099,MC,,27720,Patrick Dam Schou,79,82,-,,161,0.00000,2040,2051,10299
6100,MC,,31935,Milan Triepels(Am),87,80,-,,167,0.00000,-,4198,10299
6101,MC,,30726,Anis Le Jeune,80,88,-,,168,0.00000,4172,4198,10299
6102,MC,,31936,Maximilian-Leopold Andrä(Am),87,90,-,,177,0.00000,-,4198,10299


In [25]:
df.query("tournament_id == 10298")

Unnamed: 0,Finish Pos.,CTRY,player_id,NAME,R1,R2,R3,R4,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id
5816,1,,23540,Mason Andersen,64,67,69,63,263,11.87809,271,156,10298
5817,2,,15651,Kristoffer Ventura,66,65,64,68,263,7.12685,667,396,10298
5818,T3,,20547,Dalton Ward,68,64,64,68,264,4.15733,1043,633,10298
5819,T3,,18628,Brian Campbell,64,67,66,67,264,4.15733,320,253,10298
5820,T5,,15499,Wilson Bateman,67,66,66,66,265,2.20932,296,274,10298
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5967,MC,,12069,Kyle Stanley,72,76,-,-,148,0.00000,1164,1173,10298
5968,MC,,24718,Taylor Funk,76,72,-,-,148,0.00000,1238,1246,10298
5969,MC,,30429,Patrick Welch,72,77,-,-,149,0.00000,521,523,10298
5970,MC,,29084,Juan Martin Loureiro(Am),76,74,-,-,150,0.00000,4172,4198,10298


In [4]:
metadata

Unnamed: 0,Week,Event Date,Event Title,Field Rating,tournament_id,tourId,currencyId,name,fullName,code,isEligible
0,WEEK 1,7TH JANUARY 2024,The Sentry,283.24703,10251,23,0,PGA Tour,PGA Tour,PGAT,True
1,WEEK 3,21ST JANUARY 2024,El Salvador Open Championship,3.5395,10257,36,0,Gira de Golf Professional Mexicana,Gira de Golf Professional Mexicana,MEX,True
2,WEEK 2,14TH JANUARY 2024,Dubai Invitational,95.00971,10253,13,0,DP World Tour,DP World Tour,DPWT,True
3,WEEK 3,21ST JANUARY 2024,Altron Big Easy Tour 9,1.88399,10256,8,0,Big Easy Tour,Big Easy Tour,BET,True
4,WEEK 3,21ST JANUARY 2024,Hero Dubai Desert Classic,172.84312,10258,13,0,DP World Tour,DP World Tour,DPWT,True
5,WEEK 2,14TH JANUARY 2024,Altron Big Easy Tour 8,1.60716,10255,8,0,Big Easy Tour,Big Easy Tour,BET,True
6,WEEK 2,14TH JANUARY 2024,Heritage Classic,10.48982,10254,4,0,PGA Tour of Australasia,PGA Tour of Australasia,ANZ,True
7,WEEK 2,14TH JANUARY 2024,Sony Open in Hawaii,284.67509,10252,23,0,PGA Tour,PGA Tour,PGAT,True
8,WEEK 3,21ST JANUARY 2024,Webex Players Series Murray River in honour of...,8.18202,10261,4,0,PGA Tour of Australasia,PGA Tour of Australasia,ANZ,True
9,WEEK 4,28TH JANUARY 2024,Red Sea Ain Sokhna Open,2.17275,10262,24,0,ProGolf Tour,ProGolf Tour,PGT,True


In [12]:
def process_player(row, tournament_id):
    columns = row.find_all("td")

    try:
        # Create a dictionary to store the data
        player_data = {
            "Tournament Id": tournament_id,
            "Position": columns[0].text.strip(),
            "Name": columns[2].text.strip(),
            "Id": ''.join(filter(lambda x: x.isdigit(), columns[2].find('a')["href"])),
            "Round 1": columns[3].text.strip(),
            "Round 2": columns[4].text.strip(),
            "Round 3": columns[5].text.strip(),
            "Round 4": columns[6].text.strip(),
            "Total Score": columns[7].text.strip(),
            "Ranking Points": columns[8].text.strip(),
            "Rank From": columns[9].text.strip(),
            "Rank To": columns[10].text.strip(),
        }

        return player_data
    
    except:
        return None

def process_tournament(tournament_id):
    try:
        tournament_id = str(tournament_id)

        url = "https://www.owgr.com/events/" + tournament_id
        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        
        rows = soup.find_all(role="row")

        processed_rows = []

        for row in rows:
            processed_rows.append(process_player(row, tournament_id))

        clean_rows = [row for row in processed_rows if row is not None]

        tournament_results = pd.DataFrame(clean_rows)
        
        return tournament_results
    except:
        pd.DataFrame()

In [4]:
tournaments_2024 = []

from tqdm import tqdm

for i in tqdm(range(10251, 10609)):
    tournaments_2024.append(process_tournament(i))

df_2024 = pd.concat(tournaments_2024).reset_index(drop=True)
df_2024

  4%|▎         | 13/358 [00:21<09:54,  1.72s/it]

In [15]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.error import URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm

def process_player(row, tournament_id):
    columns = row.find_all("td")

    try:
        player_data = {
            "Tournament Id": tournament_id,
            "Position": columns[0].text.strip(),
            "Name": columns[2].text.strip(),
            "Id": ''.join(filter(lambda x: x.isdigit(), columns[2].find('a')["href"])),
            "Round 1": columns[3].text.strip(),
            "Round 2": columns[4].text.strip(),
            "Round 3": columns[5].text.strip(),
            "Round 4": columns[6].text.strip(),
            "Total Score": columns[7].text.strip(),
            "Ranking Points": columns[8].text.strip(),
            "Rank From": columns[9].text.strip(),
            "Rank To": columns[10].text.strip(),
        }
        return player_data
    
    except Exception as e:
        print(f"Error processing player in tournament {tournament_id}: {e}")
        return None

def process_tournament(tournament_id):
    tournament_id = str(tournament_id)
    url = "https://www.owgr.com/events/" + tournament_id

    try:
        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        rows = soup.find_all(role="row")

        processed_rows = [process_player(row, tournament_id) for row in rows]
        clean_rows = [row for row in processed_rows if row is not None]

        return pd.DataFrame(clean_rows)
    
    except URLError as e:
        print(f"Error opening URL for tournament {tournament_id}: {e}")
        return pd.DataFrame()  # Return an empty dataframe on error
    
    except Exception as e:
        print(f"Error processing tournament {tournament_id}: {e}")
        return pd.DataFrame()  # Return an empty dataframe on any other error

def process_tournaments(tournament_ids):
    tournaments_2024 = []

    # Using ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_tournament, tid): tid for tid in tournament_ids}

        for future in as_completed(futures):
            try:
                result = future.result()
                if not result.empty:
                    tournaments_2024.append(result)
            except Exception as e:
                print(f"Error processing tournament: {e}")

    return pd.concat(tournaments_2024).reset_index(drop=True) if tournaments_2024 else pd.DataFrame()

In [16]:
# List of tournament IDs
tournament_ids = list(range(10251, 10609))  # Example tournament IDs

# Process all tournaments using multithreading
df_2024 = process_tournaments(tournament_ids)
df_2024

Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: 'NoneType' object is not subscriptable
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing player in tournament 10257: list index out of range
Error processing pl

Unnamed: 0,Tournament Id,Position,Name,Id,Round 1,Round 2,Round 3,Round 4,Total Score,Ranking Points,Rank From,Rank To
0,10253,1,Tommy Fleetwood,12294,66,69,63,67,265,20.09001,15,11
1,10253,T2,Rory McIlroy,10091,62,70,67,67,266,9.64320,2,2
2,10253,T2,Thriston Lawrence,18105,65,70,67,64,266,9.64320,81,67
3,10253,4,Jordan Smith,18586,68,67,67,66,268,5.62520,73,71
4,10253,5,Francesco Molinari,7655,68,70,68,63,269,4.41980,256,206
...,...,...,...,...,...,...,...,...,...,...,...,...
29184,10606,MC,Manu Gandas,18058,73,-,-,-,73,0.00000,842,852
29185,10606,MC,Anil Bajrang Mane,17278,73,-,-,-,73,0.00000,3467,3507
29186,10606,MC,Indrajit Bhalotia,2037,87,-,-,-,87,0.00000,4606,4603
29187,10606,MC,G DURGA PRASAD(Am),32980,93,-,-,-,93,0.00000,-,4603


In [18]:
tournament_id = 10589

tournament_id = str(tournament_id)
url = "https://www.owgr.com/events/" + tournament_id

html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')

In [22]:
menu = soup.find(role="menu")
menu

<div class="webNavbarLink_nav__button__Z832R" role="menu" tabindex="0"><a href="/news" role="menuitem" tabindex="-1">NEWS </a></div>

In [24]:
menu

<div class="webNavbarLink_nav__button__Z832R" role="menu" tabindex="0"><a href="/news" role="menuitem" tabindex="-1">NEWS </a></div>

In [28]:
menu.get_text(separator=' ', strip=True)

'NEWS'

In [78]:
def process_tournament(tournament_id):
    tournament_id = str(tournament_id)
    url = "https://www.owgr.com/events/" + tournament_id

    try:
        # get the Scores:

        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        # Find the table
        table = soup.find('table')

        # Get all rows from the table
        rows = table.find_all('tr')

        # Initialize a list to store table data
        table_data = []

        # Iterate through each row
        for row in rows:
            # Get all cells (both headers and regular data)
            cells = row.find_all(['th', 'td'])
            
            # Extract text from each cell and add to a list
            row_data = [cell.get_text(strip=True) for cell in cells]
            
            # Add non-empty row data to the table_data list
            if row_data:
                table_data.append(row_data)

        # Get metadata:
        
        # Dictionary to store the event information
        event_info = {}

        # Find the div containing the week info (span with the specific classes)
        week_spans = soup.find_all('span', class_='boldedString_content__bolded__QAKW8')
        if week_spans and len(week_spans) >= 2:
            week_number = week_spans[0].get_text(strip=True)
            week_value = week_spans[1].get_text(strip=True)
            event_info['Week'] = f"{week_number} {week_value}"

        # Find the exact event date by targeting spans with specific text pattern
        date_spans = soup.find_all('span', class_='boldedString_content__normal__NreGx')
        if date_spans and len(date_spans) >= 2:
            # Ensuring that only the relevant spans are captured (month/day/year)
            event_date = ' '.join([span.get_text(strip=True) for span in date_spans[:3]])
            event_info['Event Date'] = event_date

        # Find the div with event title (event name)
        event_title = soup.find('div', class_='eventTitleComponent_name__C2ZKJ')
        if event_title:
            event_info['Event Title'] = event_title.get_text(strip=True)

        # Find the div with the field rating
        field_rating = soup.find('div', class_='eventTitleComponent_rating__4UcQg')
        if field_rating:
            event_info['Field Rating'] = re.search(r'\d+\.\d+', field_rating.get_text(strip=True)).group()

        event_info["tournament_id"] = tournament_id
        # tour information

        # Find the script tag with the specific id and type
        script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})

        # Extract the content inside the tag (which is a JSON string)
        json_data = script_tag.string

        # Parse the JSON string into a Python dictionary
        data = json.loads(json_data)

        tour_info = data["props"]["pageProps"]["eventDetailsData"]["eventDetails"]["tours"][0]
        
        merged_dict = dict(event_info, **tour_info)

        scores_df = pd.DataFrame(table_data[1:], columns=table_data[0])
        scores_df["tournament_id"] = tournament_id

        return scores_df, pd.DataFrame([merged_dict])
    
    except URLError as e:
        print(f"Error opening URL for tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on error
    
    except Exception as e:
        print(f"Error processing tournament {tournament_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()  # Return an empty dataframe on any other error

In [1]:
def process_tournaments(tournament_ids):
    all_table_data = []
    all_merged_data = []

    # Using ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_tournament, tid): tid for tid in tournament_ids}

        # Use tqdm to show progress
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing tournaments"):
            try:
                table_data, merged_data = future.result()
                all_table_data.append(table_data)
                all_merged_data.append(merged_data)
            except Exception as e:
                print(f"Error processing tournament: {e}")

    # Concatenate all DataFrames collected
    concatenated_table_data = pd.concat(all_table_data, ignore_index=True) if all_table_data else pd.DataFrame()
    concatenated_merged_data = pd.concat(all_merged_data, ignore_index=True) if all_merged_data else pd.DataFrame()

    return concatenated_table_data, concatenated_merged_data

tournament_ids = list(range(10251, 10609))  # Example tournament IDs

tournament_scores, metadata = process_tournaments(tournament_ids)
tournament_scores, metadata

NameError: name 'ThreadPoolExecutor' is not defined

In [80]:
tournament_scores

Unnamed: 0,Finish Pos.,CTRY,NAME,R1,R2,R3,R4,AGG,POINTS WON,RANK FROM,RANK TO,tournament_id
0,1,,Tommy Fleetwood,66,69,63,67,265,20.09001,15,11,10253
1,T2,,Rory McIlroy,62,70,67,67,266,9.64320,2,2,10253
2,T2,,Thriston Lawrence,65,70,67,64,266,9.64320,81,67,10253
3,4,,Jordan Smith,68,67,67,66,268,5.62520,73,71,10253
4,5,,Francesco Molinari,68,70,68,63,269,4.41980,256,206,10253
...,...,...,...,...,...,...,...,...,...,...,...,...
46653,MC,,Thabiso Ngcobo,81,79,-,,160,0.00000,2728,2735,10608
46654,MC,,Steven Le Roux,83,78,-,,161,0.00000,2811,2815,10608
46655,MC,,Ricco Motsa,86,78,-,,164,0.00000,4606,4603,10608
46656,MC,,Terence Boardman,84,81,-,,165,0.00000,4031,4027,10608


In [88]:
metadata["tournament_id"] = metadata["tournament_id"].astype(int)
metadata

Unnamed: 0,Week,Event Date,Event Title,Field Rating,tournament_id,tourId,currencyId,name,fullName,code,isEligible
0,WEEK 2,14TH JANUARY 2024,Dubai Invitational,95.00971,10253,13,0,DP World Tour,DP World Tour,DPWT,True
1,WEEK 1,7TH JANUARY 2024,The Sentry,283.24703,10251,23,0,PGA Tour,PGA Tour,PGAT,True
2,WEEK 3,21ST JANUARY 2024,El Salvador Open Championship,3.53950,10257,36,0,Gira de Golf Professional Mexicana,Gira de Golf Professional Mexicana,MEX,True
3,WEEK 2,14TH JANUARY 2024,Altron Big Easy Tour 8,1.60716,10255,8,0,Big Easy Tour,Big Easy Tour,BET,True
4,WEEK 3,21ST JANUARY 2024,Altron Big Easy Tour 9,1.88399,10256,8,0,Big Easy Tour,Big Easy Tour,BET,True
...,...,...,...,...,...,...,...,...,...,...,...
346,WEEK 40,6TH OCTOBER 2024,ACN Championship Golf Tournament,33.52272,10603,15,0,Japan Golf Tour,Japan Golf Tour,Jpn,True
347,WEEK 40,6TH OCTOBER 2024,Mercuries Taiwan Masters,24.52166,10605,5,0,Asian Tour,Asian Tour,Asa,True
348,WEEK 40,6TH OCTOBER 2024,Vizag Open,4.99762,10606,25,0,Professional Golf Tour of India,Professional Golf Tour of India,PGTI,True
349,WEEK 40,6TH OCTOBER 2024,Hyundai Marine & Fire Insurance KJ Choi Invita...,25.55945,10607,17,0,KPGA Tour,KPGA Tour,Kor,True


In [89]:
metadata.sort_values(by="tournament_id").reset_index(drop=True)

Unnamed: 0,Week,Event Date,Event Title,Field Rating,tournament_id,tourId,currencyId,name,fullName,code,isEligible
0,WEEK 1,7TH JANUARY 2024,The Sentry,283.24703,10251,23,0,PGA Tour,PGA Tour,PGAT,True
1,WEEK 2,14TH JANUARY 2024,Sony Open in Hawaii,284.67509,10252,23,0,PGA Tour,PGA Tour,PGAT,True
2,WEEK 2,14TH JANUARY 2024,Dubai Invitational,95.00971,10253,13,0,DP World Tour,DP World Tour,DPWT,True
3,WEEK 2,14TH JANUARY 2024,Heritage Classic,10.48982,10254,4,0,PGA Tour of Australasia,PGA Tour of Australasia,ANZ,True
4,WEEK 2,14TH JANUARY 2024,Altron Big Easy Tour 8,1.60716,10255,8,0,Big Easy Tour,Big Easy Tour,BET,True
...,...,...,...,...,...,...,...,...,...,...,...
346,WEEK 40,6TH OCTOBER 2024,Ryo Ishikawa everyone PROJECT Challenge,7.23574,10604,7,0,Abema TV Tour,Abema TV Tour,ATVT,True
347,WEEK 40,6TH OCTOBER 2024,Mercuries Taiwan Masters,24.52166,10605,5,0,Asian Tour,Asian Tour,Asa,True
348,WEEK 40,6TH OCTOBER 2024,Vizag Open,4.99762,10606,25,0,Professional Golf Tour of India,Professional Golf Tour of India,PGTI,True
349,WEEK 40,6TH OCTOBER 2024,Hyundai Marine & Fire Insurance KJ Choi Invita...,25.55945,10607,17,0,KPGA Tour,KPGA Tour,Kor,True


In [69]:
res = process_tournament(tournament_id)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Finish Pos.,CTRY,NAME,R1,R2,R3,AGG,POINTS WON,RANK FROM,RANK TO
1,1,,Greg Dalziel,70,68,66,204,0.68323,1876,1485
2,2,,Graeme Robertson,67,68,71,206,0.40994,1006,955
3,T3,,John Vogelpohl,68,69,70,207,0.19472,2542,2203
4,T3,,Jake Hapgood,68,69,70,207,0.19472,2199,1995
...,...,...,...,...,...,...,...,...,...,...
89,MC,,Ben Robinson,79,85,-,164,0.00000,1904,1904
90,WD,,Oliver Roberts,72,-,-,72,0.00000,2532,2551
91,WD,,Alex Walker,78,-,-,78,0.00000,3858,3864
92,WD,,Max Rogers,-,-,-,-,0.00000,4601,4606


In [70]:
res[1]

Unnamed: 0,Week,Event Date,Event Title,Field Rating,tournament_id,tourId,currencyId,name,fullName,code,isEligible
0,WEEK 39,29TH SEPTEMBER 2024,Gleneagles Masters presented by Insights,Field Rating - 3.31985,10589,40,0,Tartan Pro Tour,Tartan Pro Tour,TPT,True


In [46]:
soup.find(type="application/json")

<script id="__NEXT_DATA__" type="application/json">{"props":{"siteSetupData":{"isEmpty":false,"siteName":{"isEmpty":false,"value":"Official World Golf Ranking"},"siteDomain":{"isEmpty":false,"value":"owgr.com"},"replacementPicture":{"isEmpty":false,"key":"23766166","alt":"No Player Image","name":"noPicturePlayer.jpg","size":7361,"type":"image/jpeg","url":"https://assets-us-01.kc-usercontent.com:443/00be6aeb-6ab1-00f0-f77a-4c8f38e69314/e1e48dfd-23bb-4675-998d-0b76ecd67076/noPicturePlayer.jpg","height":500,"width":333},"system":{"isEmpty":false,"codename":"owgr_site_setup","collection":"default","id":"6aa39324-c833-4c77-b6e9-63f6ffc8b4ff","language":"default","lastModified":"Wed Jan 10 2024 11:12:39 GMT+0000 (Coordinated Universal Time)","name":"OWGR Site Setup","sitemapLocations":[],"type":"site_setup"},"socialShareImage":{"isEmpty":false,"key":"23766167","alt":null,"name":"socialShare.png","size":90706,"type":"image/png","url":"https://assets-us-01.kc-usercontent.com:443/00be6aeb-6ab1-

In [50]:
import json

# Find the script tag with the specific id and type
script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})

# Extract the content inside the tag (which is a JSON string)
json_data = script_tag.string

# Parse the JSON string into a Python dictionary
data = json.loads(json_data)

tour_info = data["props"]["pageProps"]["eventDetailsData"]["eventDetails"]["tours"][0]
tour_info

{'tourId': 40,
 'currencyId': 0,
 'name': 'Tartan Pro Tour',
 'fullName': 'Tartan Pro Tour',
 'code': 'TPT',
 'isEligible': True}

In [58]:
merged_dict = dict(event_info, **tour_info)
merged_dict

{'Week': 'WEEK 39',
 'Event Date': '29TH SEPTEMBER 2024',
 'Event Title': 'Gleneagles Masters presented by Insights',
 'Field Rating': '3.31985',
 'tournament_id': '10589',
 'tourId': 40,
 'currencyId': 0,
 'name': 'Tartan Pro Tour',
 'fullName': 'Tartan Pro Tour',
 'code': 'TPT',
 'isEligible': True}

In [None]:
["props"]["pageProps"]["eventDetailsData"]["eventDetails"]["tours"]

In [34]:
from bs4 import BeautifulSoup

# Find the table
table = soup.find('table')

# Get all rows from the table
rows = table.find_all('tr')

# Initialize a list to store table data
table_data = []

# Iterate through each row
for row in rows:
    # Get all cells (both headers and regular data)
    cells = row.find_all(['th', 'td'])
    
    # Extract text from each cell and add to a list
    row_data = [cell.get_text(strip=True) for cell in cells]
    
    # Add non-empty row data to the table_data list
    if row_data:
        table_data.append(row_data)

table_data

[['Finish Pos.',
  'CTRY',
  'NAME',
  'R1',
  'R2',
  'R3',
  'AGG',
  'POINTS WON',
  'RANK FROM',
  'RANK TO'],
 ['1', '', 'Greg Dalziel', '70', '68', '66', '204', '0.68323', '1876', '1485'],
 ['2',
  '',
  'Graeme Robertson',
  '67',
  '68',
  '71',
  '206',
  '0.40994',
  '1006',
  '955'],
 ['T3',
  '',
  'John Vogelpohl',
  '68',
  '69',
  '70',
  '207',
  '0.19472',
  '2542',
  '2203'],
 ['T3',
  '',
  'Jake Hapgood',
  '68',
  '69',
  '70',
  '207',
  '0.19472',
  '2199',
  '1995'],
 ['T3', '', 'Sam Locke', '72', '64', '71', '207', '0.19472', '973', '954'],
 ['T3',
  '',
  'Kieran Cantley',
  '68',
  '68',
  '71',
  '207',
  '0.19472',
  '1543',
  '1467'],
 ['T7', '', 'John Henry', '70', '69', '70', '209', '0.11615', '1554', '1509'],
 ['T7',
  '',
  'James Wilson(July1999)',
  '66',
  '71',
  '72',
  '209',
  '0.11615',
  '1493',
  '1454'],
 ['9',
  '',
  'Rory Franssen',
  '69',
  '72',
  '69',
  '210',
  '0.10248',
  '2142',
  '2043'],
 ['T10', '', 'Calum Fyfe', '70', '73', '

In [57]:
import re

# Dictionary to store the event information
event_info = {}

# Find the div containing the week info (span with the specific classes)
week_spans = soup.find_all('span', class_='boldedString_content__bolded__QAKW8')
if week_spans and len(week_spans) >= 2:
    week_number = week_spans[0].get_text(strip=True)
    week_value = week_spans[1].get_text(strip=True)
    event_info['Week'] = f"{week_number} {week_value}"

# Find the exact event date by targeting spans with specific text pattern
date_spans = soup.find_all('span', class_='boldedString_content__normal__NreGx')
if date_spans and len(date_spans) >= 2:
    # Ensuring that only the relevant spans are captured (month/day/year)
    event_date = ' '.join([span.get_text(strip=True) for span in date_spans[:3]])
    event_info['Event Date'] = event_date

# Find the div with event title (event name)
event_title = soup.find('div', class_='eventTitleComponent_name__C2ZKJ')
if event_title:
    event_info['Event Title'] = event_title.get_text(strip=True)

# Find the div with the field rating
field_rating = soup.find('div', class_='eventTitleComponent_rating__4UcQg')
if field_rating:
    event_info['Field Rating'] = re.search(r'\d+\.\d+', field_rating.get_text(strip=True)).group()

event_info["tournament_id"] = tournament_id

event_info

{'Week': 'WEEK 39',
 'Event Date': '29TH SEPTEMBER 2024',
 'Event Title': 'Gleneagles Masters presented by Insights',
 'Field Rating': '3.31985',
 'tournament_id': '10589'}

In [39]:
pd.DataFrame(table_data[1:], columns=table_data[0])

Unnamed: 0,Finish Pos.,CTRY,NAME,R1,R2,R3,AGG,POINTS WON,RANK FROM,RANK TO
0,1,,Greg Dalziel,70,68,66,204,0.68323,1876,1485
1,2,,Graeme Robertson,67,68,71,206,0.40994,1006,955
2,T3,,John Vogelpohl,68,69,70,207,0.19472,2542,2203
3,T3,,Jake Hapgood,68,69,70,207,0.19472,2199,1995
4,T3,,Sam Locke,72,64,71,207,0.19472,973,954
...,...,...,...,...,...,...,...,...,...,...
88,MC,,Ben Robinson,79,85,-,164,0.00000,1904,1904
89,WD,,Oliver Roberts,72,-,-,72,0.00000,2532,2551
90,WD,,Alex Walker,78,-,-,78,0.00000,3858,3864
91,WD,,Max Rogers,-,-,-,-,0.00000,4601,4606


In [32]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd

# Function to process tournaments using multithreading (8 threads per process)
def process_tournaments_in_threads(tournament_ids):
    tournaments_2024 = []

    # 8 threads per process
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_tournament, tid): tid for tid in tournament_ids}

        for future in as_completed(futures):
            try:
                tournaments_2024.append(future.result())
            except Exception as e:
                print(f"Error processing tournament {futures[future]}: {e}")

    return pd.concat(tournaments_2024).reset_index(drop=True)

In [14]:
# List of tournament IDs
# tournament_ids = list(range(10251, 10609))  # Example tournament IDs

tournament_ids = list(range(10251, 10300))  # Example tournament IDs
# Split the list of tournament IDs into 8 parts, one for each core
num_processes = 8
chunk_size = len(tournament_ids) // num_processes
tournament_chunks = [tournament_ids[i:i + chunk_size] for i in range(0, len(tournament_ids), chunk_size)]

# Spawn 8 processes (one per core), each processing its own chunk of tournaments
with ProcessPoolExecutor(max_workers=num_processes) as executor:
    futures = [executor.submit(process_tournaments_in_threads, chunk) for chunk in tournament_chunks]

    all_results = []
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            all_results.append(future.result())
        except Exception as e:
            print(f"Error processing a chunk: {e}")

# Combine the results from all processes
df_2024 = pd.concat(all_results).reset_index(drop=True)
df_2024


Process SpawnProcess-35:
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/concurrent/futures/process.py", line 237, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_tournaments_in_threads' on <module '__main__' (built-in)>
Process SpawnProcess-33:
Traceback (most recent call last)

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [11]:
url = "http://www.owgr.com/playerprofile/scottie-scheffler-18417"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
soup

<!DOCTYPE html>
<html data-theme="dark" style="color-scheme:dark"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>!function(){try {var d=document.documentElement;var e=localStorage.getItem('theme');d.setAttribute('data-theme', 'dark');if("system"===e||(!e&&false)){var t="(prefers-color-scheme: dark)",m=window.matchMedia(t);m.media!==t||m.matches?d.setAttribute('data-theme', 'dark'):d.setAttribute('data-theme', 'light')}else if(e) d.setAttribute('data-theme', e)}catch(e){}}()</script><title>Official World Golf Ranking - Player Profile</title><meta content="index,follow" name="robots"/><meta content="index,follow" name="googlebot"/><meta content="Official World Golf Ranking - Player Profile" name="description"/><meta content="summary_large_image" name="twitter:card"/><meta content="Player Profile" property="og:title"/><meta content="Player Profile" property="og:description"/><meta content="/playerprofile/scottie-scheffler-18417" property="og:url"/

In [16]:
soup.find_all('td')

[]

In [33]:
url = "https://www.owgr.com/events/10545"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
soup

<!DOCTYPE html>
<html data-theme="dark" style="color-scheme:dark"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>!function(){try {var d=document.documentElement;var e=localStorage.getItem('theme');d.setAttribute('data-theme', 'dark');if("system"===e||(!e&&false)){var t="(prefers-color-scheme: dark)",m=window.matchMedia(t);m.media!==t||m.matches?d.setAttribute('data-theme', 'dark'):d.setAttribute('data-theme', 'light')}else if(e) d.setAttribute('data-theme', e)}catch(e){}}()</script><title>Official World Golf Ranking</title><meta content="index,follow" name="robots"/><meta content="index,follow" name="googlebot"/><meta content="ELIGIBLE TOURS. OFFICIAL WORLD GOLF RANKING FOUNDERS. STAY IN TOUCH. © 2022 - OFFICIAL WORLD GOLF RANKING." name="description"/><meta content="summary_large_image" name="twitter:card"/><meta content="Official World Golf Ranking" property="og:title"/><meta content="ELIGIBLE TOURS. OFFICIAL WORLD GOLF RANKING FOUNDERS. STAY

In [34]:
soup.find_all(role="row")[0]

<tr role="row"><th class="eventDetailsTableComponent_table__pos__header__yHt03" colspan="1" role="columnheader"><div class="headerItem_container__G7Gi1" role="button" tabindex="0">Finish Pos.<div class="headerItem_icon__container__3S0cb"><svg class="headerItem_icon__KG2Rw" fill="none" height="9" width="7" xmlns="http://www.w3.org/2000/svg"><path clip-rule="evenodd" d="m0 3.629 3.474-3.5 3.474 3.5H0Zm6.948 1.414"></path></svg></div></div></th><th colspan="1" role="columnheader"><div class="headerItem_container__G7Gi1" role="button" tabindex="0">CTRY<div class="headerItem_icon__container__3S0cb"><svg class="headerItem_icon__KG2Rw" fill="none" height="9" width="7" xmlns="http://www.w3.org/2000/svg"><path clip-rule="evenodd" d="m0 3.629 3.474-3.5 3.474 3.5H0Zm6.948 1.414-3.474 3.5L0 5.043h6.948Z"></path></svg></div></div></th><th class="table__text-header" colspan="1" role="columnheader"><div class="headerItem_container__G7Gi1" role="button" tabindex="0">NAME<div class="headerItem_icon__co

In [38]:
soup.find_all(role="row")[1].find_all('a')[0].find_all('div')

[<div class="table__link-column">Collin Morikawa</div>]

In [43]:
profile_link = soup.find_all(role="row")[1].find('a')["href"]
id = ''.join(filter(lambda x: x.isdigit(), profile_link))
id

'22085'

In [44]:
player = soup.find_all(role="row")[1].find('a').find('div').get_text()
player

'Collin Morikawa'

In [45]:
soup.find_all(role="row")[1].find_all("td")

[<td class="table__default-column eventDetailsTableComponent_table__pos__AZZ6H" role="cell">1</td>,
 <td class="table__default-column eventDetailsTableComponent_table__ctry__KYWRe" role="cell"><div class="countryCell_icon__1mk7o"><div><img data-testid="circle-country-flag" height="16" src="https://hatscripts.github.io/circle-flags/flags/us.svg" title=" "/></div></div></td>,
 <td class="table__text-column eventDetailsTableComponent_table__name__jYI9S" role="cell"><a class="textLink_container__WoRMe undefined" href="/playerprofile/collin-morikawa-22085"><div class="table__link-column">Collin Morikawa</div></a></td>,
 <td class="table__default-column eventDetailsTableComponent_table__round__M1CjE" role="cell">66</td>,
 <td class="table__default-column eventDetailsTableComponent_table__round__M1CjE" role="cell">63</td>,
 <td class="table__default-column eventDetailsTableComponent_table__round__M1CjE" role="cell">67</td>,
 <td class="table__default-column eventDetailsTableComponent_table__r

In [69]:
def process_player(row, tournament_id):
    columns = row.find_all("td")

    try:
        # Create a dictionary to store the data
        player_data = {
            "Tournament Id": tournament_id,
            "Position": columns[0].text.strip(),
            "Name": columns[2].text.strip(),
            "Id": ''.join(filter(lambda x: x.isdigit(), columns[2].find('a')["href"])),
            "Round 1": int(columns[3].text.strip()),
            "Round 2": int(columns[4].text.strip()),
            "Round 3": int(columns[5].text.strip()),
            "Round 4": int(columns[6].text.strip()),
            "Total Score": int(columns[7].text.strip()),
            "Ranking Points": float(columns[8].text.strip()),
            "Rank From": int(columns[9].text.strip()),
            "Rank To": int(columns[10].text.strip()),
        }

        return player_data
    
    except:
        return None

In [70]:
rows = soup.find_all(role="row")

processed_rows = []

for row in rows:
    processed_rows.append(process_player(row, tournament_id='10545'))

clean_rows = [row for row in processed_rows if row is not None]

tournament_results = pd.DataFrame(clean_rows)
tournament_results

Unnamed: 0,Tournament Id,Position,Name,Id,Round 1,Round 2,Round 3,Round 4,Total Score,Ranking Points,Rank From,Rank To
0,10545,1,Collin Morikawa,22085,66,63,67,66,262,47.62836,6,4
1,10545,2,Sahith Theegala,23014,67,66,66,64,263,27.62445,15,12
2,10545,3,Scottie Scheffler,18417,65,66,66,67,264,18.09877,1,1
3,10545,4,Russell Henley,14578,67,71,67,62,267,13.33594,14,15
4,10545,5,Adam Scott,6430,66,67,68,67,268,10.47824,21,18
5,10545,6,Sungjae Im,17488,69,68,68,64,269,8.5731,20,20
6,10545,7,Justin Thomas,14139,66,69,70,65,270,7.62053,33,30
7,10545,T8,Wyndham Clark,23604,67,67,68,69,271,5.95354,5,6
8,10545,T8,Shane Lowry,13900,69,69,65,68,271,5.95354,32,32
9,10545,T8,Viktor Hovland,18841,69,71,65,66,271,5.95354,8,8


In [77]:
process_tournament(10479)

Unnamed: 0,Tournament Id,Position,Name,Id,Round 1,Round 2,Round 3,Round 4,Total Score,Ranking Points,Rank From,Rank To
0,10479,1,Xander Schauffele,19895,69,72,69,65,275,100.00000,3,2
1,10479,T2,Billy Horschel,11276,72,68,69,68,277,50.00000,62,33
2,10479,T2,Justin Rose,6093,69,68,73,67,277,50.00000,67,34
3,10479,4,Thriston Lawrence,18105,71,74,65,68,278,30.00000,98,71
4,10479,5,Russell Henley,14578,69,75,66,69,279,24.00000,20,12
...,...,...,...,...,...,...,...,...,...,...,...,...
152,10479,MC,Todd Hamilton,915,82,80,-,-,162,0.00000,-,4564
153,10479,MC,Aguri Iwasaki,28203,74,91,-,-,165,0.00000,569,585
154,10479,WD,Ernie Els,874,82,-,-,-,82,0.00000,1784,1818
155,10479,WD,John Daly,863,82,-,-,-,82,0.00000,4553,4564


In [79]:
process_tournament(10606)

Unnamed: 0,Tournament Id,Position,Name,Id,Round 1,Round 2,Round 3,Round 4,Total Score,Ranking Points,Rank From,Rank To
0,10606,1,Angad Cheema,15816,69,61,70,68,268,0.90063,917,831
1,10606,2,Aman Raj,19418,66,68,69,65,268,0.54038,896,854
2,10606,T3,Aryan Roopa Anand,27401,67,68,72,67,274,0.31522,1938,1690
3,10606,T3,Kartik Sharma,24465,71,69,64,70,274,0.31522,1709,1588
4,10606,T5,Jairaj Singh Sandhu,28071,69,69,68,69,275,0.17562,1396,1343
...,...,...,...,...,...,...,...,...,...,...,...,...
121,10606,MC,Manu Gandas,18058,73,-,-,-,73,0.00000,842,852
122,10606,MC,Anil Bajrang Mane,17278,73,-,-,-,73,0.00000,3467,3507
123,10606,MC,Indrajit Bhalotia,2037,87,-,-,-,87,0.00000,4606,4603
124,10606,MC,G DURGA PRASAD(Am),32980,93,-,-,-,93,0.00000,-,4603


In [47]:
row = soup.find_all(role="row")[1]
columns = row.find_all("td")

# Create a dictionary to store the data
player_data = {
    "Position": columns[0].text.strip(),
    "Country": columns[1].find("img")["src"].split("/")[-1].split('.')[0],  # Extracting country code from the flag URL
    "Name": columns[2].text.strip(),
    "Round 1": int(columns[3].text.strip()),
    "Round 2": int(columns[4].text.strip()),
    "Round 3": int(columns[5].text.strip()),
    "Round 4": int(columns[6].text.strip()),
    "Total Score": int(columns[7].text.strip()),
    "Ranking Points": float(columns[8].text.strip()),
    "Rank From": int(columns[9].text.strip()),
    "Rank To": int(columns[10].text.strip()),
}

# Print the resulting dictionary
print(player_data)

{'Position': '1', 'Country': 'us', 'Name': 'Collin Morikawa', 'Round 1': 66, 'Round 2': 63, 'Round 3': 67, 'Round 4': 66, 'Total Score': 262, 'Ranking Points': 47.62836, 'Rank From': 6, 'Rank To': 4}


In [19]:
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'DEFAULT_INTERESTING_STRING_TYPES',
 'EMPTY_ELEMENT_EVENT',
 'END_ELEMENT_EVENT',
 'ROOT_TAG_NAME',
 'START_ELEMENT_EVENT',
 'STRING_ELEMENT_EVENT',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_clone',
 '_decode_markup',
 '_event_stream',
 '_feed',
 '_find_all',
 '_find_one',
 '_format_tag',
 '_indent_string',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_marku

In [20]:
soup.index

<bound method Tag.index of <!DOCTYPE html>
<html data-theme="dark" style="color-scheme:dark"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>!function(){try {var d=document.documentElement;var e=localStorage.getItem('theme');d.setAttribute('data-theme', 'dark');if("system"===e||(!e&&false)){var t="(prefers-color-scheme: dark)",m=window.matchMedia(t);m.media!==t||m.matches?d.setAttribute('data-theme', 'dark'):d.setAttribute('data-theme', 'light')}else if(e) d.setAttribute('data-theme', e)}catch(e){}}()</script><title>Official World Golf Ranking</title><meta content="index,follow" name="robots"/><meta content="index,follow" name="googlebot"/><meta content="ELIGIBLE TOURS. OFFICIAL WORLD GOLF RANKING FOUNDERS. STAY IN TOUCH. © 2022 - OFFICIAL WORLD GOLF RANKING." name="description"/><meta content="summary_large_image" name="twitter:card"/><meta content="Official World Golf Ranking" property="og:title"/><meta content="ELIGIBLE TOURS. OFFICIAL WORLD 

In [None]:
#%% Event data for each player

def acquire_data(player_ids):
    
    # Urls
    url_base_1 = 'http://www.owgr.com/Ranking/PlayerProfile.aspx?playerID='
    url_base_2 = '&year='
    
    # Creates empty lists
    events = []
    names = []
    
    # Player ID Loop
    for player_id in player_ids:
        
        print('ID: ' + str(player_id))
        
        html = urlopen(url_base_1 + str(player_id))
        soup = BeautifulSoup(html, 'lxml')
        
        # Get years athlete played
        years = []
        for option in soup.find_all('option'):
            years.append(option.text)
            
        years.remove('Counting Events')
        
        # Year loop
        for year in years:
            try:
                html = urlopen(url_base_1 + str(player_id) + url_base_2 + str(year))
                soup = BeautifulSoup(html, 'lxml')
                name = str(soup.find('h2').contents[0])
                trs = soup.find_all(id = 'player_results')
                for tr in trs:
                      tds = tr.find_all('td')
                      for td in tds:
                          events.append(td.text)
                          names.append(name)
            except:
                pass
            
    return(names, events)

names, events = acquire_data(player_ids)

#%% Clean data and create DataFrame

def clean_data(names, events):
    
    # Keeps every 9th element in list
    names_list = names[0::9]
    
    # Combines numerous lists into one for each record
    composite_list = [events[x:x+9] for x in range(0, len(events), 9)]
    
    # List of columns
    column_list = ['Event', 'Tour', 'Week', 'Year', 'Finish', 'Rank_Points', 
                   'Weight', 'Adjusted_Points', 'Rank_After']
    
    # Creates dataframe
    player_df = pd.DataFrame(composite_list, columns = column_list)
    
    # Creates Name column usign names_list
    player_df['Name'] = names_list
    
    # Replaces '-' with ''
    player_df.replace(to_replace = ['-',''], value = np.nan, inplace = True)
    
    # Re-orders dataframe
    player_df = player_df[['Name', 'Event', 'Tour', 'Week', 'Year', 'Finish',
                           'Rank_Points', 'Weight', 'Adjusted_Points', 'Rank_After']]
    
    # Convert to string and remove text from column
    player_df['Rank_After'] = player_df['Rank_After'].astype(str).str.extract('(\d+)')
    
    # Convert columns to numeric
    cols = player_df.columns.drop(['Name','Event','Tour', 'Finish'])
    player_df[cols] = player_df[cols].apply(pd.to_numeric, errors = 'coerce')
    
    # Sorts by dataframe year and week ascending
    player_df = player_df.sort_values(['Name', 'Year', 'Week'], ascending = True)
    
    # Creates new column indicating Professional or Amateur
    player_df['Pro/Am'] = 'Pro'
    
    # Assigns 'Am' to column for Amateur players
    player_df.loc[player_df['Name'].str.contains('\(Am\)'), 'Pro/Am'] = 'Am'
    player_df.loc[player_df['Name'].str.contains('\(AM\)'), 'Pro/Am'] = 'Am'
    player_df.loc[player_df['Name'].str.contains('\(am\)'), 'Pro/Am'] = 'Am'
    player_df.loc[player_df['Name'].str.contains('\(Am'), 'Pro/Am'] = 'Am'
    player_df.loc[player_df['Name'].str.contains('\(A\)'), 'Pro/Am'] = 'Am'
    player_df.loc[player_df['Name'].str.contains('\(A'), 'Pro/Am'] = 'Am'
    
    # Removes any indication of amateur from 'Name' column
    player_df['Name'] = player_df['Name'].str.replace('\(Am\)', '').str.replace('\(AM\)', '')\
    .str.replace('\(am\)', '').str.replace('\(Am', '').str.replace('\(A\)', '')\
    .str.replace('\(A', '')
    
    # Strips leading and trailing whitespace and period
    player_df['Name'] = player_df['Name'].str.strip().str.strip('\. ')
    
    # Strips leading and trailing whitespace
    player_df['Event'] = player_df['Event'].str.strip()
    
    # Deletes records where 'Name' is 'missed missed' or 'Missed missed'
    player_df = player_df[player_df['Name'] != 'missed missed']
    player_df = player_df[player_df['Name'] != 'Missed missed']
    
    # Ensures there are no duplicates
    player_df.drop_duplicates(inplace = True)
    
    return(player_df)

player_df = clean_data(names, events)

# Output as CSV
player_df.to_csv('../OWGR_Player.csv', index = False)