## Load In Historical Data From OWGR PDFs

In [1]:
import pandas as pd
import re
import os 
import json
import pdfplumber
from datetime import datetime, timedelta 

# Known countries in the OWGR. Both long and short spellings are listed
# (e.g. "South Korea" and "Korea").
_OWGR_COUNTRIES = [
    'United States', 'England', 'Scotland', 'Ireland', 'Northern Ireland',
    'Australia', 'South Korea', 'Japan', 'Sweden', 'Canada', 'Spain',
    'France', 'Germany', 'Italy', 'South Africa', 'Belgium', 'Austria',
    'Norway', 'Denmark', 'Finland', 'New Zealand', 'Netherlands', 'Colombia',
    'Argentina', 'Venezuela', 'Chile', 'Mexico', 'China', 'Philippines',
    'Zimbabwe', 'Taiwan (Chinese Taipei)', 'Poland', 'Wales', 'Czech Republic',
    'Switzerland', 'India', 'Thailand', 'Singapore', 'Malaysia', 'Indonesia',
    'Hong Kong', 'Puerto Rico', 'Brazil', 'Portugal', 'Hungary', 'Greece',
    'Paraguay', 'Dominican Republic', 'Peru', 'Korea', 'Chinese Taipei',
]

# Longest first so longer names match first (e.g. "South Korea" before "Korea").
_OWGR_COUNTRIES_BY_LENGTH = sorted(_OWGR_COUNTRIES, key=len, reverse=True)

# Lines starting with any of these are page headers/footers, not player rows.
_OWGR_HEADER_PREFIXES = (
    'Week', 'Ending', 'Events', 'This', 'Min-', 'World', 'Ranking', 'Page',
)

# Column order of the returned DataFrame (before the 'Date' column is added).
_OWGR_COLUMNS = [
    'Rank', 'Last_Week_Rank', 'End_2025_Rank', 'Player', 'Country',
    'Avg_Points', 'Total_Points', 'Events_Divisor',
    'Points_Lost_2026', 'Points_Won_2026', 'Events_Actual',
]

# One player row.
# Format: Rank (LastWeek) <End2025> Name Country Average Total Played Lost Won Played
_OWGR_ROW_PATTERN = re.compile(
    r'^(\d+)\s+'                           # This week rank
    r'\((\d+)\)\s+'                         # Last week rank in ()
    r'<(\d+)>\s+'                           # End 2025 rank in <>
    r'(.+?)\s+'                             # Name + Country (captured together)
    r'(-?\d+\.\d+)\s+'                      # Average points
    r'(-?\d+\.\d+)\s+'                      # Total points
    r'(\d+)\s+'                             # Events played (divisor)
    r'(-?\d+\.\d+)\s+'                      # Points lost
    r'(-?\d+\.\d+)\s+'                      # Points won
    r'(\d+)$'                               # Events played (actual)
)


def _split_player_country(name_country):
    """Split a combined "Name Country" string into (player, country).

    The country is the longest known country name the string ends with.
    When no known country matches, the whole string is returned as the
    player and the country is ''.
    """
    for country in _OWGR_COUNTRIES_BY_LENGTH:
        if name_country.endswith(country):
            return name_country[:-len(country)].strip(), country
    return name_country, ''


def parse_owgr_pdf(pdf_path, date, top_n=200):
    """
    Parse OWGR PDF and return top N players as a pandas dataframe

    Args:
        pdf_path: path of the PDF, passed to pdfplumber.open
        date: value stored unchanged in the 'Date' column of every row
        top_n: rows with a rank above this are dropped, and at most this
            many rows are returned

    Returns:
        DataFrame sorted by 'Rank' with the columns in _OWGR_COLUMNS plus
        'Date'. If no line of the PDF looks like a player row, the frame
        is empty but still has all of those columns.
    """
    rows = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            for line in text.split('\n'):
                line = line.strip()

                # Skip blank and header/footer lines. No date-specific check
                # is needed: the row pattern is anchored on the leading rank
                # columns, so any other non-row line fails to match below.
                if not line or line.startswith(_OWGR_HEADER_PREFIXES):
                    continue

                match = _OWGR_ROW_PATTERN.match(line)
                if not match:
                    continue

                rank = int(match.group(1))
                if rank > top_n:
                    continue

                player, country = _split_player_country(match.group(4).strip())
                rows.append({
                    'Rank': rank,
                    'Last_Week_Rank': int(match.group(2)),
                    'End_2025_Rank': int(match.group(3)),
                    'Player': player,
                    'Country': country,
                    'Avg_Points': float(match.group(5)),
                    'Total_Points': float(match.group(6)),
                    'Events_Divisor': int(match.group(7)),
                    'Points_Lost_2026': float(match.group(8)),
                    'Points_Won_2026': float(match.group(9)),
                    'Events_Actual': int(match.group(10))
                })

    # Explicit columns keep drop_duplicates/sort_values from raising KeyError
    # when no rows were parsed.
    df = pd.DataFrame(rows, columns=_OWGR_COLUMNS)
    df = df.drop_duplicates(subset=['Rank', 'Player']).sort_values('Rank').reset_index(drop=True)
    df['Date'] = date

    return df.head(top_n)

def owgr_df_to_dict(df, date_str):
    """
    Build a {player: {date_str: stats}} dictionary from an OWGR DataFrame.

    Each player's stats hold the row's 'Avg_Points', 'Total_Points',
    'Events_Divisor' and 'Events_Actual' values. If a player name occurs
    in more than one row, the last row wins.
    """
    stat_fields = ('Avg_Points', 'Total_Points', 'Events_Divisor', 'Events_Actual')

    return {
        record['Player']: {
            date_str: {field: record[field] for field in stat_fields}
        }
        for _, record in df.iterrows()
    }

def add_owgr_to_dict(df, date_str, existing_dict=None):
    """
    Merge one OWGR DataFrame into a {player: {date: stats}} dictionary.

    When existing_dict is None a new dictionary is created; otherwise the
    given dictionary is updated in place. Either way the dictionary is
    returned. An entry already stored for the same player and date_str is
    overwritten.
    """
    merged = {} if existing_dict is None else existing_dict
    stat_fields = ('Avg_Points', 'Total_Points', 'Events_Divisor', 'Events_Actual')

    for _, record in df.iterrows():
        # Make sure the player has a history dict before filling in this date
        history = merged.setdefault(record['Player'], {})
        history[date_str] = {field: record[field] for field in stat_fields}

    return merged

def get_owgr_date(file):
    """
    Extract date from OWGR filename dynamically.
    Filename format: owgrWWfYYYY.pdf (e.g., owgr04f2026.pdf)
    Returns the Saturday of that ISO week as YYYY-MM-DD string.

    Args:
        file: bare filename (no directory part); WW is read as the ISO
            week number and YYYY as the ISO year

    Raises:
        ValueError: if the filename does not match the format exactly, or
            if strptime rejects the week number
    """
    # fullmatch so trailing text such as "owgr04f2026.pdf.bak" is rejected
    match = re.fullmatch(r'owgr(\d{2})f(\d{4})\.pdf', file)
    if not match:
        raise ValueError(f"Invalid OWGR filename format: {file}")

    week_num = int(match.group(1))
    year = int(match.group(2))

    # Get the Saturday of the given ISO week
    # ISO week 1 is the week containing January 4th
    # We use %G (ISO year) and %V (ISO week) to get Monday of that week, then add 5 days for Saturday
    monday_of_week = datetime.strptime(f'{year}-W{week_num:02d}-1', '%G-W%V-%u')
    saturday_of_week = monday_of_week + timedelta(days=5)

    return saturday_of_week.strftime('%Y-%m-%d')

last_updated_date = "2026-01-25"
folder_path = '/Users/holdenbridge/Desktop/golfer-recent-performance/owgr_historical_records'

# Start from the previously saved dictionary when there is one, else from scratch
if os.path.exists(f'{folder_path}/owgr_dict.json'):
    with open(f'{folder_path}/owgr_dict.json', 'r') as f:
        owgr_dict = json.load(f)
else:
    owgr_dict = {}

# Fold every PDF found in the folder into the dictionary, keyed by the date
# derived from its filename
for file in os.listdir(folder_path):
    if file.endswith('.pdf'):
        owgr_df = parse_owgr_pdf(f'{folder_path}/{file}', date=last_updated_date, top_n=200)
        owgr_dict = add_owgr_to_dict(owgr_df, date_str=get_owgr_date(file), existing_dict=owgr_dict)
        print(f"Finished Loading {file}")

# Write the merged dictionary back to disk
with open(f'{folder_path}/owgr_dict.json', 'w') as f:
    json.dump(owgr_dict, f, indent=2)

Finished Loading owgr01f2026.pdf
Finished Loading owgr04f2026.pdf
Finished Loading owgr02f2026.pdf
Finished Loading owgr03f2026.pdf


In [5]:
def create_player_window(owgr_dict, player_name, window_size=10):
    """
    Build sliding before/after windows over one player's Avg_Points history.

    The player's dates are sorted ascending. For each start position, the
    "before" window is the next window_size dates and the "after" window
    is the window_size dates that follow it. Sliding stops at the first
    position where either window would be shorter than window_size.

    Returns a DataFrame with one row per position holding 'PlayerName',
    'WindowSize', 'ChangeBefore' and 'ChangeAfter', where each change is
    the window's last Avg_Points minus its first. The frame has no columns
    when the player has too few dates for a single pair of windows.
    """
    history = owgr_dict[player_name]
    timeline = sorted(history)  # date keys in ascending order

    def avg_points_change(dates):
        # Avg_Points on the window's last date minus the value on its first
        first_value = history[dates[0]]['Avg_Points']
        last_value = history[dates[-1]]['Avg_Points']
        return last_value - first_value

    records = []
    start = 0
    while True:
        split = start + window_size
        before_dates = timeline[start:split]
        after_dates = timeline[split:split + window_size]

        # Both windows must be full, otherwise we have run out of history
        if min(len(before_dates), len(after_dates)) < window_size:
            break

        records.append({
            "PlayerName": player_name,
            "WindowSize": window_size,
            "ChangeBefore": avg_points_change(before_dates),
            "ChangeAfter": avg_points_change(after_dates)
        })
        start += 1

    return pd.DataFrame(records)

# Build each player's window table, then concatenate once. Calling pd.concat
# inside the loop would recopy the accumulated frame on every iteration.
player_frames = [
    create_player_window(owgr_dict, player_name, window_size=2)
    for player_name in owgr_dict
]

# pd.concat raises on an empty list, so fall back to an empty frame.
# Row indices are left as produced per player (they repeat across players).
df_all = pd.concat(player_frames) if player_frames else pd.DataFrame()

df_all

Unnamed: 0,PlayerName,WindowSize,ChangeBefore,ChangeAfter
0,Scottie Scheffler,2,-0.3586,0.9404
0,Rory McIlroy,2,0.0117,-0.3253
0,Tommy Fleetwood,2,0.0224,-0.1731
0,Xander Schauffele,2,-0.1514,-0.1467
0,Russell Henley,2,-0.0963,0.1356
...,...,...,...,...
0,Taylor Montgomery,2,0.0007,-0.0136
0,Marcus Armitage,2,-0.0104,0.0859
0,Hank Lebioda,2,-0.0125,-0.0124
0,Matthew Jordan,2,-0.0157,-0.0158


In [6]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr

def plot_change_scatter(change_before, change_after, title="Change Before vs Change After"):
    """
    Scatter change_after (x-axis) against change_before (y-axis) and show it.

    A dashed least-squares trend line is drawn over the points; its legend
    entry reports the Pearson correlation and p-value.

    Args:
        change_before: array-like, values for y-axis
        change_after: array-like, values for x-axis
        title: str, plot title

    Returns:
        (corr, p_value) as returned by scipy.stats.pearsonr
    """
    # Correlation first, so a failure here happens before any figure is created
    corr, p_value = pearsonr(change_after, change_before)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(change_after, change_before, alpha=0.6, edgecolors='k', linewidth=0.5)

    # Degree-1 fit evaluated across the observed x range
    trend = np.poly1d(np.polyfit(change_after, change_before, 1))
    x_line = np.linspace(min(change_after), max(change_after), 100)
    legend_text = f'Correlation: {corr:.3f} (p={p_value:.3f})'
    ax.plot(x_line, trend(x_line), 'r--', alpha=0.8, label=legend_text)

    # Faint axes through the origin
    zero_line_style = {'color': 'gray', 'linestyle': '-', 'alpha': 0.3}
    ax.axhline(y=0, **zero_line_style)
    ax.axvline(x=0, **zero_line_style)

    ax.set_xlabel('Change After')
    ax.set_ylabel('Change Before')
    ax.set_title(title)
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return corr, p_value

# Usage example: plot every window pair collected in df_all and keep the
# returned correlation coefficient and p-value.
corr, p = plot_change_scatter(df_all['ChangeBefore'], df_all['ChangeAfter'], title="OWGR Points Change")

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'scipy'