In [3]:
import pandas as pd

# Load data from the parquet file
def load_data(path='../data/all-data.parquet'):
    """Read a Parquet file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Parquet file. Defaults to '../data/all-data.parquet',
        so existing zero-argument calls keep reading the same file.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_parquet`` returns for ``path``.
    """
    return pd.read_parquet(path)
# Module-level frame used by the later cells in this notebook
data = pd.read_parquet(path='../data/all-data.parquet')

In [None]:

# Generalized aggregation function with dynamic level of aggregation
def aggregate_data(data, aggregation_level, measures=None):
    """Sum the measure columns of ``data`` at the requested aggregation level.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the grouping columns of the chosen level and the
        measure columns.
    aggregation_level : str
        One of 'Pl', 'TEG', 'Round' or 'FrontBack'; selects the grouping columns.
    measures : str or iterable of str, optional
        Column(s) to sum. Defaults to ['Sc', 'GrossVP', 'NetVP', 'Stableford'].

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping columns followed by the summed
        measures, sorted by the grouping columns.

    Raises
    ------
    ValueError
        If ``aggregation_level`` is not one of the known levels.
    """
    levels = {
        'Pl': ['Pl', 'Player'],
        'TEG': ['Pl', 'TEG', 'Player', 'TEGNum'],
        'Round': ['Pl', 'TEG', 'Round', 'Player', 'TEGNum'],
        'FrontBack': ['Pl', 'TEG', 'Round', 'FrontBack', 'Player', 'TEGNum']
    }

    if aggregation_level not in levels:
        raise ValueError(f"Invalid aggregation level: {aggregation_level}. Choose from: {list(levels.keys())}")

    # A None sentinel avoids sharing one mutable default list between calls.
    if measures is None:
        measures = ['Sc', 'GrossVP', 'NetVP', 'Stableford']
    elif isinstance(measures, str):
        measures = [measures]
    else:
        # Normalise tuples and other iterables: a groupby selection treats a
        # tuple as a single key rather than as a list of columns.
        measures = list(measures)

    group_columns = levels[aggregation_level]
    return data.groupby(group_columns, as_index=False)[measures].sum().sort_values(by=group_columns)

# Read the source data again so this cell does not depend on an earlier one
data = pd.read_parquet(path='../data/all-data.parquet')

# Totals grouped at the 'TEG' level, followed by the number of groups
teg_data = aggregate_data(data, aggregation_level='TEG')
print(teg_data.head())
print(len(teg_data))

# Totals grouped at the 'Round' level
rd_data = aggregate_data(data, aggregation_level='Round')
print(rd_data.head())
print(len(rd_data))

# Totals grouped at the 'FrontBack' level
nine_data = aggregate_data(data, aggregation_level='FrontBack')
print(nine_data.head())
print(len(nine_data))

In [20]:
import pandas as pd
from utils import aggregate_data, format_vs_par

# Read the full data set from the Parquet file
all_data = pd.read_parquet('../data/all-data.parquet')

# Exclude TEG 2 and TEG 50 (the name is rebound to the filtered frame)
all_data = all_data.loc[~all_data['TEG'].isin(['TEG 2', 'TEG 50'])]

# Sum the measures at the 'TEG' aggregation level
teg_data = aggregate_data(all_data, 'TEG')

# Columns to show and how many rows to keep
teg_fields = ['Player', 'TEG', 'GrossVP']
n_keep = 10

# Keep the n_keep lowest 'GrossVP' totals, ordered from lowest to highest
lowest_sc_rows = (
    teg_data[teg_fields]
    .nsmallest(n_keep, 'GrossVP')
    .sort_values(by='GrossVP', ascending=True)
)

# Rank with method='min' so tied rows share the lowest rank, then flag ties with '='
lowest_sc_rows = lowest_sc_rows.assign(
    Rank=lowest_sc_rows['GrossVP'].rank(method='min').astype(int).astype(str)
)
lowest_sc_rows.loc[lowest_sc_rows['Rank'].duplicated(keep=False), 'Rank'] += '='

# Put 'Rank' first, present 'GrossVP' as 'Gross' and format it for display
lowest_sc_rows = (
    lowest_sc_rows[['Rank', 'Player', 'TEG', 'GrossVP']]
    .rename(columns={'GrossVP': 'Gross'})
)
lowest_sc_rows['Gross'] = lowest_sc_rows['Gross'].apply(format_vs_par)

# Show the rows with the lowest 'GrossVP' totals
print(lowest_sc_rows)


def find_lowest_sc_rows(data, level_of_aggregation, fields_to_keep, top_n=10):
    """Return the ``top_n`` rows with the lowest aggregated 'GrossVP'.

    ``data`` is aggregated with ``aggregate_data`` at ``level_of_aggregation``
    and reduced to ``fields_to_keep`` (which must include 'GrossVP'). The
    result has a leading 'Rank' column (tied ranks carry a trailing '='), and
    'GrossVP' is renamed to 'Gross' and passed through ``format_vs_par``.
    """
    totals = aggregate_data(data, level_of_aggregation)

    # Lowest 'GrossVP' first
    best = totals[fields_to_keep].nsmallest(top_n, 'GrossVP')
    best = best.sort_values(by='GrossVP', ascending=True)

    # method='min' gives tied rows the same (lowest) rank; ties are marked with '='
    rank_text = best['GrossVP'].rank(method='min').astype(int).astype(str)
    is_tied = rank_text.duplicated(keep=False)
    best['Rank'] = rank_text.where(~is_tied, rank_text + '=')

    # 'Rank' goes first; the score column is shown under the name 'Gross'
    best = best[['Rank'] + fields_to_keep].rename(columns={'GrossVP': 'Gross'})
    best['Gross'] = best['Gross'].apply(format_vs_par)

    return best


# Lowest 'GrossVP' totals at the 'Round' aggregation level
rd_fields = ['Player', 'TEG', 'Round', 'GrossVP']
lowest_rounds = find_lowest_sc_rows(
    all_data,
    level_of_aggregation='Round',
    fields_to_keep=rd_fields,
)
print(lowest_rounds)


ImportError: cannot import name 'format_vs_par' from 'utils' (c:\Users\JBA33\OneDrive - Sky\Documents\python\TEG\streamlit\utils.py)

In [21]:
import pandas as pd
from utils import aggregate_data, format_vs_par

# Read the Parquet file and drop the rows belonging to TEG 2 and TEG 50
all_data = (
    pd.read_parquet('../data/all-data.parquet')
    .loc[lambda frame: ~frame['TEG'].isin(['TEG 2', 'TEG 50'])]
)



def find_lowest_sc_rows(data, level_of_aggregation, fields_to_keep, field='GrossVP', top_n=10):
    """Return the ``top_n`` best rows for ``field`` after aggregating ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed to ``aggregate_data``.
    level_of_aggregation : str
        Aggregation level passed to ``aggregate_data``.
    fields_to_keep : list of str
        Descriptive columns to include in the result. The list is not modified.
    field : str, optional
        Measure to rank on: 'GrossVP', 'NetVP', 'Sc' or 'Stableford'.
        'Stableford' is ranked highest-first, the others lowest-first.
    top_n : int, optional
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns are 'Rank', then ``fields_to_keep``, then the companion column
        for ``field`` (if it has one), then ``field`` itself under its display
        name and formatted. Tied ranks carry a trailing '='.

    Raises
    ------
    ValueError
        If ``field`` is not one of the supported measures.
    """
    # Display name, sort direction, formatter and companion column per measure
    field_properties = {
        'GrossVP': {'new_name': 'Gross', 'ascending': True, 'formatter': format_vs_par, 'additional_field': 'Sc'},
        'NetVP': {'new_name': 'Net', 'ascending': True, 'formatter': format_vs_par, 'additional_field': None},
        'Sc': {'new_name': 'Gross Score', 'ascending': True, 'formatter': lambda x: int(x), 'additional_field': 'GrossVP'},
        'Stableford': {'new_name': 'Stableford', 'ascending': False, 'formatter': lambda x: int(x), 'additional_field': None},
    }

    # Validate before doing any aggregation work
    properties = field_properties.get(field)
    if not properties:
        raise ValueError(f"Invalid field: {field}")

    aggregated_data = aggregate_data(data, level_of_aggregation)

    # Build the column list on a copy: extending fields_to_keep in place would
    # leak columns into the caller's list and, across repeated calls, select
    # the same column twice.
    all_fields = list(fields_to_keep)
    for extra in (properties['additional_field'], field):
        if extra and extra not in all_fields:
            all_fields.append(extra)

    # Best rows first, following the measure's sort direction
    sorted_data = (aggregated_data[all_fields]
                   .sort_values(by=field, ascending=properties['ascending'])
                   .head(top_n))

    # Rank in the same direction; method='min' gives tied rows the same rank.
    # NOTE(review): ties are detected only among the returned rows, so a row
    # tied with one just outside top_n gets no '=' - confirm this is intended.
    sorted_data['Rank'] = sorted_data[field].rank(ascending=properties['ascending'], method='min').astype(int).astype(str)
    sorted_data.loc[sorted_data.duplicated('Rank', keep=False), 'Rank'] += '='

    # 'Rank' goes first; the ranked measure is shown under its display name
    sorted_data = sorted_data[['Rank'] + all_fields].rename(columns={field: properties['new_name']})

    # Format the ranked measure for display
    sorted_data[properties['new_name']] = sorted_data[properties['new_name']].apply(properties['formatter'])

    return sorted_data

# Number of rows per table and the descriptive columns shared by every table
n_keep = 10
rd_fields = ['Player', 'TEG', 'Round']

# Rounds ranked on 'GrossVP'
lowest_rounds_gross = find_lowest_sc_rows(
    all_data, 'Round', rd_fields, field='GrossVP', top_n=n_keep
)
print('\nBest Gross')
print(lowest_rounds_gross)

# Show the shared field list as it stands after the first call
print('rd_fields')
print(rd_fields)

# Rounds ranked on 'Sc'
lowest_rounds_sc = find_lowest_sc_rows(
    all_data, 'Round', rd_fields, field='Sc', top_n=n_keep
)
print('\nBest Score')
print(lowest_rounds_sc)

# Rounds ranked on 'NetVP'
lowest_rounds_net = find_lowest_sc_rows(
    all_data, 'Round', rd_fields, field='NetVP', top_n=n_keep
)
print('\nBest Net')
print(lowest_rounds_net)

# Rounds ranked on 'Stableford'
best_rounds_stableford = find_lowest_sc_rows(
    all_data, 'Round', rd_fields, field='Stableford', top_n=n_keep
)
print('\n=======\nBest Stableford\n========')
print(best_rounds_stableford)

ImportError: cannot import name 'format_vs_par' from 'utils' (c:\Users\JBA33\OneDrive - Sky\Documents\python\TEG\streamlit\utils.py)

In [6]:
df = data

# List every column name, one per line
print("\nColumns in the DataFrame:")
for col in list(df.columns):
    print("- {}".format(col))

# Preview the top of the frame
print("\nFirst few rows of the DataFrame:")
print(df.head())



Columns in the DataFrame:
- TEG
- Round
- Hole
- PAR
- SI
- Pl
- Sc
- HC
- HCStrokes
- GrossVP
- Net
- NetVP
- Stableford
- TEGNum
- HoleID
- Player
- FrontBack
- Date
- Course
- Hole Order Ever
- Sc Cum Round
- Sc Cum TEG
- Sc Cum Career
- GrossVP Cum Round
- GrossVP Cum TEG
- GrossVP Cum Career
- NetVP Cum Round
- NetVP Cum TEG
- NetVP Cum Career
- Stableford Cum Round
- Stableford Cum TEG
- Stableford Cum Career
- TEG Count
- Career Count
- Sc Round Avg
- Sc TEG Avg
- Sc Career Avg
- GrossVP Round Avg
- GrossVP TEG Avg
- GrossVP Career Avg
- NetVP Round Avg
- NetVP TEG Avg
- NetVP Career Avg
- Stableford Round Avg
- Stableford TEG Avg
- Stableford Career Avg

First few rows of the DataFrame:
     TEG  Round  Hole  PAR  SI  Pl   Sc    HC  HCStrokes  GrossVP  ...  \
0  TEG 7      1     1    5   7  AB  8.0  36.0          2      3.0  ...   
1  TEG 7      1     2    3  13  AB  4.0  36.0          2      1.0  ...   
2  TEG 7      1     3    5  11  AB  6.0  36.0          2      1.0  ...   