In [125]:
import pandas as pd
from typing import Optional

In [126]:
def get_category_in_taxonomy(taxonomy_df: pd.DataFrame, category_code: str, taxonomy_index: int) -> str:
    # get the value from the dataframe
    taxonomy_res = taxonomy_df.loc[taxonomy_df['category_code'] == category_code, 'overture_taxonomy']

    # if something to work with
    if len(taxonomy_res) > 0:

        # get the list out of the result
        taxonomy_lst = taxonomy_res.iat[0]

        # pull out the code from the taxonomy at the index
        taxonomy_code = taxonomy_lst[taxonomy_index] if taxonomy_index < len(taxonomy_lst) else None

    else:
        taxonomy_code = None

    return taxonomy_code

def get_overture_taxonomy_dataframe():
    """
    Retrieve the Overture categories taxonomy as a pandas DataFrame.

    Returns:
        DataFrame containing the Overture categories taxonomy.
    """
    # Use the raw GitHub URL for the CSV file
    url = "https://raw.githubusercontent.com/OvertureMaps/schema/main/docs/schema/concepts/by-theme/places/overture_categories.csv"
    
    # Read the CSV using semicolon as delimiter
    df = pd.read_csv(url, sep=';', header=0, dtype='string')
    
    # format the column names
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    
    # Convert the 'Overture Taxonomy' column from string to actual list of strings
    df['overture_taxonomy'] = df['overture_taxonomy'].apply(lambda val: val.strip().strip('[]').split(','))
    
    # strip whitespace from each taxonomy item
    df['overture_taxonomy'] = df['overture_taxonomy'].apply(lambda lst: [item.strip() for item in lst])
    
    # get the maximum depth for category taxonomies
    df['list_length'] = df['overture_taxonomy'].str.len()
    max_lst_len = df['list_length'].max()

    # iterate into maximum depth for 
    for idx in range(max_lst_len):

        # create the name for the category
        col_nm = f'category_{idx + 1:02d}'
    
        # get the overture taxonomy value for the index corresponding to the code
        df[col_nm] = df['category_code'].apply(lambda cat_code: get_category_in_taxonomy(df, cat_code, idx))

    return df


def get_overture_taxonomy(df: Optional[pd.DataFrame] = None) -> dict[str, dict[str, str]]:
    """
    Retrieve the Overture categories taxonomy as a nested dictionary.

    Returns:
        Nested dictionary containing the Overture categories taxonomy.
    """
    # get the taxonomy dataframe if not provided
    if df is None:
        df = get_overture_taxonomy_dataframe()

    # set index for the dataframe
    df.set_index('category_code', inplace=True, drop=True)

    # only keep columns describing category levels and convert to dict
    cols = [c for c in df.columns if c.startswith('category')]
    taxonomy_dict = df.loc[:,cols].to_dict(orient='index')

    return taxonomy_dict


def get_overture_taxonomy_category_field_max_lengths(df: Optional[pd.DataFrame] = None) -> dict[str, int]:
    """
    Retrieve the maximum lengths of each category field in the Overture taxonomy.

    Returns:
        Dictionary containing the maximum lengths of each category field.
    """
    # get the taxonomy dataframe if not provided
    if df is None:
        df = get_overture_taxonomy_dataframe()

    # only keep columns describing category levels
    cols = [c for c in df.columns if c.startswith('category')]

    # create dictionary to hold max lengths
    max_lengths = {}

    # iterate through the columns and get the max length
    for col in cols:
        max_len = df[col].str.len().max()
        max_lengths[col] = int(max_len) if pd.notnull(max_len) else 0

    return max_lengths

In [127]:
taxonomy_df = get_overture_taxonomy_dataframe()

In [128]:
taxonomy_dict = get_overture_taxonomy(taxonomy_df)

In [129]:
get_overture_taxonomy_category_field_max_lengths(taxonomy_df)

{'category_01': 37,
 'category_02': 41,
 'category_03': 52,
 'category_04': 55,
 'category_05': 38,
 'category_06': 26}