In [115]:
"""
#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings
# based on some processes in analyze_loc_matches_local.ipynb
# focusing on summarization to help select candidate counties
"""

'\n#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings\n# based on some processes in analyze_loc_matches_local.ipynb\n# focusing on summarization to help select candidate counties\n'

In [116]:
import pandas as pd
import matplotlib
import re

In [117]:
# Tab-separated TORCH occurrence snapshot that already carries the
# bels_location_string column used throughout this notebook.
bels_locs_path = '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/data/TORCH-data_snapshots_TX_OK_2024-12-06/torch_bels_locs.tsv'
df_occ = pd.read_csv(
    bels_locs_path,
    sep='\t',
    low_memory=False,
)

In [118]:
df_occ.shape

(1146433, 119)

In [119]:
# Drop irrelevant columns.
# NOTE: not all of these columns occur across all collections, which is why
# the drop below uses errors='ignore' (missing names are silently skipped).
drop_columns = [
    # taxonomy / identification
    'higherClassification', 'kingdom', 'phylum', 'class', 'order',
    'identificationReferences', 'identificationRemarks', 'taxonRemarks',
    'identificationQualifier', 'typeStatus',
    # event / specimen details
    'fieldNumber', 'eventID', 'informationWithheld', 'dataGeneralizations',
    'dynamicProperties', 'associatedSequences', 'associatedTaxa',
    'reproductiveCondition', 'establishmentMeans', 'lifeStage', 'sex',
    'individualCount', 'samplingProtocol', 'preparations',
    # broad geography
    'continent', 'waterBody', 'islandGroup', 'island',
    # rights / record metadata
    'rights', 'rightsHolder', 'accessRights', 'recordID', 'type', 'license',
    'bibliographicCitation', 'datasetName', 'fieldNotes', 'countryCode',
    'nomenclaturalCode', 'nomenclaturalStatus', 'associatedMedia',
    'higherGeography', 'institutionID', 'georeferencedDate', 'datasetID',
    'occurrenceStatus', 'verbatimLocality', 'organismID',
    'previousIdentifications', 'eventTime', 'eventRemarks',
    'locationAccordingTo', 'verbatimCoordinateSystem', 'footprintWKT',
    # geological context
    'earliestEonOrLowestEonothem', 'earliestEraOrLowestErathem',
    'earliestPeriodOrLowestSystem', 'earliestEpochOrLowestSeries',
    'earliestAgeOrLowestStage', 'group', 'formation', 'member',
    # misc identifiers
    'identificationVerificationStatus', 'scientificNameID',
]
df_occ = df_occ.drop(columns=drop_columns, errors='ignore')

In [120]:
df_occ.shape

(1146433, 62)

In [121]:
# Find records that share their BELS location string with at least one other
# record. keep=False marks every member of a duplicated set, not only the
# second and later occurrences.
df_matches = df_occ[df_occ.duplicated(subset='bels_location_string', keep=False)]

In [122]:
df_matches.shape

(812448, 62)

In [123]:
# Add a location ID: one integer per distinct BELS location string.
# assign() hands back a new DataFrame, so the column is never written onto a
# slice of df_occ.
# https://stackoverflow.com/a/51110197 or https://stackoverflow.com/a/51110205
df_matches = df_matches.assign(
    loc_id=df_matches.groupby('bels_location_string').ngroup()
)

In [124]:
# dup_loc_count: for every record, the number of records that share its
# BELS location string, i.e. the size of its location cluster.
# https://stackoverflow.com/a/46768694
# TODO - better name and documentation?
df_matches['dup_loc_count'] = (
    df_matches
    .groupby('bels_location_string')
    .transform('size')
)

In [125]:
# find groups that have at least one georeference
# from https://claude.ai/chat/f9eab37d-6cc0-459c-9d24-fd78d6152d10

def filter_locations_with_coordinates(df):
    """
    Keep every record belonging to a location group that has coordinates.

    A location group is the set of rows sharing a 'bels_location_string'.
    A row counts as having coordinates when its 'decimalLatitude' is neither
    missing nor an empty string.
    NOTE(review): only latitude is inspected; 'decimalLongitude' is not checked.

    Parameters:
    df (pandas.DataFrame): Must contain 'bels_location_string' and
                           'decimalLatitude' columns.

    Returns:
    pandas.DataFrame: A copy of the rows from groups with at least one
                      row with coordinates, plus a 'recs_w_geo_count' column
                      giving the number of such rows in the row's group.
    """
    latitude = df['decimalLatitude']
    has_latitude = latitude.notna() & (latitude != '')

    # Number of rows with a latitude per location string; groups without any
    # such row are simply absent from this Series.
    coord_counts = df[has_latitude].groupby('bels_location_string').size()

    # Membership in coord_counts' index is exactly "group has >= 1 coordinate".
    in_group_with_coords = df['bels_location_string'].isin(coord_counts.index)
    filtered_df = df[in_group_with_coords].copy()

    # Attach the per-group count to every row of the group.
    filtered_df['recs_w_geo_count'] = filtered_df['bels_location_string'].map(coord_counts)

    return filtered_df


# Keep the location groups in which at least one record has coordinates,
# then report the shape before and after.
df_matches_wgeo = filter_locations_with_coordinates(df_matches)

print("\nOriginal DataFrame:", df_matches.shape, sep='\n')
print(
    "\nFiltered DataFrame (locations with at least one set of coordinates):",
    df_matches_wgeo.shape,
    sep='\n',
)


Original DataFrame:
(812448, 64)

Filtered DataFrame (locations with at least one set of coordinates):
(500081, 65)


In [126]:
# find groups in which no record has a georeference
# from https://claude.ai/chat/f9eab37d-6cc0-459c-9d24-fd78d6152d10
#import pandas as pd

def filter_locations_without_coordinates(df):
    """
    Keep every record belonging to a location group that has NO coordinates.

    A location group is the set of rows sharing a 'bels_location_string'.
    A row counts as having coordinates when its 'decimalLatitude' is neither
    missing nor an empty string (the same test used by
    filter_locations_with_coordinates).
    NOTE(review): only latitude is inspected; 'decimalLongitude' is not checked.

    Parameters:
    df (pandas.DataFrame): Must contain 'bels_location_string' and
                           'decimalLatitude' columns.

    Returns:
    pandas.DataFrame: A copy of the rows from groups in which no row has
                      coordinates. Rows whose 'bels_location_string' is
                      missing belong to no group and are excluded.
    """
    latitude = df['decimalLatitude']
    has_coords = latitude.notna() & (latitude != '')

    # Location strings that appear on at least one row with coordinates.
    # Computed with one vectorized mask instead of a Python lambda per group.
    locations_with_coords = df.loc[has_coords, 'bels_location_string'].unique()

    location = df['bels_location_string']
    # A group has no coordinates exactly when its string is not in the set
    # above; rows with a missing location string are dropped explicitly.
    in_group_without_coords = location.notna() & ~location.isin(locations_with_coords)

    # Return a copy so callers can add columns without touching `df`.
    filtered_df = df[in_group_without_coords].copy()

    return filtered_df

#df = pd.DataFrame(data)

# Keep only the location groups in which no record has coordinates,
# then report the shape before and after.
df_matches_wo_geo = filter_locations_without_coordinates(df_matches)

print("\nOriginal DataFrame:")
print(df_matches.shape)
# The label previously repeated the "at least one set of coordinates" text
# from the cell above, which described the opposite filter.
print("\nFiltered DataFrame (locations with no coordinates in any record):")
print(df_matches_wo_geo.shape)

# Print summary
#print("\nSummary of locations without any coordinates:")
#for loc in df_matches_wo_geo['bels_location_string'].unique():
#    print(f"- {loc}")


Original DataFrame:
(812448, 64)

Filtered DataFrame (locations with at least one set of coordinates):
(312367, 64)


In [127]:
# Drop records whose BELS string is only the state name (case-insensitive).
# Vectorized string comparison replaces a row-wise apply(axis=1), which is
# slow on a frame this size and raised AttributeError on a missing
# stateProvince (NaN has no .lower()).
state_lower = df_matches_wo_geo['stateProvince'].str.lower()
bels_lower = df_matches_wo_geo['bels_location_string'].str.lower()
# A missing value on either side compares as "not equal", so the row is kept.
is_state_only = (state_lower == bels_lower).fillna(False)
df_wo_geo_nostates = df_matches_wo_geo[~is_state_only]


In [128]:
# Remove records whose BELS string is just state + county.
# The key is stateProvince and county concatenated with no separator and
# lower-cased. Whitespace is stripped from the key because the BELS strings
# contain none (e.g. 'texasvalverde...'), so a multi-word county such as
# "Val Verde" could never match before.
# A missing county becomes the text 'nan' (or stays missing), so it will not
# match a real location string in practice.
state_county_key = (
    df_wo_geo_nostates['stateProvince'].astype(str)
    + df_wo_geo_nostates['county'].astype(str)
).str.replace(r'\s+', '', regex=True).str.lower()
bels_lower = df_wo_geo_nostates['bels_location_string'].str.lower()
is_state_county_only = (state_county_key == bels_lower).fillna(False)
df_wo_geo_nostatecounty = df_wo_geo_nostates[~is_state_county_only]


In [131]:
def filtered_mean(x):
    """
    Mean of the values in `x` after discarding the extremes.

    Values <= 1 are dropped first, then every occurrence of the largest
    remaining value is dropped. Returns pd.NA when `x` has at most one
    element or when nothing survives either filtering step.
    """
    if len(x) > 1:
        above_one = x[x > 1]
        if len(above_one) > 0:
            below_peak = above_one[above_one < above_one.max()]
            if len(below_peak) > 0:
                return below_peak.mean()
    return pd.NA

In [168]:
def filtered_median(x):
    """
    Median of the values in `x` after discarding the extremes.

    Values <= 1 are dropped first, then every occurrence of the largest
    remaining value is dropped. Returns pd.NA when `x` has at most one
    element or when nothing survives either filtering step.
    """
    if len(x) > 1:
        above_one = x[x > 1]
        if len(above_one) > 0:
            below_peak = above_one[above_one < above_one.max()]
            if len(below_peak) > 0:
                return below_peak.median()
    return pd.NA

In [176]:
def normalize_county_names(df, county_column):
    """
    Normalize county names by removing question marks and a trailing
    'County' / 'Co.' / 'Co' / 'Parish', then converting to title case.

    The 'Co' abbreviation is only removed when it is a separate token
    (preceded by whitespace, or the whole value). Without that guard a name
    that merely ends in the letters "co" was truncated ("Blanco" -> "Blan").

    Parameters:
    df (pandas.DataFrame): DataFrame containing county names
    county_column (str): Name of the column containing county names

    Returns:
    pandas.Series: Normalized county names, indexed like `df`; values that
                   are missing or end up empty are returned as pd.NA
    """
    # Missing values become '' so the string operations below are safe;
    # they are turned back into pd.NA at the end.
    normalized = df[county_column].fillna('').astype(str).str.strip()

    # Applied in order, case-insensitively. Question marks are removed first
    # so that e.g. "Harris Co.?" still ends in "Co." for the later patterns.
    patterns = [
        (r'\?+', ''),                  # one or more question marks anywhere
        (r'\s*county\s*$', ''),        # trailing "County"
        (r'(?:^|\s+)co\.?\s*$', ''),   # trailing standalone "Co" / "Co."
        (r'\s*parish\s*$', ''),        # trailing "Parish"
    ]

    for pattern, replacement in patterns:
        normalized = normalized.str.replace(pattern, replacement, case=False, regex=True)

    # Convert to title case and strip any remaining whitespace
    normalized = normalized.str.title().str.strip()

    # Replace empty strings back with a missing value
    normalized = normalized.replace('', pd.NA)

    return normalized

In [177]:
# Build a new frame that carries the normalized county name alongside the
# original 'county' column; assign() returns a copy, so
# df_wo_geo_nostatecounty itself is left unchanged.
df_no_geo_normalized = df_wo_geo_nostatecounty.assign(
    county_normalized=normalize_county_names(df_wo_geo_nostatecounty, 'county')
)

In [178]:
df_wo_geo_nostatecounty

Unnamed: 0,id,institutionCode,collectionCode,ownerInstitutionCode,collectionID,basisOfRecord,occurrenceID,catalogNumber,otherCatalogNumbers,family,...,verbatimDepth,verbatimElevation,disposition,language,recordEnteredBy,modified,references,bels_location_string,loc_id,dup_loc_count
0,14218975,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,ce2d9b58-9322-479a-b94a-97bf9afde740,BRIT122345,,Equisetaceae,...,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texasnewtonforestofftexas87nearyellowbayouca3m...,96353,2
4,14218979,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,f00e3cf9-1054-49ae-84cb-fedea90f7191,BRIT122349,,Osmundaceae,...,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texashardincollectedfrombigthicketarea,72338,5
11,14218986,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,11faf6fe-c6ce-4b64-965b-f9bba6dd8f92,BRIT217405,,Fabaceae,...,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texasmatagordamadislandmarsh,91540,6
14,14218989,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,fab21772-233b-430a-9ef6-8bcd8c5f65fb,BRIT217402,,Fabaceae,...,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texassmithtylertx,108107,20
15,14218990,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,080d313d-1369-4303-9f2f-38c0af17da8e,BRIT217401,,Fabaceae,...,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texassmithtylertx,108107,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146409,13452611,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,ffebcae7-0385-48e9-b9c3-a77c847cb6ba,LL00376874,,Salicaceae,...,,,,,,2024-06-25 12:55:56,https://portal.torcherbaria.org/portal/collect...,texaspresidioalongriograndebetweenredfordandpr...,101263,6
1146410,13436220,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,ffed45a4-d168-490b-8972-d82c8a3b2ca8,LL00310293,,Ephedraceae,...,,,,,,2024-06-25 12:56:08,https://portal.torcherbaria.org/portal/collect...,texaswebb4milesnorthoflaredotexasmoran'sranch,121205,9
1146414,13442112,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fff0157a-d475-4a8a-ab3d-20bfd5c091a4,LL00330218,,Rhamnaceae,...,,,,,,2024-06-25 12:56:02,https://portal.torcherbaria.org/portal/collect...,texasvalverdemilecanyononeastedgeoflangtry,119282,16
1146416,13438994,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fff18500-8c1e-46f8-9ac8-de334646deab,LL00319073,,Poaceae,...,,,,,,2024-06-25 12:55:25,https://portal.torcherbaria.org/portal/collect...,texasculbersonwallaceprattranchguadalupemtsbet...,58088,3


In [152]:
#df_no_geo_normalized_means = calculate_and_add_county_means(df_no_geo_normalized, 'dup_loc_count', 'county')

In [182]:
# Per-county summary, means only. The next cell rebuilds county_summary with
# the median columns added, so this version is overwritten when both run.
#   total_locations  - number of records with a loc_id
#   unique_locations - number of distinct loc_id values
#   dup_loc_count    - largest location-cluster size seen in the county
county_summary = df_no_geo_normalized.groupby(['stateProvince', 'county_normalized']).agg(**{
    'total_locations': ('loc_id', 'count'),
    'unique_locations': ('loc_id', 'nunique'),
    'dup_loc_count': ('dup_loc_count', 'max'),
    'regular_mean': ('dup_loc_count', 'mean'),
    'filtered_mean': ('dup_loc_count', filtered_mean),
})

In [183]:
# Per-county summary of records without coordinates, one row per
# (stateProvince, county_normalized):
#   total_locations  - number of records with a loc_id
#   unique_locations - number of distinct loc_id values
#   dup_loc_count    - largest location-cluster size seen in the county
#   regular_* / filtered_* - mean and median of dup_loc_count over the
#       county's records, plain and via filtered_mean / filtered_median
county_summary = (
    df_no_geo_normalized
    .groupby(['stateProvince', 'county_normalized'])
    .agg(
        total_locations=pd.NamedAgg(column='loc_id', aggfunc='count'),
        unique_locations=pd.NamedAgg(column='loc_id', aggfunc='nunique'),
        dup_loc_count=pd.NamedAgg(column='dup_loc_count', aggfunc='max'),
        regular_mean=pd.NamedAgg(column='dup_loc_count', aggfunc='mean'),
        filtered_mean=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_mean),
        regular_median=pd.NamedAgg(column='dup_loc_count', aggfunc='median'),
        filtered_median=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_median),
    )
)

In [184]:
# filtered_mean and filtered_median come from aggregations that return pd.NA
# for some counties, which can leave the columns non-numeric. Convert both
# (previously only filtered_mean was converted) so that sorting and
# plotting treat them as numbers.
county_summary['filtered_mean'] = pd.to_numeric(county_summary['filtered_mean']).round(2)
county_summary['filtered_median'] = pd.to_numeric(county_summary['filtered_median'])
county_summary['regular_mean'] = county_summary['regular_mean'].round(2)

In [185]:
def _read_county_names(path, description, state):
    """Return the non-blank lines of `path`, stripped and lower-cased, as a set.

    A missing file is not an error: a warning is printed and an empty set is
    returned, so every county of `state` simply gets no status from that file.
    """
    try:
        with open(path, 'r') as f:
            stripped = (line.strip() for line in f)
            return {name.lower() for name in stripped if name}
    except FileNotFoundError:
        print(f"Warning: Could not find {description} counties file for {state}")
        return set()


def update_county_status(county_summary, state_files=None):
    """
    Return a copy of county_summary with a 'status' column derived from
    per-state county list files.

    Each county (second index level, compared case-insensitively) gets
    'assigned' if it is listed in the state's 'assigned' file, otherwise
    'not assigned' if it is listed in the state's 'not_assigned' file,
    otherwise None. States without an entry in `state_files` keep None.

    Parameters:
    county_summary (pandas.DataFrame): Summary indexed by a MultiIndex whose
        'stateProvince' level holds the state and whose second level holds
        the county name
    state_files (dict, optional): Maps state name to a dict with 'assigned'
        and 'not_assigned' file paths (one county name per line). Defaults
        to the Texas and Oklahoma lists in the TORCH_georeferencing checkout.

    Returns:
    pandas.DataFrame: Copy of county_summary with the new 'status' column;
        the input frame is not modified
    """
    if state_files is None:
        state_files = {
            'Texas': {
                'assigned': '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/texas_counties_assigned.txt',
                'not_assigned': '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/texas_counties_not_assigned.txt'
            },
            'Oklahoma': {
                'assigned': '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/oklahoma_counties_assigned.txt',
                'not_assigned': '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/oklahoma_counties_not_assigned.txt'
            }
        }

    # Work on a copy so the caller's frame is left untouched
    result_df = county_summary.copy()
    result_df['status'] = None

    # Row-aligned views of the two index levels, computed once
    states = result_df.index.get_level_values('stateProvince')
    counties = result_df.index.get_level_values(1).astype(str).str.lower()

    for state, files in state_files.items():
        assigned_counties = _read_county_names(files['assigned'], 'assigned', state)
        not_assigned_counties = _read_county_names(files['not_assigned'], 'not-assigned', state)

        in_state = states == state
        is_assigned = in_state & counties.isin(assigned_counties)
        # 'assigned' takes precedence when a county appears in both files
        is_not_assigned = in_state & counties.isin(not_assigned_counties) & ~is_assigned

        result_df.loc[is_assigned, 'status'] = 'assigned'
        result_df.loc[is_not_assigned, 'status'] = 'not assigned'

    return result_df

In [186]:
# Tag each county in the summary with its assignment status.
# update_county_status() reads the Texas and Oklahoma county list files itself
# (an earlier version took the assigned / not-assigned file paths as arguments).
county_summary_status = update_county_status(county_summary)

In [187]:
# Write the summary as a tab-separated file in the current working directory.
county_summary_status.to_csv('TORCH_no_geo_county_summary.tsv', sep='\t')

In [188]:
county_summary_status

Unnamed: 0_level_0,Unnamed: 1_level_0,total_locations,unique_locations,dup_loc_count,regular_mean,filtered_mean,regular_median,filtered_median,status
stateProvince,county_normalized,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Oklahoma,Adair,1381,294,57,14.77,12.95,7.0,6.0,
Oklahoma,Alfalfa,265,72,18,6.41,5.57,4.0,4.0,
Oklahoma,Angelina,2,1,2,2.00,,2.0,,
Oklahoma,Atoka,1227,255,107,13.01,12.93,7.0,7.0,
Oklahoma,Beaver,267,59,30,9.62,7.04,6.0,5.0,
...,...,...,...,...,...,...,...,...,...
Texas,Yoakum,26,9,5,3.23,2.81,3.0,3.0,assigned
Texas,Young,105,22,34,14.49,5.14,8.0,4.0,assigned
Texas,Zapata,467,124,28,6.91,5.56,4.0,4.0,not assigned
Texas,Zavala,237,65,20,6.60,5.37,4.0,3.0,assigned
