In [1]:
"""
#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings
# based on some processes in analyze_loc_matches_local.ipynb
# focusing on summarization to help select candidate counties
"""

'\n#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings\n# based on some processes in analyze_loc_matches_local.ipynb\n# focusing on summarization to help select candidate counties\n'

In [3]:
import re
from pathlib import Path

import pandas as pd
import matplotlib

In [5]:
# Directory holding the TORCH TX/OK data snapshot.
# Alternate mount point used previously:
#   '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/data/TORCH-data_snapshots_TX_OK_2024-12-06/'
input_path = '/media/jbest/data3/BRIT_git/TORCH_georeferencing/data/TORCH-data_snapshots_TX_OK_2024-12-06/'
# Tab-separated occurrence table augmented with BELS metrics.
input_filename = 'torch_bels_metrics.tsv'
# Full path to the input file.
bels_locs_path = Path(input_path).joinpath(input_filename)

In [6]:
# Read the BELS-augmented occurrence table (tab-separated) into a DataFrame.
# low_memory=False has pandas infer column dtypes from the whole file instead of chunk by chunk.
df_occ = pd.read_csv(
    bels_locs_path,
    sep='\t',
    low_memory=False,
)

In [7]:
# (rows, columns) of the table as loaded.
df_occ.shape

(1146433, 124)

In [8]:
# Drop columns that are irrelevant to the locality analysis.
# NOTE: not every column occurs in every collection's export, hence errors='ignore' below.
drop_columns = [
    'higherClassification', 'kingdom', 'phylum', 'class', 'order',
    'identificationReferences', 'identificationRemarks', 'taxonRemarks',
    'identificationQualifier', 'typeStatus', 'fieldNumber', 'eventID',
    'informationWithheld', 'dataGeneralizations', 'dynamicProperties',
    'associatedSequences', 'associatedTaxa', 'reproductiveCondition',
    'establishmentMeans', 'lifeStage', 'sex', 'individualCount',
    'samplingProtocol', 'preparations', 'continent', 'waterBody',
    'islandGroup', 'island', 'rights', 'rightsHolder', 'accessRights',
    'recordID', 'type', 'license', 'bibliographicCitation', 'datasetName',
    'fieldNotes', 'countryCode', 'nomenclaturalCode', 'nomenclaturalStatus',
    'associatedMedia', 'higherGeography', 'institutionID',
    'georeferencedDate', 'datasetID', 'occurrenceStatus', 'verbatimLocality',
    'organismID', 'previousIdentifications', 'eventTime', 'eventRemarks',
    'locationAccordingTo', 'verbatimCoordinateSystem', 'footprintWKT',
    'earliestEonOrLowestEonothem', 'earliestEraOrLowestErathem',
    'earliestPeriodOrLowestSystem', 'earliestEpochOrLowestSeries',
    'earliestAgeOrLowestStage', 'group', 'formation', 'member',
    'identificationVerificationStatus', 'scientificNameID',
]
df_occ = df_occ.drop(errors='ignore', columns=drop_columns)

In [9]:
# (rows, columns) after dropping the irrelevant columns; the row count should be unchanged.
df_occ.shape

(1146433, 67)

In [10]:
# Keep every record whose BELS location string is shared with at least one other record.
# keep=False flags all members of a duplicated set, not only the second and later ones.
shares_location = df_occ['bels_location_string'].duplicated(keep=False)
df_matches = df_occ.loc[shares_location]

In [11]:
# (rows, columns) of the records that share a BELS location string with another record.
df_matches.shape

(812448, 67)

In [12]:
# Give every distinct BELS location string an integer group number (loc_id).
# An explicit copy is taken first so the new column is not written to a slice of df_occ.
# ngroup approach: https://stackoverflow.com/a/51110197 or https://stackoverflow.com/a/51110205
df_matches = df_matches.copy()
location_groups = df_matches.groupby('bels_location_string')
df_matches['loc_id'] = location_groups.ngroup()

In [13]:
# dup_loc_count = size of the location cluster each record belongs to, i.e. how many
# records share its BELS location string (https://stackoverflow.com/a/46768694).
# TODO - better name and documentation?
records_per_location = df_matches.groupby('bels_location_string').transform('size')
df_matches['dup_loc_count'] = records_per_location

In [14]:
# find groups that have at least one georeference
# from https://claude.ai/chat/f9eab37d-6cc0-459c-9d24-fd78d6152d10

def filter_locations_with_coordinates(df, group_col='bels_location_string', coord_col='decimalLatitude'):
    """
    Keep all records of every location group that has at least one record
    with a coordinate value.

    A record counts as having coordinates when its `coord_col` value is
    neither missing (None/NaN) nor an empty string. Only `coord_col` is
    inspected; no other coordinate column is checked.

    Parameters:
    df (pandas.DataFrame): Input DataFrame containing `group_col` and `coord_col`
    group_col (str): Column identifying the location group
                     (default 'bels_location_string')
    coord_col (str): Column tested for the presence of a coordinate
                     (default 'decimalLatitude')

    Returns:
    pandas.DataFrame: Copy of the rows of `df` whose location group has at
                      least one coordinate, with an added 'recs_w_geo_count'
                      column holding the number of records with a coordinate
                      in that row's group
    """
    # Rows that carry a usable coordinate value
    has_coord = df[coord_col].notna() & (df[coord_col] != '')

    # Number of records with a coordinate per location group; groups with none are absent
    coord_counts = df[has_coord].groupby(group_col).size()

    # Keep every record (with or without coordinates) of the groups counted above.
    # .copy() so the column added below does not write into a view of `df`.
    filtered_df = df[df[group_col].isin(coord_counts.index)].copy()

    # Attach the per-group count of records with a coordinate to each row
    filtered_df['recs_w_geo_count'] = filtered_df[group_col].map(coord_counts)

    return filtered_df


# Keep only the location groups in which at least one record has coordinates
df_matches_w_geo = filter_locations_with_coordinates(df_matches)

# Report the size of the table before and after filtering
print("\nOriginal DataFrame:", df_matches.shape, sep="\n")
print(
    "\nFiltered DataFrame (locations with at least one set of coordinates):",
    df_matches_w_geo.shape,
    sep="\n",
)


Original DataFrame:
(812448, 69)

Filtered DataFrame (locations with at least one set of coordinates):
(500081, 70)


In [15]:
# find groups that have no georeference
# from https://claude.ai/chat/f9eab37d-6cc0-459c-9d24-fd78d6152d10

def filter_locations_without_coordinates(df, group_col='bels_location_string', coord_col='decimalLatitude'):
    """
    Keep all records of every location group in which NO record has a
    coordinate value.

    A record counts as having coordinates when its `coord_col` value is
    neither missing (None/NaN) nor an empty string. Only `coord_col` is
    inspected. Rows whose `group_col` is missing belong to no group and are
    never returned.

    Parameters:
    df (pandas.DataFrame): Input DataFrame containing `group_col` and `coord_col`
    group_col (str): Column identifying the location group
                     (default 'bels_location_string')
    coord_col (str): Column tested for the presence of a coordinate
                     (default 'decimalLatitude')

    Returns:
    pandas.DataFrame: Copy of the rows of `df` whose location group has no
                      coordinates at all
    """
    # Rows that carry a usable coordinate value
    has_coord = df[coord_col].notna() & (df[coord_col] != '')

    # Location groups that contain at least one such row
    locations_with_coords = df.loc[has_coord, group_col].unique()

    # A group has no coordinates exactly when it is not in that set. This is
    # a vectorised equivalent of testing all() per group, without calling a
    # Python function once per group.
    in_group_without_coords = df[group_col].notna() & ~df[group_col].isin(locations_with_coords)

    # .copy() so callers can add columns without writing into a view of `df`
    return df[in_group_without_coords].copy()

# Keep only the location groups in which no record has coordinates
df_matches_no_geo = filter_locations_without_coordinates(df_matches)

print("\nOriginal DataFrame:")
print(df_matches.shape)
# Label corrected: this frame holds the groups WITHOUT coordinates
print("\nFiltered DataFrame (locations with no coordinates in any record):")
print(df_matches_no_geo.shape)


Original DataFrame:
(812448, 69)

Filtered DataFrame (locations with at least one set of coordinates):
(312367, 69)


In [16]:
# Drop records whose BELS location string is nothing more than the state name.
# Vectorised instead of a row-wise apply: faster, and a missing stateProvince no
# longer raises (NaN has no .lower()). Such rows are simply kept.
# Whitespace is removed from the state name because the BELS strings contain none
# (e.g. 'texasmorrisdaingerfieldstatepark').
state_key = (
    df_matches_no_geo['stateProvince']
    .fillna('')
    .astype(str)
    .str.replace(r'\s+', '', regex=True)
    .str.lower()
)
bels_key_for_state = df_matches_no_geo['bels_location_string'].fillna('').astype(str).str.lower()
# An empty state never counts as a match
is_state_only = (state_key != '') & (state_key == bels_key_for_state)
df_no_geo_nostates = df_matches_no_geo[~is_state_only]


In [17]:
# Drop records whose BELS location string is only state + county (no locality detail).
# Vectorised instead of a row-wise apply. A missing state/county contributes an empty
# string rather than the literal text 'nan', and whitespace is removed so multi-word
# counties (e.g. 'Jeff Davis') can match the whitespace-free BELS strings.
state_part = df_no_geo_nostates['stateProvince'].fillna('').astype(str)
county_part = df_no_geo_nostates['county'].fillna('').astype(str)
state_county_key = (
    (state_part + county_part)
    .str.replace(r'\s+', '', regex=True)
    .str.lower()
)
bels_key_for_county = df_no_geo_nostates['bels_location_string'].fillna('').astype(str).str.lower()
# An empty key never counts as a match
is_state_county_only = (state_county_key != '') & (state_county_key == bels_key_for_county)
df_no_geo_nostatecounty = df_no_geo_nostates[~is_state_county_only]


In [18]:
def filtered_mean(x):
    """
    Mean of the values in `x` that are greater than 1 and strictly below the
    largest of those values.

    Returns pd.NA when `x` has one element or fewer, when no value exceeds 1,
    or when nothing remains after the maximum is excluded.
    """
    if len(x) > 1:
        above_one = x[x > 1]
        if len(above_one) > 0:
            # Exclude every occurrence of the largest value
            below_max = above_one[above_one < above_one.max()]
            if len(below_max) > 0:
                return below_max.mean()
    return pd.NA

In [19]:
def filtered_median(x):
    """
    Median of the values in `x` that are greater than 1 and strictly below
    the largest of those values.

    Returns pd.NA when `x` has one element or fewer, when no value exceeds 1,
    or when nothing remains after the maximum is excluded.
    """
    if len(x) > 1:
        above_one = x[x > 1]
        if len(above_one) > 0:
            # Exclude every occurrence of the largest value
            below_max = above_one[above_one < above_one.max()]
            if len(below_max) > 0:
                return below_max.median()
    return pd.NA

In [20]:
def normalize_county_names(df, county_column):
    """
    Normalize county names by removing question marks and a trailing
    'County' / 'Co.' / 'Co' / 'Parish' designation, then converting to
    title case.

    The abbreviations 'Co' and 'Co.' are only removed when preceded by
    whitespace, so names that merely end in the letters "co" (e.g. 'Blanco')
    are left intact.

    Parameters:
    df (pandas.DataFrame): DataFrame containing county names
    county_column (str): Name of the column containing county names

    Returns:
    pandas.Series: Series containing normalized county names; values that are
                   missing or become empty after normalization are pd.NA
    """
    # Treat missing values as empty strings and force string dtype so the
    # .str operations below apply to every row
    normalized = df[county_column].fillna('').astype(str).str.strip()

    # (pattern, replacement) pairs, applied in order and case-insensitively
    patterns = [
        (r'\?+', ''),              # question marks anywhere in the string
        (r'\s*county\s*$', ''),    # trailing "County"
        (r'\s+co\.?\s*$', ''),     # trailing " Co" / " Co." as a separate word only
        (r'\s*parish\s*$', ''),    # trailing "Parish"
    ]

    for pattern, replacement in patterns:
        normalized = normalized.str.replace(pattern, replacement, case=False, regex=True)

    # Convert to title case and strip any remaining whitespace
    normalized = normalized.str.title().str.strip()

    # Replace empty strings back with a missing-value marker
    normalized = normalized.replace('', pd.NA)

    return normalized

In [21]:
# Groups without coordinates: build a new frame carrying the normalized county
# name in 'county_normalized' (assign returns a copy; the source frame is untouched).
df_no_geo_normalized = df_no_geo_nostatecounty.assign(
    county_normalized=normalize_county_names(df_no_geo_nostatecounty, 'county')
)

In [22]:
# Groups with coordinates: build a new frame carrying the normalized county
# name in 'county_normalized' (assign returns a copy; the source frame is untouched).
df_w_geo_normalized = df_matches_w_geo.assign(
    county_normalized=normalize_county_names(df_matches_w_geo, 'county')
)

In [23]:
# Preview the georeferenced groups with the added county_normalized column.
df_w_geo_normalized

Unnamed: 0,id,institutionCode,collectionCode,ownerInstitutionCode,collectionID,basisOfRecord,occurrenceID,catalogNumber,otherCatalogNumbers,family,...,bels_location_string,bels_location_id,coord_group_id,coord_group_match_count,bels_group_coord_count,bels_group_rec_count,loc_id,dup_loc_count,recs_w_geo_count,county_normalized
6,14218981,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,c00dfba3-4509-4516-8c02-41a4d74d7b02,BRIT217077,,Osmundaceae,...,texasmorrisdaingerfieldstatepark,6,,,25,200,95194,200,25,Morris
7,14218982,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,b510084b-7011-43b8-953a-4916f5a92d7a,BRIT217409,,Fabaceae,...,texaserathhunewellranch,7,2.0,104.0,79,79,64194,79,79,Erath
59,14219035,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,9c4331aa-1246-434b-a809-0d6d4b6f8a2e,BRIT217341,,Fabaceae,...,texaserathhunewellranch,7,2.0,104.0,79,79,64194,79,79,Erath
63,14219039,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,0d575a12-ac2b-48f5-86b0-066dca4a49a2,BRIT217337,,Fabaceae,...,texaserathhunewellrancharea4.,58,2.0,104.0,3,3,64263,3,3,Erath
66,14219042,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,4d4d37ab-5d08-4ada-b640-7668388ae325,BRIT217334,,Fabaceae,...,texascherokeecareylakerd3milesnorthofcuneytx,61,,,1,3,52831,3,1,Cherokee
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146420,27083785,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fff4ede9-9481-4a93-a482-27a1c8c5434a,LL00492158,,Ranunculaceae,...,texasculberson,2651,,,19,2088,56389,2088,19,Culberson
1146422,13436617,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fff5853a-b880-453b-86cb-6db57861f27c,LL00311039,,Potamogetonaceae,...,texasmenardsansabariverjustnortheastofftmckavett,162378,58697.0,5.0,4,7,93704,7,4,Menard
1146424,30071521,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fff98d6b-5b14-4b2b-a32e-f38b6fc24ea8,LL00566769,,Lamiaceae,...,texasaransasgooseisland,48538,,,6,38,35413,38,6,Aransas
1146425,13429866,LL,,,3433d090-b098-4832-92ff-06b8c4b2edfd,PreservedSpecimen,fffa8fc3-322f-4d7c-9e67-9ea56c4ffbb4,LL00288215,,Salicaceae,...,"texasculbersonoffhighwayus62,alongcreeknorthof...",441212,101792.0,2.0,2,2,57742,2,2,Culberson


In [24]:
#df_no_geo_normalized_means = calculate_and_add_county_means(df_no_geo_normalized, 'dup_loc_count', 'county')

In [28]:
# Per state/county summary of the groups without coordinates (means only).
# NOTE: total_locations counts records (non-null loc_id values); unique_locations
# counts distinct location groups.
county_summary = (
    df_no_geo_normalized
    .groupby(['stateProvince', 'county_normalized'])
    .agg(
        total_locations=pd.NamedAgg(column='loc_id', aggfunc='count'),
        unique_locations=pd.NamedAgg(column='loc_id', aggfunc='nunique'),
        dup_loc_count=pd.NamedAgg(column='dup_loc_count', aggfunc='max'),
        regular_mean=pd.NamedAgg(column='dup_loc_count', aggfunc='mean'),
        filtered_mean=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_mean),
    )
)

In [29]:
# Per state/county summary of the groups without coordinates, with both the
# regular and the filtered mean and median of the location cluster size.
# NOTE: total_locations counts records (non-null loc_id values); unique_locations
# counts distinct location groups.
county_summary_no_geo = (
    df_no_geo_normalized
    .groupby(['stateProvince', 'county_normalized'])
    .agg(
        total_locations=pd.NamedAgg(column='loc_id', aggfunc='count'),
        unique_locations=pd.NamedAgg(column='loc_id', aggfunc='nunique'),
        dup_loc_count=pd.NamedAgg(column='dup_loc_count', aggfunc='max'),
        regular_mean=pd.NamedAgg(column='dup_loc_count', aggfunc='mean'),
        filtered_mean=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_mean),
        regular_median=pd.NamedAgg(column='dup_loc_count', aggfunc='median'),
        filtered_median=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_median),
    )
)

In [30]:
# Coerce filtered_mean to a numeric dtype (the aggregation can return pd.NA) and
# round both mean columns to two decimals.
# The values are read from county_summary_no_geo itself, not from the separate
# `county_summary` frame, so this cell depends only on the frame it updates.
county_summary_no_geo['filtered_mean'] = pd.to_numeric(county_summary_no_geo['filtered_mean']).round(2)
county_summary_no_geo['regular_mean'] = county_summary_no_geo['regular_mean'].round(2)

In [31]:
# Per state/county summary of the groups with coordinates, with both the
# regular and the filtered mean and median of the location cluster size.
# NOTE: total_locations counts records (non-null loc_id values); unique_locations
# counts distinct location groups.
county_summary_w_geo = (
    df_w_geo_normalized
    .groupby(['stateProvince', 'county_normalized'])
    .agg(
        total_locations=pd.NamedAgg(column='loc_id', aggfunc='count'),
        unique_locations=pd.NamedAgg(column='loc_id', aggfunc='nunique'),
        dup_loc_count=pd.NamedAgg(column='dup_loc_count', aggfunc='max'),
        regular_mean=pd.NamedAgg(column='dup_loc_count', aggfunc='mean'),
        filtered_mean=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_mean),
        regular_median=pd.NamedAgg(column='dup_loc_count', aggfunc='median'),
        filtered_median=pd.NamedAgg(column='dup_loc_count', aggfunc=filtered_median),
    )
)

In [34]:
def _load_county_names(path, label, state):
    """Read one county name per line from `path` and return them as a lowercase set.

    Blank lines are skipped. If the file does not exist, a warning is printed
    and an empty set is returned.
    """
    try:
        with open(path, 'r') as f:
            return {line.strip().lower() for line in f if line.strip()}
    except FileNotFoundError:
        print(f"Warning: Could not find {label} counties file for {state}")
        return set()


def update_county_status(county_summary, state_files=None):
    """
    Add a 'status' column to a county summary based on per-state text files
    listing assigned and not-assigned counties.

    County names are compared case-insensitively against the second level of
    the index. A county listed in both files is marked 'assigned'. Counties
    found in neither file, and counties of states without files, keep a
    status of None.

    Parameters:
    county_summary (pandas.DataFrame): Summary indexed by a MultiIndex whose
        first level is named 'stateProvince' and whose second level holds the
        county name
    state_files (dict, optional): Mapping of state name to a dict with the keys
        'assigned' and 'not_assigned', each a path to a text file with one
        county name per line. Defaults to the Texas and Oklahoma files of the
        TORCH_georeferencing checkout.

    Returns:
    pandas.DataFrame: Copy of `county_summary` with the new 'status' column
    """
    if state_files is None:
        # Default file paths for each state
        # (previously mounted under /mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/)
        state_files = {
            'Texas': {
                'assigned': '/media/jbest/data3/BRIT_git/TORCH_georeferencing/texas_counties_assigned.txt',
                'not_assigned': '/media/jbest/data3/BRIT_git/TORCH_georeferencing/texas_counties_not_assigned.txt'
            },
            'Oklahoma': {
                'assigned': '/media/jbest/data3/BRIT_git/TORCH_georeferencing/oklahoma_counties_assigned.txt',
                'not_assigned': '/media/jbest/data3/BRIT_git/TORCH_georeferencing/oklahoma_counties_not_assigned.txt'
            }
        }

    # Work on a copy so the caller's DataFrame is not modified
    result_df = county_summary.copy()
    result_df['status'] = None

    # Index levels as flat arrays, computed once for all states
    states = result_df.index.get_level_values('stateProvince')
    counties = result_df.index.get_level_values(1).astype(str).str.lower()

    for state, files in state_files.items():
        assigned_counties = _load_county_names(files['assigned'], 'assigned', state)
        not_assigned_counties = _load_county_names(files['not_assigned'], 'not-assigned', state)

        in_state = states == state
        is_assigned = in_state & counties.isin(list(assigned_counties))
        # 'assigned' takes precedence when a county appears in both files
        is_not_assigned = in_state & ~is_assigned & counties.isin(list(not_assigned_counties))

        result_df.loc[is_assigned, 'status'] = 'assigned'
        result_df.loc[is_not_assigned, 'status'] = 'not assigned'

    return result_df

In [35]:
# Add the assigned / not-assigned status to the summary of groups without coordinates.
county_summary_no_geo_status = update_county_status(county_summary_no_geo)

In [36]:
# Add the assigned / not-assigned status to the summary of groups with coordinates.
county_summary_w_geo_status = update_county_status(county_summary_w_geo)

In [37]:
# Export the no-coordinates county summary. The index is written on purpose: it holds
# stateProvince and county_normalized, the only columns identifying each row.
county_summary_no_geo_status.to_csv('TORCH_no_geo_county_summary.tsv', sep='\t')

In [38]:
# Export the with-coordinates county summary. The index is written on purpose: it holds
# stateProvince and county_normalized, the only columns identifying each row.
county_summary_w_geo_status.to_csv('TORCH_w_geo_county_summary.tsv', sep='\t')

In [39]:
# Export all records of the location groups that have coordinates (row labels omitted)
df_matches_w_geo.to_csv(
    'TORCH_w_geo.tsv',
    index=False,
    sep='\t',
)

In [40]:
# Texas only: subset of the georeferenced groups, exported without row labels
is_texas = df_matches_w_geo['stateProvince'] == 'Texas'
df_matches_tx_w_geo = df_matches_w_geo.loc[is_texas]
df_matches_tx_w_geo.to_csv('TORCH_TX_w_geo.tsv', index=False, sep='\t')

In [41]:
# Oklahoma only: subset of the georeferenced groups, exported without row labels
is_oklahoma = df_matches_w_geo['stateProvince'] == 'Oklahoma'
df_matches_ok_w_geo = df_matches_w_geo.loc[is_oklahoma]
df_matches_ok_w_geo.to_csv('TORCH_OK_w_geo.tsv', index=False, sep='\t')
