In [None]:
# Generated using Claude.ai
#https://claude.ai/chat/18fa6039-d306-449d-958b-08ad823fe977

In [12]:
import pandas as pd

def analyze_locations(df):
    """
    Assigns a numeric ID to every distinct location string and attaches
    per-location record counts.

    The input frame is not modified; the new columns are added to a new frame.

    Parameters:
    df (pd.DataFrame): DataFrame with 'bels_location_string' and
        'decimalLatitude' columns

    Returns:
    pd.DataFrame: The original columns plus
        - 'bels_location_id': ID of the row's location string, numbered in
          order of first appearance
        - 'coordinate_count': number of records in the location group with a
          non-null 'decimalLatitude'
        - 'bels_group_rec_count': total number of records in the location group

    Note:
    Rows whose 'bels_location_string' is missing belong to no group (groupby
    skips NaN keys by default) and are therefore absent from the inner-merged
    result.
    """
    # Number the distinct location strings in order of first appearance
    unique_locations = df['bels_location_string'].unique()
    location_to_id = {loc: idx for idx, loc in enumerate(unique_locations)}

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    located = df.assign(
        bels_location_id=df['bels_location_string'].map(location_to_id)
    )

    # Per-location stats: 'count' skips nulls, 'size' counts every row in the group
    location_stats = located.groupby('bels_location_string').agg(
        coordinate_count=('decimalLatitude', 'count'),
        bels_group_rec_count=('decimalLatitude', 'size'),
    )

    # location_stats is indexed by 'bels_location_string'; merge matches on that level
    return located.merge(location_stats, on='bels_location_string')

In [22]:
def summarize_locations_by_region_v1(df):
    """
    Generates location group statistics for each state/county combination.

    Parameters:
    df (pd.DataFrame): DataFrame with 'stateProvince', 'county',
        'bels_location_id' and 'coordinate_count' columns

    Returns:
    pd.DataFrame: One row per (stateProvince, county) with the columns
        'total_location_groups', 'groups_with_coordinates' and
        'groups_without_coordinates'
    """
    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        # One coordinate_count per location ID (first non-null value in the group)
        groups_with_coords = group.groupby('bels_location_id')['coordinate_count'].first()

        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum()
        })

    # Hand apply() only the columns get_stats reads. Applying to the whole frame
    # makes pandas operate on the grouping columns too, which is deprecated and
    # emits a warning.
    stat_columns = ['bels_location_id', 'coordinate_count']
    return df.groupby(['stateProvince', 'county'])[stat_columns].apply(get_stats).reset_index()

In [30]:
def summarize_locations_by_region_v2(df):
    """
    Generates location group statistics for each state/county combination.
    """
    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        groups_with_coords = group.groupby('bels_location_id')['coordinate_count'].first()
        
        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum()
        })
    
    return df.groupby(['stateProvince', 'county']).apply(get_stats, include_groups=False).reset_index()

In [33]:
def summarize_locations_by_region(df):
    """
    Generates location group statistics for each state/county combination.
    """
    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        groups_with_coords = group.groupby('bels_location_id')['coordinate_count'].first()
        total_records = len(group)
        
        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum(),
            'total_records': total_records
        })
    
    return df.groupby(['stateProvince', 'county']).apply(get_stats, include_groups=False).reset_index()

In [34]:
# Demo run: load the sample occurrences and enrich them with location stats
if __name__ == "__main__":
    # Tab-separated sample of occurrence records
    df_occ = pd.read_csv('torch_bels_locs_SAMPLE.tsv', sep='\t', low_memory=False)

    # Attach location IDs and per-location counts
    result = analyze_locations(df_occ)

    # Lift the column cap so printed frames show every column
    pd.set_option('display.max_columns', None)
    print("\nProcessed DataFrame with coordinate counts:", result, sep='\n')



Processed DataFrame with coordinate counts:
      Unnamed: 0                                    id institutionCode  \
0         890363                              32169803             CSU   
1         123678                              26792115            BRIT   
2         419345                               8774006            SRSC   
3          80310                              24845952            BRIT   
4         147243                              26816060             SMU   
...          ...                                   ...             ...   
9995      718090  e1bd18a5-13a0-4cfc-8086-a2dd72cf525a             TEX   
9996      100960                              24932632            BRIT   
9997      574179                              26647152             OKL   
9998      643238                              26591222             OKL   
9999      669058  c989068e-cf28-4b58-a0c2-9b7613367b56             TEX   

     collectionCode ownerInstitutionCode  \
0               NaN   

In [35]:
# Render the enriched frame using the notebook's rich display.
result
# Optional alternative view, left disabled: sort the records by
# 'coordinate_count', highest first, and display that instead.
#df_sorted = result.sort_values(by=['coordinate_count'], ascending=False)
#df_sorted

Unnamed: 0.1,Unnamed: 0,id,institutionCode,collectionCode,ownerInstitutionCode,collectionID,basisOfRecord,occurrenceID,catalogNumber,otherCatalogNumbers,family,scientificName,taxonID,scientificNameAuthorship,genus,subgenus,specificEpithet,verbatimTaxonRank,infraspecificEpithet,taxonRank,identifiedBy,dateIdentified,recordedBy,recordNumber,eventDate,year,month,day,startDayOfYear,endDayOfYear,verbatimEventDate,occurrenceRemarks,habitat,associatedOccurrences,locationID,country,stateProvince,county,municipality,locality,locationRemarks,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,verbatimCoordinates,georeferencedBy,georeferenceProtocol,georeferenceSources,georeferenceVerificationStatus,georeferenceRemarks,minimumElevationInMeters,maximumElevationInMeters,minimumDepthInMeters,maximumDepthInMeters,verbatimDepth,verbatimElevation,disposition,language,recordEnteredBy,modified,references,bels_location_string,bels_location_id,coordinate_count,bels_group_rec_count
0,890363,32169803,CSU,,,871cbe2c-c361-4cdb-9c3a-75a7a4ce5669,PreservedSpecimen,385fb31a-fc8c-11e7-8e1c-e3cf2f9a05b7,,CSU5504,Lamiaceae,Physostegia angustifolia,95415.0,Fernald,Physostegia,,angustifolia,,,Species,,,M. Hamilton,78 (2of2),1994-06-18,1994.0,6.0,18.0,169.0,,,terminal spike loosely flowered; white petals,prairie habitat,,,United States,Oklahoma,Rogers,,Ramona; 6 mi E of Ramona on road 403 between r...,,36.371597,-95.604379,,5000.0,,,,,False,Needs to be georeferenced. Coordinates based o...,,,,,,,,,VCK (OVPD),2021-03-30 00:00:00,https://portal.torcherbaria.org/portal/collect...,oklahomarogersramona6mieoframonaonroad403betwe...,0,1,1
1,123678,26792115,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,40939571-993a-456e-964c-0f39a2f6f092,BRIT413097,,Poaceae,Erioneuron pulchellum,8502.0,(Kunth) Tateoka,Erioneuron,,pulchellum,,,Species,,,Toney M. Keeney,9043,1989-05-06,1989.0,5.0,6.0,126.0,,6 May 1989,,,,,United States,Texas,Kinney,,On the west slopes of the mountain east of the...,,,,,,,,,,,,,,,,,,nfn:2021-12-07,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texaskinneyonthewestslopesofthemountaineastoft...,1,0,1
2,419345,8774006,SRSC,,,b18db63b-13de-4aa8-b16a-615f312e66df,PreservedSpecimen,3b049768-988f-4b3b-9719-3ba6980f956d,00001874,,Euphorbiaceae,Euphorbia cinerascens,45590.0,Engelm. in Torr.,Euphorbia,,cinerascens,,,Species,,,,,,,,,,,,,,,,USA,Texas,Brewster,,,,,,,,,,,,,,,,,,,,,,,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texasbrewster,2,0,110
3,80310,24845952,BRIT,SMU,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,b3cd1a98-bf27-45d1-a7f7-99cc39838817,BRIT360272,,Boraginaceae,Nemophila phacelioides,92516.0,Nutt.,Nemophila,,phacelioides,,,Species,,,Fred B. Jones,1735A,1958-04-04,1958.0,4.0,4.0,94.0,,1958-4-4,In semi-shade. Rare.; Flowers light violet. St...,Sandy loam or alluvial soil.,,,United States,Texas,San Patricio,,About 10 miles north-east of Sinton on Welder ...,,,,,,,,,,,,,,,,,,,,TEX4BRITleg2572,2020-07-15 10:57:06,https://portal.torcherbaria.org/portal/collect...,texassanpatricioabout10milesnortheastofsintono...,3,0,1
4,147243,26816060,SMU,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,de72884c-4b98-451d-b977-52043bb0d77c,BRIT411756,,Poaceae,Briza minor,643.0,L.,Briza,,minor,,,Species,,,J. F. Hennen,393,1949-05-15,1949.0,5.0,15.0,135.0,,"May 15, 1949",,,herbariumSpecimenDuplicate: https://portal.tor...,,United States,Texas,Smith,,Tyler State Park (10 miles north of Tyler); at...,,,,,,,,,,,,,,,,,,TORCH-NfN-30,,,2024-10-29 08:00:38,https://portal.torcherbaria.org/portal/collect...,texassmithtylerstatepark10milesnorthoftylerate...,4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,718090,e1bd18a5-13a0-4cfc-8086-a2dd72cf525a,TEX,TEX,,,PreservedSpecimen,e1bd18a5-13a0-4cfc-8086-a2dd72cf525a,TEX00296496,,Amaranthaceae,Atriplex matamorensis,,,Atriplex,,matamorensis,,,Species,"Turner, Billie Lee",2000,Robert Runyon,1528,1933-05-01,1933.0,5.0,1.0,,,,,"Erect straight herb, leafy and pale green. Soi...",,,United States,Texas,Cameron,,On clay dune near coast.,,,,,,,,,,,,,,,,,,,,,,,texascamerononclaydunenearcoast,8379,0,1
9996,100960,24932632,BRIT,BRIT,,fea81a47-2365-45cc-bef9-b6bbff7457e6,PreservedSpecimen,543656ef-e2a2-4d08-86ec-fcd645af7edf,BRIT388547,,Polygalaceae,Polygala incarnata,96229.0,L.,Polygala,,incarnata,,,Species,,,J. Reverchon,1906,1900-06-13,1900.0,6.0,13.0,164.0,,13 June 1900,,Sands,,,United States,Texas,Wood,,Mineola,,,,,,,,,,,,,,,,,,nfn:britjb,,preprocessed,2023-07-16 07:02:03,https://portal.torcherbaria.org/portal/collect...,texaswoodmineola,4510,0,2
9997,574179,26647152,OKL,,OKL,ee375191-fabb-4bc6-8180-ede944cd1edf,PreservedSpecimen,177737fc-6915-4d37-9cd1-faeee5ef1c51,,OKL250893,Poaceae,Pascopyrum smithii,1665.0,(Rydb.) Barkworth & D. R. Dewey,Pascopyrum,,smithii,,,Species,C. A. Morse,2001-05-30,C. A. Morse; C. C. Freeman,6159,2001-05-30,2001.0,5.0,30.0,150.0,,,,"Eroded, rocky slopes on divide between Broken ...",herbariumSpecimenDuplicate: https://portal.tor...,,United States,Oklahoma,Roger Mills,,"Cheyenne; 2.75 mi S, 0.75 mi W of Cheyenne, Th...",,35.588300,-99.793400,,1609.0,13N 24W 19 S/2 of SW/4; T13N R24W sec19.0 S/2 ...,,,,False,Coordinates based on TRS centroid,,,,,,,,,JAM,2010-01-20 00:00:00,https://portal.torcherbaria.org/portal/collect...,oklahomarogermillscheyenne2.75mis0.75miwofchey...,7296,2,2
9998,643238,26591222,OKL,,OKL,ee375191-fabb-4bc6-8180-ede944cd1edf,PreservedSpecimen,03e521e3-df6f-4a3a-ab44-d1c560335a27,,DA-OKW-225,Asteraceae,Arctium minus,2768.0,(Hill) Bernh.,Arctium,,minus,,,Species,Amy Buthod,2017-07-12,Amy Buthod; Bruce Hoagland,OKW-225,2017-07-12,2017.0,7.0,12.0,193.0,,,,,,,United States,Oklahoma,Johnston,,The Nature Conservancy's Oka' Yanahli Preserve...,,34.441200,-96.644600,,1609.0,1S 6E 26; T1S R6E sec26.0,,,,False,Coordinates based on TRS centroid,,,,,,,,,TDF,2019-08-22 00:00:00,https://portal.torcherbaria.org/portal/collect...,oklahomajohnstonthenatureconservancy'soka'yana...,5008,1,2


In [36]:
# Aggregate the enriched records into one summary row per state/county.
df_summary = summarize_locations_by_region(result)

In [37]:
# Display the per-region summary table.
df_summary

Unnamed: 0,stateProvince,county,total_location_groups,groups_with_coordinates,groups_without_coordinates,total_records
0,Oklahoma,Adair,35,16,19,38
1,Oklahoma,Alfalfa,11,7,4,11
2,Oklahoma,Atoka,37,18,19,41
3,Oklahoma,Beaver,10,8,2,10
4,Oklahoma,Beckham,9,5,4,13
...,...,...,...,...,...,...
393,Texas,Yoakum,2,0,2,3
394,Texas,Young,3,2,1,3
395,Texas,Zapata,16,7,9,17
396,Texas,Zavala,4,1,3,4
