In [19]:
# Generated using Claude.ai
#https://claude.ai/chat/18fa6039-d306-449d-958b-08ad823fe977

In [20]:
from pathlib import Path

import pandas as pd

# Directory that holds the input file, and the name of the tab-separated file
# read from it by the example-usage cell.
# NOTE(review): data_path is an absolute, machine-specific location — adjust it
# when running this notebook on another machine.
data_path = '/mnt/DATA3-4TB/BRIT_git/TORCH_georeferencing/data/TORCH-data_snapshots_TX_OK_2024-12-06/'
input_file = 'torch_bels_locs_SAMPLE.tsv' # stored in data_path

def analyze_locations_v1(df):
    """
    Creates location IDs and analyzes location groups.

    The caller's DataFrame is not modified; new columns are added to a copy.

    Parameters:
    df (pd.DataFrame): DataFrame with 'bels_location_string' and
        'decimalLatitude' columns

    Returns:
    pd.DataFrame: Copy of ``df`` with these columns appended:
        bels_location_id     - integer ID per distinct location string,
                               numbered in order of first appearance
        coordinate_count     - number of records in the row's location group
                               with a non-null 'decimalLatitude'
        bels_group_rec_count - number of records in the row's location group

    Note: rows whose 'bels_location_string' is missing do not appear in the
    result, because groupby skips missing keys and the inner merge then has
    nothing to match them against.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    out = df.copy()

    # Create location IDs
    unique_locations = out['bels_location_string'].unique()
    location_to_id = {loc: idx for idx, loc in enumerate(unique_locations)}
    out['bels_location_id'] = out['bels_location_string'].map(location_to_id)

    # Calculate stats per location with built-in aggregations:
    # count() ignores nulls, size() counts every record in the group
    per_location = out.groupby('bels_location_string')['decimalLatitude']
    location_stats = (
        pd.DataFrame({
            'coordinate_count': per_location.count(),
            'bels_group_rec_count': per_location.size(),
        })
        .rename_axis('bels_location_string')
        .reset_index()
    )

    # Merge stats back onto the copied dataframe
    return out.merge(location_stats, on='bels_location_string')

In [21]:
def analyze_locations_v2(df):
    """
    Creates location IDs, coordinate pair IDs, and analyzes location groups.

    The caller's DataFrame is not modified; new columns are added to a copy
    and no temporary column is left behind.

    Parameters:
    df (pd.DataFrame): DataFrame with 'bels_location_string',
        'decimalLongitude' and 'decimalLatitude' columns

    Returns:
    pd.DataFrame: Copy of ``df`` with these columns appended:
        bels_location_id     - integer ID per distinct location string,
                               numbered in order of first appearance
        coordinate_pair_id   - ID per distinct (longitude, latitude) pair,
                               numbered in order of first appearance; NaN
                               when either coordinate is missing
        coordinate_count     - number of records in the row's location group
                               with a non-null 'decimalLatitude'
        bels_group_rec_count - number of records in the row's location group

    Note: rows whose 'bels_location_string' is missing do not appear in the
    result, because groupby skips missing keys and the inner merge then has
    nothing to match them against.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    out = df.copy()

    # Create location IDs
    unique_locations = out['bels_location_string'].unique()
    location_to_id = {loc: idx for idx, loc in enumerate(unique_locations)}
    out['bels_location_id'] = out['bels_location_string'].map(location_to_id)

    # Build (longitude, latitude) keys from the two columns only; this avoids
    # a row-wise apply that materialises every full row of a wide frame.
    lon = out['decimalLongitude']
    lat = out['decimalLatitude']
    has_both = lon.notna() & lat.notna()
    coord_pair = pd.Series(
        [(x, y) if ok else None for x, y, ok in zip(lon, lat, has_both)],
        index=out.index,
        dtype=object,
    )

    # factorize numbers distinct pairs in order of first appearance and
    # returns -1 for missing entries; turn those -1 codes into NaN
    pair_codes, _ = pd.factorize(coord_pair)
    pair_id = pd.Series(pair_codes, index=out.index)
    out['coordinate_pair_id'] = pair_id.where(pair_id >= 0)

    # Calculate stats per location with built-in aggregations:
    # count() ignores nulls, size() counts every record in the group
    per_location = out.groupby('bels_location_string')['decimalLatitude']
    location_stats = (
        pd.DataFrame({
            'coordinate_count': per_location.count(),
            'bels_group_rec_count': per_location.size(),
        })
        .rename_axis('bels_location_string')
        .reset_index()
    )

    # Merge stats back onto the copied dataframe
    return out.merge(location_stats, on='bels_location_string')

In [37]:
def analyze_locations(df):
    """
    Creates location IDs, coordinate pair IDs, and analyzes location groups.
    Includes counts of matching coordinate pairs.

    The caller's DataFrame is not modified; new columns are added to a copy
    and no temporary column is left behind.

    Parameters:
    df (pd.DataFrame): DataFrame with 'bels_location_string',
        'decimalLongitude' and 'decimalLatitude' columns

    Returns:
    pd.DataFrame: Copy of ``df`` with these columns appended, in this order:
        bels_location_id       - integer ID per distinct location string,
                                 numbered in order of first appearance
        coordinate_pair_id     - Int64 ID per distinct (longitude, latitude)
                                 pair, numbered in order of first appearance;
                                 <NA> when either coordinate is missing
        coord_match_count      - Int64 number of records in the whole frame
                                 sharing the row's coordinate pair; <NA> when
                                 either coordinate is missing
        bels_group_coord_count - number of records in the row's location
                                 group with a non-null 'decimalLatitude'
        bels_group_rec_count   - number of records in the row's location group

    Note: rows whose 'bels_location_string' is missing do not appear in the
    result, because groupby skips missing keys and the inner merge then has
    nothing to match them against.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    out = df.copy()

    # Create location IDs
    unique_locations = out['bels_location_string'].unique()
    location_to_id = {loc: idx for idx, loc in enumerate(unique_locations)}
    out['bels_location_id'] = out['bels_location_string'].map(location_to_id)

    # Build (longitude, latitude) keys from the two columns only; this avoids
    # a row-wise apply that materialises every full row of a wide frame.
    lon = out['decimalLongitude']
    lat = out['decimalLatitude']
    has_both = lon.notna() & lat.notna()
    coord_pair = pd.Series(
        [(x, y) if ok else None for x, y, ok in zip(lon, lat, has_both)],
        index=out.index,
        dtype=object,
    )

    # factorize numbers distinct pairs in order of first appearance and
    # returns -1 for missing entries; those become <NA> in the nullable column
    pair_codes, _ = pd.factorize(coord_pair)
    pair_id = pd.Series(pair_codes, index=out.index)
    has_pair = pair_id >= 0
    out['coordinate_pair_id'] = pair_id.where(has_pair).astype('Int64')

    # Number of records sharing each pair; the -1 code is absent from the
    # lookup, so rows without a pair map to <NA>
    pair_counts = pair_id[has_pair].value_counts()
    out['coord_match_count'] = pair_id.map(pair_counts).astype('Int64')

    # Calculate stats per location with built-in aggregations:
    # count() ignores nulls, size() counts every record in the group
    per_location = out.groupby('bels_location_string')['decimalLatitude']
    location_stats = (
        pd.DataFrame({
            'bels_group_coord_count': per_location.count(),
            'bels_group_rec_count': per_location.size(),
        })
        .rename_axis('bels_location_string')
        .reset_index()
    )

    # Merge stats back onto the copied dataframe
    return out.merge(location_stats, on='bels_location_string')

In [38]:
def summarize_locations_by_region_v1(df):
    """
    Generates location group statistics for each state/county combination.

    Only the columns the statistics need are selected before ``apply``, so
    the function never operates on the grouping columns (behaviour that
    pandas has deprecated for ``DataFrameGroupBy.apply``).

    Parameters:
    df (pd.DataFrame): DataFrame with 'stateProvince', 'county',
        'bels_location_id' and 'coordinate_count' columns

    Returns:
    pd.DataFrame: One row per state/county combination with columns
        'stateProvince', 'county', 'total_location_groups',
        'groups_with_coordinates' and 'groups_without_coordinates'
    """
    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        # One coordinate count per location group present in this county
        groups_with_coords = group.groupby('bels_location_id')['coordinate_count'].first()

        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum()
        })

    needed_columns = ['bels_location_id', 'coordinate_count']
    return (
        df.groupby(['stateProvince', 'county'])[needed_columns]
        .apply(get_stats)
        .reset_index()
    )

In [23]:
def summarize_locations_by_region_v2(df):
    """
    Generates location group statistics for each state/county combination.

    The columns the statistics need are selected explicitly before ``apply``.
    That keeps the grouping columns out of the applied function without
    relying on the ``include_groups`` keyword, and avoids handing every
    column of the frame to each group.

    Parameters:
    df (pd.DataFrame): DataFrame with 'stateProvince', 'county',
        'bels_location_id' and 'coordinate_count' columns

    Returns:
    pd.DataFrame: One row per state/county combination with columns
        'stateProvince', 'county', 'total_location_groups',
        'groups_with_coordinates' and 'groups_without_coordinates'
    """
    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        # One coordinate count per location group present in this county
        groups_with_coords = group.groupby('bels_location_id')['coordinate_count'].first()

        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum()
        })

    needed_columns = ['bels_location_id', 'coordinate_count']
    return (
        df.groupby(['stateProvince', 'county'])[needed_columns]
        .apply(get_stats)
        .reset_index()
    )

In [24]:
def summarize_locations_by_region(df):
    """
    Generates location group and record statistics for each state/county
    combination.

    The per-location coordinate count is read from 'bels_group_coord_count'
    (the name produced by analyze_locations) or, if that column is absent,
    from 'coordinate_count' (the name produced by the earlier _v1/_v2
    versions).

    Parameters:
    df (pd.DataFrame): DataFrame with 'stateProvince', 'county',
        'bels_location_id', 'decimalLatitude' and one of the coordinate
        count columns named above

    Returns:
    pd.DataFrame: One row per state/county combination with columns
        'stateProvince', 'county', 'total_location_groups',
        'groups_with_coordinates', 'groups_without_coordinates',
        'total_records', 'records_with_coordinates' and
        'records_without_coordinates'

    Raises:
    KeyError: if neither coordinate count column is present
    """
    # Accept both the current and the legacy column name so the summary works
    # on the output of every analyze_locations version in this notebook
    coord_count_col = next(
        (name for name in ('bels_group_coord_count', 'coordinate_count')
         if name in df.columns),
        None,
    )
    if coord_count_col is None:
        raise KeyError(
            "Column not found: expected 'bels_group_coord_count' or 'coordinate_count'"
        )

    def get_stats(group):
        total_groups = group['bels_location_id'].nunique()
        # One coordinate count per location group present in this county
        groups_with_coords = group.groupby('bels_location_id')[coord_count_col].first()
        total_records = len(group)
        records_with_coords = group['decimalLatitude'].notna().sum()

        return pd.Series({
            'total_location_groups': total_groups,
            'groups_with_coordinates': (groups_with_coords > 0).sum(),
            'groups_without_coordinates': (groups_with_coords == 0).sum(),
            'total_records': total_records,
            'records_with_coordinates': records_with_coords,
            'records_without_coordinates': total_records - records_with_coords
        })

    # Selecting the needed columns keeps the grouping columns out of
    # get_stats without relying on the include_groups keyword
    needed_columns = ['bels_location_id', coord_count_col, 'decimalLatitude']
    return (
        df.groupby(['stateProvince', 'county'])[needed_columns]
        .apply(get_stats)
        .reset_index()
    )

In [39]:
# Example usage
if __name__ == "__main__":
    # Locate and load the tab-separated input file
    input_path = Path(data_path).joinpath(input_file)
    df_occ = pd.read_csv(input_path, sep='\t', low_memory=False)

    # Add location IDs, coordinate pair IDs and the per-group counts
    result = analyze_locations(df_occ)

    # Lift the column display limit so the printed frame shows every column
    pd.set_option('display.max_columns', None)
    print("\nProcessed DataFrame with coordinate counts:")
    print(result)



Processed DataFrame with coordinate counts:
             id institutionCode collectionCode ownerInstitutionCode  \
0      14218975            BRIT           BRIT                  NaN   
1      14218976            BRIT           BRIT                  NaN   
2      14218977            BRIT           BRIT                  NaN   
3      14218978            BRIT           BRIT                  NaN   
4      14218979            BRIT           BRIT                  NaN   
...         ...             ...            ...                  ...   
99994  24931667            BRIT           BRIT                  NaN   
99995  24931668            BRIT           BRIT                  NaN   
99996  24931669            BRIT           BRIT                  NaN   
99997  24931670            BRIT            SMU                  NaN   
99998  24931671            BRIT            SMU                  NaN   

                               collectionID      basisOfRecord  \
0      fea81a47-2365-45cc-bef9-b6bbf

In [36]:
# (rows, columns) of the processed frame
result.shape
# NOTE(review): the commented-out sort below uses 'coordinate_count', the name
# produced by analyze_locations_v1/_v2; analyze_locations names that column
# 'bels_group_coord_count'.
#df_sorted = result.sort_values(by=['coordinate_count'], ascending=False)
#df_sorted

(99999, 124)

In [28]:
# Per state/county counts of location groups and records with/without coordinates
df_summary = summarize_locations_by_region(result)

In [29]:
# Display the state/county summary table
df_summary

Unnamed: 0,stateProvince,county,total_location_groups,groups_with_coordinates,groups_without_coordinates,total_records,records_with_coordinates,records_without_coordinates
0,Oklahoma,Adair,19,4,15,27,4,23
1,Oklahoma,Alfalfa,1,1,0,1,1,0
2,Oklahoma,Atoka,47,2,45,50,2,48
3,Oklahoma,Beaver,7,6,1,8,7,1
4,Oklahoma,Beckham,4,1,3,4,1,3
...,...,...,...,...,...,...,...,...
322,Texas,Yoakum,7,7,0,8,8,0
323,Texas,Young,78,75,3,143,138,5
324,Texas,Zapata,191,11,180,319,13,306
325,Texas,Zavala,60,0,60,98,0,98


In [33]:
# Print the first rows of the processed frame as plain text
print(result.head())

         id institutionCode collectionCode ownerInstitutionCode  \
0  14218975            BRIT           BRIT                  NaN   
1  14218976            BRIT           BRIT                  NaN   
2  14218977            BRIT           BRIT                  NaN   
3  14218978            BRIT           BRIT                  NaN   
4  14218979            BRIT           BRIT                  NaN   

                           collectionID      basisOfRecord  \
0  fea81a47-2365-45cc-bef9-b6bbff7457e6  PreservedSpecimen   
1  fea81a47-2365-45cc-bef9-b6bbff7457e6  PreservedSpecimen   
2  fea81a47-2365-45cc-bef9-b6bbff7457e6  PreservedSpecimen   
3  fea81a47-2365-45cc-bef9-b6bbff7457e6  PreservedSpecimen   
4  fea81a47-2365-45cc-bef9-b6bbff7457e6  PreservedSpecimen   

                           occurrenceID catalogNumber otherCatalogNumbers  \
0  ce2d9b58-9322-479a-b94a-97bf9afde740    BRIT122345                 NaN   
1  58730970-7f7d-4a7d-8968-14f8658971a1    BRIT122346                 