In [106]:
"""
#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings
"""

'\n#Loads a CSV DwC occurrence file that has been augmented with BELS locality strings\n'

In [107]:
import pandas as pd
import matplotlib

# switch to suppress CSV writing (which is slow)
write_csv = True

In [108]:
# BRIT UT colab
df_occ = pd.read_csv('torch_bels_BRIT_UT_locs.csv', low_memory=False, sep='\t')


In [109]:
# file name components to make writing CSVs dynamic and prevent overwrites
batch_prefix = 'TORCH'
batch_designator = 'BRIT_UT'

In [110]:
df_occ.shape

(513076, 96)

In [165]:
# Collections to include via institution code
BRIT_plus = ['BRIT','VDB','NLU','HSU','ACU','TAC','NTSC','TCSW','SMU']
UT_plus = ['TEX', 'LL']
BRIT_UT_all = BRIT_plus + UT_plus

In [112]:
# Counties for initial comparision
tx_transpecos_counties = ['Brewster','Jeff Davis','Presidio','El Paso','Val Verde','Culberson','Hudspeth','Terrell','Reeves','Winkler','Ward','Crockett','Loving','Crane','Upton']

In [113]:
# Drop irrelevant colums
# NOTE not all of these columns occur across all collections
#drop_columns= ['higherClassification','kingdom','phylum','class','order','identificationReferences','identificationRemarks','taxonRemarks','identificationQualifier','typeStatus','fieldNumber','eventID','informationWithheld','dataGeneralizations','dynamicProperties','associatedSequences','associatedTaxa','reproductiveCondition','establishmentMeans','lifeStage','sex','individualCount','samplingProtocol','preparations','continent','waterBody','islandGroup','island','rights','rightsHolder','accessRights','recordID','type','license','bibliographicCitation','datasetName','fieldNotes','countryCode','nomenclaturalCode','nomenclaturalStatus','associatedMedia','higherGeography','institutionID','georeferencedDate','datasetID','occurrenceStatus','verbatimLocality','organismID','previousIdentifications','eventTime','eventRemarks','locationAccordingTo','verbatimCoordinateSystem','footprintWKT','earliestEonOrLowestEonothem','earliestEraOrLowestErathem','earliestPeriodOrLowestSystem','earliestEpochOrLowestSeries','earliestAgeOrLowestStage','group','formation','member','identificationVerificationStatus','scientificNameID']

# more agressive drop
drop_columns= ['higherClassification','kingdom','phylum','class','order','identificationReferences',
               'identificationRemarks','taxonRemarks','identificationQualifier','typeStatus','fieldNumber',
               'eventID','informationWithheld','dataGeneralizations','dynamicProperties','associatedSequences',
               'associatedTaxa','reproductiveCondition','establishmentMeans','lifeStage','sex','individualCount',
               'samplingProtocol','preparations','continent','waterBody','islandGroup','island','rights','rightsHolder',
               'accessRights','recordID','type','license','bibliographicCitation','datasetName','fieldNotes','countryCode',
               'nomenclaturalCode','nomenclaturalStatus','associatedMedia','higherGeography','institutionID','georeferencedDate',
               'datasetID','occurrenceStatus','verbatimLocality','organismID','previousIdentifications','eventTime',
               'eventRemarks','locationAccordingTo','verbatimCoordinateSystem','footprintWKT','earliestEonOrLowestEonothem',
               'earliestEraOrLowestErathem','earliestPeriodOrLowestSystem','earliestEpochOrLowestSeries','earliestAgeOrLowestStage',
               'group','formation','member','identificationVerificationStatus','scientificNameID',
               'basisOfRecord','subgenus','specificEpithet','verbatimTaxonRank','infraspecificEpithet','taxonRank','identifiedBy',
               'dateIdentified','year','month','day','startDayOfYear','endDayOfYear'
              ]


df_occ = df_occ.drop(columns=drop_columns, errors='ignore')

In [114]:
df_occ.shape

(513076, 49)

In [115]:
# Find duplicate records
df_matches = df_occ[df_occ['bels_location_string'].duplicated(keep=False)]
#df_matches = df_denton[df_denton.bels_matchwithcoords

In [116]:
df_matches.shape

(327700, 49)

In [117]:
# Add location ID
# first create a copy to avoid copy of a slice of a DF
df_matches = df_matches.copy()
# https://stackoverflow.com/a/51110197 or https://stackoverflow.com/a/51110205
df_matches['loc_id'] = df_matches.groupby(['bels_location_string']).ngroup()

In [118]:
# Add dup loc count column for the size of each location cluster (number of duplicate locations)
#https://stackoverflow.com/a/46768694
df_matches['dup_loc_count'] = df_matches.groupby(['bels_location_string']).transform('size')

In [119]:
# Add a count of the number of records in each dup gruoup that have a geocoord
# https://www.statology.org/pandas-groupby-count-with-condition/
#df.groupby('var1')['var2'].apply(lambda x: (x=='val').sum()).reset_index(name='count')
#['decimalLatitude'].isna()
df_counts = df_matches.groupby('loc_id')['decimalLatitude'].apply(lambda x: (x.isna()).sum()).reset_index(name='count')

In [120]:
# With help from Claude.ai, a new approach to store the results into the original dataframe:
df_matches['loc_geo_count'] = df_matches.groupby('loc_id')['decimalLatitude'].transform(lambda x: (x.notna()).sum())

In [121]:
df_matches.shape

(327700, 52)

In [122]:
# count number of locations with no georef
df_zero_geo = df_matches[df_matches['loc_geo_count'] == 0]

In [123]:
df_zero_geo.shape

(153264, 52)

In [124]:
# number of unique locations with zero geo
#df_zero_geo['loc_id'].nunique()

In [125]:
# in Texas
df_zero_geo_tx = df_zero_geo[df_zero_geo['stateProvince'] == 'Texas']

In [126]:
df_zero_geo_tx.shape

(128698, 52)

In [127]:
if write_csv:
    filename = batch_prefix+'_'+batch_designator+'_zero_geo_tx.csv'
    df_matches.to_csv(filename)

In [128]:
# filter loc with insufficient locality string
# Drop records with only the state name in bels string
df_zero_geo_no_state_only = df_zero_geo_tx[~df_zero_geo_tx.apply(lambda row: row['stateProvince'].lower() == row['bels_location_string'].lower(), axis=1)]

# remove state + county matches
df_zero_geo_no_state_county_only = df_zero_geo_state_only[~df_zero_geo_state_only.apply(lambda row: (str(row['stateProvince']) + str(row['county'])).lower() == row['bels_location_string'].lower(), axis=1)]

df_zero_geo_no_state_county_only.shape

(124596, 52)

In [129]:
# TODO
# remove other strings that indicate low likelyhood of being able to georef
#texasnoadditionallocalitydataonsheet
#texastarrantnodata
#texasdentondentonconoadditionallocalitydataonsheet
#texashendersonnoadditionallocalitydataonsheet
# texashendersonhendersonconoadditionallocalitydataonsheet
no_locality_strings = ['noadditionallocalitydataonsheet', 'nodata']

# Drop rows where the column contains any of the patterns
df_zero_geo_tx_cleaned = df_zero_geo_no_state_county_only[~df_zero_geo_no_state_county_only['bels_location_string'].str.contains('|'.join(no_locality_strings), case=True, na=False)]
df_zero_geo_tx_cleaned.shape

(122131, 52)

In [130]:
# add counts by county
#df_nogeo_with_geolocdups_nostatecounty
county_summary = df_zero_geo_state_county_only.groupby(['stateProvince','county']).agg(
    total_location_recs=('loc_id', 'count'),
    #sum_id_count=('dup_loc_count', 'sum'),
    unique_locations=('loc_id', 'nunique'),
    #dup_loc_count=('dup_loc_count', 'min'),
    loc_geo_count=('loc_geo_count', 'max')
)

In [131]:
county_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,total_location_recs,unique_locations,loc_geo_count
stateProvince,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Texas,Anderson,1445,265,0
Texas,Andrews,146,44,0
Texas,Angelina,715,166,0
Texas,Aransas,941,207,0
Texas,Archer,232,43,0
Texas,...,...,...,...
Texas,Yoakum,18,5,0
Texas,Young,137,42,0
Texas,Zapata,246,72,0
Texas,Zavala,163,45,0


In [132]:
if write_csv:
    filename = batch_prefix+'_'+batch_designator+'_county_summary.csv'
    #df_matches.to_csv('TORCH_bels_matches_loc_id.csv')
    county_summary.to_csv(filename)

In [133]:
# add counts by location
loc_summary = df_zero_geo_state_county_only.groupby(['stateProvince','county', 'loc_id']).agg(
    dup_loc_count=('loc_id', 'count'),
    #sum_id_count=('dup_loc_count', 'sum'),
    #dup_loc_count=('dup_loc_count', 'max'),
    #loc_geo_count=('loc_geo_count', 'max')
)

In [134]:
loc_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dup_loc_count
stateProvince,county,loc_id,Unnamed: 3_level_1
Texas,Anderson,6811,4
Texas,Anderson,6812,2
Texas,Anderson,6813,4
Texas,Anderson,6814,2
Texas,Anderson,6815,4
Texas,...,...,...
Texas,Zavala,65426,3
Texas,Zavala,65427,2
Texas,Zavala,65428,2
Texas,Zavala,65429,3


In [135]:
if write_csv:
    filename = batch_prefix+'_'+batch_designator+'_loc_summary.csv'
    #df_matches.to_csv('TORCH_bels_matches_loc_id.csv')
    loc_summary.to_csv(filename)

In [136]:
# dups with goecoords
dup_loc_count_geo = df_zero_geo_tx_cleaned.pivot_table(index = ['bels_location_string'], aggfunc ='size')

In [137]:
dup_loc_count_geo

bels_location_string
texas1misofhillister                                                        3
texas5mileseastofnacogdochesonhwy7.                                         2
texas5minofjacksboro                                                        2
texas5thflieshle                                                            2
texasab                                                                    29
                                                                           ..
texaszavalaushighway83,nuecesriverbottom10milesnorthoflapryortexas          3
texaszavalaushighway83,nuecesriverbottom9milesnorthoflapryor                3
texaszavalaushighway83,nuecesriverbottom9milesnorthoflapryorinsandyloam     2
texaszavalaushwy83,11milesnorthofcrystalcity                                2
texaszavalauvaldecrystalcity                                                3
Length: 30421, dtype: int64

In [138]:
#TODO
#Load ids of elite georeferencers - EG
#df_georeferencers = pd.read_csv('TORCH-georeferencers_tested.csv', low_memory=False)

#find records with geo that have been done by EGs
#find records without geo that match those done by EGs

In [139]:
# find locations that match loc_id of those done by EG
#df_loc_match_by_egeo = df_matches[df_matches['loc_id'].isin(df_by_egeo['loc_id'])]

In [140]:
#df_loc_match_by_egeo.shape

In [141]:
county_summary_nogeo = df_zero_geo_tx_cleaned.groupby(['stateProvince','county']).agg(
    #total_locations=('loc_id', 'count'),
    unique_locations=('loc_id', 'nunique'),
    #sum_id_count=('dup_loc_count', 'sum')
    dup_count=('loc_id', 'count')
)
# Reset the index to make 'state' and 'county' regular columns
county_summary_nogeo = county_summary_nogeo.reset_index()
print("\nSummary with reset index:")
print(county_summary_nogeo)


Summary with reset index:
    stateProvince    county  unique_locations  dup_count
0           Texas  Anderson               264       1443
1           Texas   Andrews                44        146
2           Texas  Angelina               166        715
3           Texas   Aransas               207        941
4           Texas    Archer                42        230
..            ...       ...               ...        ...
259         Texas    Yoakum                 5         18
260         Texas     Young                41        135
261         Texas    Zapata                70        242
262         Texas    Zavala                45        163
263         Texas  mitchell                 1          1

[264 rows x 4 columns]


In [166]:
# BRIT collections only (including recent adoptions)
#df_torch_texas_brit_zero_geo = df_zero_geo_tx_cleaned[(df_zero_geo_tx_cleaned['institutionCode'] == 'BRIT') | (df_zero_geo_tx_cleaned['institutionCode'] == 'VDB')]
df_torch_texas_brit_zero_geo = df_zero_geo_tx_cleaned[(df_zero_geo_tx_cleaned['institutionCode'].isin(BRIT_plus))]


df_torch_texas_brit_zero_geo.shape


(62028, 52)

In [162]:
# UT collections only
#df_torch_texas_ut_zero_geo = df_zero_geo_tx_cleaned[(df_zero_geo_tx_cleaned['institutionCode'] == 'TEX') | (df_zero_geo_tx_cleaned['institutionCode'] == 'LL')]
df_torch_texas_ut_zero_geo = df_zero_geo_tx_cleaned[(df_zero_geo_tx_cleaned['institutionCode'].isin(UT_plus))]

df_torch_texas_ut_zero_geo.shape

(60103, 52)

In [167]:
# Summarize BRIT Texas records
brit_texas_county_summary_nogeo = df_torch_texas_brit_zero_geo.groupby(['stateProvince','county']).agg(
    #total_locations=('loc_id', 'count'),
    unique_locations=('loc_id', 'nunique'),
    #sum_id_count=('dup_loc_count', 'sum')
    loc_count=('loc_id', 'count'),
    largest_dups = ('dup_loc_count', 'max')
)
# Reset the index to make 'state' and 'county' regular columns
brit_texas_county_summary_nogeo = brit_texas_county_summary_nogeo.reset_index()
print("\nSummary with reset index:")
print(brit_texas_county_summary_nogeo)


Summary with reset index:
    stateProvince    county  unique_locations  loc_count  largest_dups
0           Texas  Anderson               186        718            77
1           Texas   Andrews                 3          4             4
2           Texas  Angelina               105        348            70
3           Texas   Aransas               130        463            53
4           Texas    Archer                38        174            44
..            ...       ...               ...        ...           ...
223         Texas      Wood               172        811            58
224         Texas     Young                35        110             9
225         Texas    Zapata                51        158            14
226         Texas    Zavala                28         85            16
227         Texas  mitchell                 1          1             2

[228 rows x 5 columns]


In [168]:
# export BRIT Texas summary
#brit_texas_county_summary_nogeo.to_csv('TORCH_brit_texas_county_summary_zerogeo.csv')
if write_csv:
    filename = batch_prefix+'_'+batch_designator+'_brit_texas_county_summary_nogeo.csv'
    #df_matches.to_csv('TORCH_bels_matches_loc_id.csv')
    brit_texas_county_summary_nogeo.to_csv(filename)

In [181]:
#county_summary_nogeo.to_csv('TORCH_county_summary_nogeo.csv')

In [182]:
# Sampling individual counties
#df_torch_texas_brit_nogeo
# df_torch_texas_brit_nogeo = df_torch_texas_nogeo_nostatecounty[(df_torch_texas_nogeo_nostatecounty['institutionCode'] == 'BRIT') | (df_torch_texas_nogeo_nostatecounty['institutionCode'] == 'VDB')]

df_brit_young_nogeo = df_torch_texas_brit_nogeo[df_torch_texas_brit_nogeo['county'] == 'Young']

df_brit_young_nogeo.shape



(99, 64)

In [169]:
# Sampling Texas regions
# Trans Pecos

df_torch_texas_transpecos_zero_geo = df_zero_geo_tx_cleaned[(df_zero_geo_tx_cleaned['county'].isin(tx_transpecos_counties))]


In [170]:
df_torch_texas_transpecos_zero_geo.shape

(21352, 52)

In [171]:
df_torch_texas_transpecos_BRIT_zero_geo = df_torch_texas_transpecos_zero_geo[(df_torch_texas_transpecos_zero_geo['institutionCode'].isin(BRIT_plus))]

In [172]:
df_torch_texas_transpecos_BRIT_zero_geo.shape

(7778, 52)

In [173]:
df_torch_texas_transpecos_UT_zero_geo = df_torch_texas_transpecos_zero_geo[(df_torch_texas_transpecos_zero_geo['institutionCode'].isin(UT_plus))]

In [174]:
df_torch_texas_transpecos_UT_zero_geo.shape

(13574, 52)

In [176]:
# unique loc in TP
#df['hID'].nunique()
df_torch_texas_transpecos_zero_geo['loc_id'].nunique()

5871

In [177]:
# Compare to total number of records held by BRIT and UT
df_transpecos_occ = df_occ[(df_occ['county'].isin(tx_transpecos_counties))]

In [178]:
df_transpecos_occ.shape

(68173, 49)