In [1]:
import requests
import pandas as pd

def get_gbif_occurrences(species_name, limit=300, max_records=200000):
    """
    Download the occurrences of a species from the GBIF occurrence search API.

    Pages through ``/v1/occurrence/search`` (only records with coordinates,
    ``hasCoordinate=true``) until the API reports the end of the result set
    or ``max_records`` rows have been collected.

    Parameters
    ----------
    species_name : str
        Scientific name to search for. It is passed as a query parameter so
        that spaces and special characters are URL-encoded by ``requests``.
    limit : int, default 300
        Page size requested on every call. Values above 300 are clamped to
        300, the maximum page size GBIF documents for this endpoint.
    max_records : int, default 200000
        Upper bound on the number of rows returned.

    Returns
    -------
    pandas.DataFrame
        One row per occurrence record, with at most ``max_records`` rows.

    Raises
    ------
    ValueError
        If ``limit`` is smaller than 1.
    requests.HTTPError
        If GBIF answers with an HTTP error status.

    Notes
    -----
    NOTE(review): GBIF documents a cap on ``offset + limit`` for the search
    API (100,000 records); larger result sets are expected to go through the
    asynchronous download API instead -- confirm before raising max_records.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    gbif_max_page_size = 300
    # The page size actually sent must match the one used for paging below,
    # otherwise records are skipped or the loop stops early.
    page_size = min(int(limit), gbif_max_page_size)
    base_url = "https://api.gbif.org/v1/occurrence/search"

    offset = 0
    all_records = []

    print("Descargando ocurrencias desde GBIF...")

    while True:
        params = {
            "scientificName": species_name,
            "hasCoordinate": "true",
            "limit": page_size,
            "offset": offset,
        }

        # A timeout keeps a stalled connection from hanging the kernel;
        # raise_for_status surfaces HTTP errors instead of parsing an error body.
        r = requests.get(base_url, params=params, timeout=60)
        r.raise_for_status()
        data = r.json()

        batch = data.get("results", [])
        all_records.extend(batch)

        print(f"Descargados: {len(all_records)} registros", end="\r")

        # Stop when there is no more data: trust the API's own flag when it is
        # present, otherwise fall back to the "short page" heuristic.
        end_of_records = data.get("endOfRecords")
        if end_of_records is None:
            end_of_records = len(batch) < page_size
        if not batch or end_of_records:
            break

        # Stop once the requested amount has been reached.
        if len(all_records) >= max_records:
            print("\n⚠️ Se alcanzó max_records.")
            break

        # Advance by what was actually received, not by what was asked for.
        offset += len(batch)

    # The last page may overshoot max_records; trim so the bound is honoured.
    del all_records[max(max_records, 0):]

    print(f"\nTotal descargado: {len(all_records)} registros")
    return pd.DataFrame(all_records)

In [2]:
# -------------------------
# USAGE
# -------------------------

# Fetch the occurrence records for the target species.
species = "Pycnoporus sanguineus"
df_raw = get_gbif_occurrences(species)

# Show which fields came back, then the number of rows retrieved.
print("\nColumnas disponibles:", df_raw.columns, sep="\n")

print(f"\nFilas totales: {len(df_raw)}")

Descargando ocurrencias desde GBIF...
Descargados: 3849 registros
Total descargado: 3849 registros

Columnas disponibles:
Index(['key', 'datasetKey', 'publishingOrgKey', 'installationKey',
       'hostingOrganizationKey', 'publishingCountry', 'protocol',
       'lastCrawled', 'lastParsed', 'crawlId',
       ...
       'http://unknown.org/zone', 'sex', 'nomenclaturalStatus',
       'infraspecificEpithet', 'waterBody', 'associatedOccurrences',
       'dataGeneralizations', 'http://unknown.org/rightsHolder',
       'http://unknown.org/license', 'http://unknown.org/orders'],
      dtype='object', length=197)

Filas totales: 3849


In [4]:
# Columns retained from the raw GBIF frame, in the order they should appear.
keep_cols = [
    # name and coordinates
    "scientificName",
    "decimalLatitude",
    "decimalLongitude",
    "coordinateUncertaintyInMeters",
    # coarse location
    "continent",
    "stateProvince",
    # date fields
    "year",
    "month",
    "day",
    "eventDate",
    "startDayOfYear",
    "endDayOfYear",
    # record flags and processing timestamps
    "issues",
    "modified",
    "lastInterpreted",
    "isSequenced",
    "isInCluster",
    # finer-grained location
    "countryCode",
    "gbifRegion",
    "country",
    "county",
    "municipality",
    "locality",
    # provenance and identification status
    "institutionCode",
    "datasetName",
    "identificationVerificationStatus",
]

In [5]:
# Keep only the columns of interest. The explicit .copy() makes df_gbif an
# independent frame, so later in-place cleaning neither triggers
# SettingWithCopyWarning nor depends on df_raw.
df_gbif = df_raw[keep_cols].copy()

In [6]:
# Tally the scientific names returned; the ones that do not belong should be removed.
df_gbif["scientificName"].value_counts()

scientificName
Pycnoporus sanguineus (L.) Murrill                2938
Polyporus sanguineus (L.) Fr.                      646
Polystictus sanguineus (L.) G.Mey.                 183
Fabisporus sanguineus (L.) Zmitr.                   37
Coriolus sanguineus (L.) G.Cunn.                    18
SH1122470.09FU                                      12
Trametes sanguinea (L.) Imazeki, 1943                8
Trametes sanguinea (L.) Lloyd                        4
Trametes cinnabarina var. sanguinea (L.) Pilát       1
Microporus sanguineus (L.) Pat.                      1
Boletus sanguineus L.                                1
Name: count, dtype: int64

In [7]:
# Some records have very large uncertainty values;
# they could be filtered out with a threshold.
df_gbif["coordinateUncertaintyInMeters"].describe()

count    1.825000e+03
mean     1.545798e+04
std      1.052955e+05
min      2.000000e+00
25%      9.000000e+01
50%      1.000000e+03
75%      6.562000e+03
max      2.765311e+06
Name: coordinateUncertaintyInMeters, dtype: float64

In [8]:
# Number of records per continent value.
df_gbif["continent"].value_counts()

continent
NORTH_AMERICA    1989
SOUTH_AMERICA     932
OCEANIA           385
AFRICA            358
ASIA              146
EUROPE             13
Name: count, dtype: int64

In [9]:
# Number of records per country value.
df_gbif["country"].value_counts()

country
Mexico                      922
United States of America    713
Brazil                      646
Australia                   306
South Africa                148
                           ... 
Virgin Islands (British)      1
Bangladesh                    1
Belgium                       1
Saint Barthélemy              1
Tokelau                       1
Name: count, Length: 98, dtype: int64

In [10]:
# Subset of records whose country is Argentina. The explicit .copy() gives an
# independent frame, so later column clean-up on this subset does not trigger
# SettingWithCopyWarning.
df_gbif_arg = df_gbif[df_gbif["country"] == "Argentina"].copy()

In [11]:
# How many Argentine records lack a stateProvince value.
df_gbif_arg["stateProvince"].isna().sum()

np.int64(9)

In [12]:
# Number of Argentine records per stateProvince value.
df_gbif_arg["stateProvince"].value_counts()

stateProvince
Córdoba       14
Salta          5
Jujuy          3
Cordoba        2
Tucuman        1
Entre Rios     1
Entre Ríos     1
Name: count, dtype: int64

In [13]:
# How many Argentine records lack a locality value.
df_gbif_arg["locality"].isna().sum()

np.int64(2)

In [14]:
# locality is not usable for the Argentine subset.
df_gbif_arg["locality"].value_counts()

locality
Loma Bola.                                                                           2
Quebrada del Condorito                                                               2
Los Baldes                                                                           2
Concepcion del Uruguay                                                               2
Tucuman                                                                              2
Pampa de Achala, Parque Nac. Quebrada del Condorito                                  1
Parque Nac. El Rey                                                                   1
<p>PN Calilegua</p>                                                                  1
PARQUE NACIONAL CALILEGUA                                                            1
Santa Barbara range | 80 km S of San Pedro de Jujuy                                  1
Cerro Colorado                                                                       1
Parque Nac. El Rey- Sendero las Ch

In [15]:
# Number of Argentine records per county value.
df_gbif_arg["county"].value_counts()

county
Anta           2
Capital        1
Trancas        1
Punila         1
Ledesma        1
Prov. Jujuy    1
Prov. Salta    1
Federación     1
Name: count, dtype: int64

In [16]:
# The eventDate values are not all in the same format.
df_gbif["eventDate"]

0             2025-01-27
1       2025-02-12T00:00
2             2025-03-08
3             2025-03-11
4             2025-04-08
              ...       
3844                 NaN
3845                 NaN
3846                 NaN
3847                 NaN
3848                 NaN
Name: eventDate, Length: 3849, dtype: object

In [17]:
# Check what these issue flags are and whether any of them should be dropped.
df_gbif["issues"].value_counts()

issues
[GEODETIC_DATUM_ASSUMED_WGS84, CONTINENT_DERIVED_FROM_COORDINATES, TAXON_ID_NOT_FOUND, INSTITUTION_MATCH_FUZZY]                                                                                                   529
[CONTINENT_DERIVED_FROM_COORDINATES]                                                                                                                                                                              241
[COORDINATE_ROUNDED, CONTINENT_DERIVED_FROM_COORDINATES, SCIENTIFIC_NAME_ID_NOT_FOUND, TAXON_ID_NOT_FOUND, ELEVATION_MIN_MAX_SWAPPED, OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_COUNT, AMBIGUOUS_INSTITUTION]    173
[CONTINENT_DERIVED_FROM_COORDINATES, TAXON_CONCEPT_ID_NOT_FOUND]                                                                                                                                                  159
[COORDINATE_ROUNDED, GEODETIC_DATUM_ASSUMED_WGS84, CONTINENT_DERIVED_FROM_COORDINATES, SCIENTIFIC_NAME_ID_NOT_FOUND, TAXON_ID_NOT_FOUND, 

In [20]:
# Number of records per institutionCode value.
df_gbif["institutionCode"].value_counts()

institutionCode
BPI         580
ENCB-IPN    255
NY          213
ECOSUR      186
LSUM        139
           ... 
UT            1
CTES          1
BMS           1
B             1
HUSA          1
Name: count, Length: 143, dtype: int64

In [19]:
# Check whether it is worth filtering on this field.
df_gbif["identificationVerificationStatus"].value_counts()

identificationVerificationStatus
Approved | Expert | Evidence                                                                                                                143
verified                                                                                                                                     29
Probable                                                                                                                                      9
Probable, high degree of reliability                                                                                                          4
Invalide                                                                                                                                      3
<a href='https://bee.questagame.com/#/expert/pastwork/454550/comments?search=true&sighting_source_id=454550&_k=1vxxpo'>Challenge ID?</a>      1
<a href='https://bee.questagame.com/#/expert/pastwork/455663/comments?search=true&sighting_source_id=45