# Análisis exploratorio de data raw de gbif

Análisis de los datos descargados de gbif para evaluar pasos de limpieza.

## Librerías y ambiente

In [1]:
import sys
from pathlib import Path

ROOT = Path().resolve().parents[0]
sys.path.append(str(ROOT))

In [1]:
import pandas as pd

## Análisis

In [26]:
df_raw = pd.read_csv('../data/raw/df_trametes_sanguinea.csv')
df_raw.head()

Unnamed: 0.1,Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,hostingOrganizationKey,publishingCountry,protocol,lastCrawled,lastParsed,...,originalNameUsage,nameAccordingTo,georeferencedDate,distanceFromCentroidInMeters,fieldNotes,http://unknown.org/taxonRankID,verbatimSRS,verbatimCoordinateSystem,http://unknown.org/species,taxonRemarks
0,0,5007775853,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,US,DWC_ARCHIVE,2025-12-24T14:08:22.718+00:00,2025-12-25T06:16:59.298+00:00,...,,,,,,,,,,
1,1,5007900444,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,US,DWC_ARCHIVE,2025-12-24T14:08:22.718+00:00,2025-12-25T05:25:33.982+00:00,...,,,,,,,,,,
2,2,5036742586,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,US,DWC_ARCHIVE,2025-12-24T14:08:22.718+00:00,2025-12-25T03:50:47.119+00:00,...,,,,,,,,,,
3,3,5037098754,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,US,DWC_ARCHIVE,2025-12-24T14:08:22.718+00:00,2025-12-25T03:51:30.132+00:00,...,,,,,,,,,,
4,4,5037144192,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,AR,DWC_ARCHIVE,2025-12-24T14:08:22.718+00:00,2025-12-25T03:51:37.150+00:00,...,,,,,,,,,,


In [27]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3071 entries, 0 to 3070
Columns: 169 entries, Unnamed: 0 to taxonRemarks
dtypes: bool(2), float64(24), int64(11), object(132)
memory usage: 3.9+ MB


### Campos de fecha

Se conserva la columna eventDate que es la fecha normalizada de la observación. En este caso no interesa el horario.

In [28]:
date_cols = ['lastCrawled',
             'lastParsed',
             'dateIdentified',
             'eventDate',
             'year',
             'month',
             'day',
             'startDayOfYear',
             'endDayOfYear',
             'modified',
             'lastInterpreted',
             'verbatimEventDate',
             'eventTime']

df_raw[date_cols].describe()

Unnamed: 0,year,month,day,startDayOfYear,endDayOfYear
count,3026.0,2850.0,2722.0,2722.0,2722.0
mean,2014.479511,6.073333,15.544453,170.735489,170.735489
std,18.581082,3.188347,9.12696,98.270961,98.270961
min,1876.0,1.0,1.0,1.0,1.0
25%,2012.0,4.0,8.0,91.0,91.0
50%,2020.0,5.0,16.0,148.0,148.0
75%,2024.0,9.0,24.0,260.0,260.0
max,2025.0,12.0,31.0,365.0,365.0


### Geografía

- Hay algunas observaciones con latitud longitud nulas (las que tienen geoprivacy 'private').


In [30]:
geo_cols = ['decimalLatitude',
            'decimalLongitude',
            'continent',
            'stateProvince',
            'coordinateUncertaintyInMeters',
            ]

df_geo = df_raw[geo_cols]
df_geo.head()

Unnamed: 0,decimalLatitude,decimalLongitude,continent,stateProvince,coordinateUncertaintyInMeters
0,-32.28491,-60.693062,SOUTH_AMERICA,Santa Fe,
1,-32.269503,-60.69539,SOUTH_AMERICA,Santa Fe,
2,-34.154603,-58.537305,SOUTH_AMERICA,Buenos Aires,4.0
3,-23.68929,-64.820195,SOUTH_AMERICA,Jujuy,4702.0
4,-34.582583,-58.415888,SOUTH_AMERICA,Ciudad de Buenos Aires,14.0


In [31]:
df_geo.isna().sum()

decimalLatitude                     0
decimalLongitude                    0
continent                          10
stateProvince                     505
coordinateUncertaintyInMeters    1573
dtype: int64

In [32]:
df_raw.coordinateUncertaintyInMeters.describe() # se pueden filtrar las que tengan más uncertainty

count    1.498000e+03
mean     5.406696e+03
std      7.988577e+04
min      1.000000e+00
25%      1.700000e+01
50%      1.600000e+02
75%      3.750000e+02
max      2.362591e+06
Name: coordinateUncertaintyInMeters, dtype: float64

In [36]:
df_raw[df_raw.coordinateUncertaintyInMeters > 5000].shape

(83, 169)

In [38]:
df_raw.continent.value_counts()

continent
SOUTH_AMERICA    3060
NORTH_AMERICA       1
Name: count, dtype: int64

In [None]:
df_raw[df_raw.continent == 'NORTH_AMERICA'] # observación con country code erróneo

Unnamed: 0.1,Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,hostingOrganizationKey,publishingCountry,protocol,lastCrawled,lastParsed,...,originalNameUsage,nameAccordingTo,georeferencedDate,distanceFromCentroidInMeters,fieldNotes,http://unknown.org/taxonRankID,verbatimSRS,verbatimCoordinateSystem,http://unknown.org/species,taxonRemarks
2587,2587,3709724736,7625a91a-54fb-49cc-a916-6349169d2371,7a6bdf66-ef5c-4a81-b731-2e328f4881eb,4fd221cd-129c-404f-ab62-fdd2377d12ac,497c081b-8157-4287-bb5c-291a4c71439c,AR,EML,2026-01-01T03:01:19.472+00:00,2026-01-01T03:01:31.740+00:00,...,,,,,,,,,,


### Species guess and names

In [34]:
df_raw.scientificName.value_counts()

scientificName
Polyporaceae                                             1490
Trametes versicolor (L.) Lloyd                            119
Trametes Fr., 1836                                        117
Coltricia stuckertiana (Speg.) Rajchenb. & J.E.Wright     102
SH1248940.09FU                                             74
                                                         ... 
Poria perparadoxa (Speg.) Sacc.                             1
Fomes australis (Fr.) Cooke                                 1
Polyporus similis Berk.                                     1
Fomes lividus (Kalchbr.) Sacc.                              1
Daedalea elegans Spreng.                                    1
Name: count, Length: 197, dtype: int64

In [37]:
df_raw.acceptedScientificName.value_counts()

acceptedScientificName
Polyporaceae                                       1490
Trametes versicolor (L.) Lloyd                      120
Trametes Fr., 1836                                  117
Trametes stuckertiana (Speg.) Speg.                 104
SH1248940.09FU                                       74
                                                   ... 
Xerotus Fr.                                           1
Ganoderma multipileum Ding Hou                        1
Perenniporia vanhulleae Decock & Ryvarden             1
Ganoderma sanduense Hapuar., T.C.Wen & K.D.Hyde       1
Truncospora livida (Kalchbr.) Zmitr.                  1
Name: count, Length: 154, dtype: int64