In [134]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow as pa
import os

In [136]:
# Load the raw occurrence exports, one CSV per region.
# low_memory=False asks pandas to parse each file in a single pass rather than
# in chunks, which avoids mixed-dtype inference on wide files.
chile_df = pd.read_csv('data/chile.csv', low_memory=False)
argentina_df = pd.read_csv('data/argentina.csv', low_memory=False)
antartica_df = pd.read_csv('data/antartica.csv', low_memory=False)

# Display label -> raw frame, so the same summary can be produced for each region.
dataframes = {
    'Chile': chile_df,
    'Argentina': argentina_df,
    'Antártica': antartica_df
}

for name, df in dataframes.items():
    print(f"DataFrame: {name}")
    print(f"Número de columnas: {len(df.columns)}")
    print("Información de las columnas:")
    # DataFrame.info() writes its report to stdout and returns None, so wrapping
    # it in print() only appended a stray "None" line. The first positional
    # argument of info() is `verbose`; the original passed 5 there, which is
    # truthy, so verbose=True keeps the full per-column listing.
    # NOTE(review): if the 5 was meant as a column limit, use max_cols=5 instead.
    df.info(verbose=True)
    print("\n")


DataFrame: Chile
Número de columnas: 146
Información de las columnas:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9481 entries, 0 to 9480
Data columns (total 146 columns):
 #    Column               Dtype  
---   ------               -----  
 0    occurrence_no        int64  
 1    record_type          object 
 2    reid_no              float64
 3    flags                object 
 4    collection_no        int64  
 5    accepted_name        object 
 6    accepted_attr        float64
 7    accepted_rank        object 
 8    accepted_no          float64
 9    early_interval       object 
 10   late_interval        object 
 11   max_ma               float64
 12   min_ma               float64
 13   ref_author           object 
 14   ref_pubyr            int64  
 15   reference_no         int64  
 16   phylum               object 
 17   phylum_no            object 
 18   class                object 
 19   class_no             object 
 20   order                object 
 21   order_no   

Como grupo fuimos revisando manualmente qué columnas nos servirían para responder las preguntas planteadas en la propuesta inicial, y llegamos al consenso de elegir las siguientes 19 columnas como base para responderlas.

In [138]:
# The 19 columns kept for the analysis, in the order they should appear in the
# working frames.
columns = [
    'species_name', 'occurrence_no', 'accepted_name', 'accepted_rank',
    'early_interval', 'late_interval', 'max_ma', 'min_ma',
    'phylum', 'class', 'order', 'family', 'genus',
    'lat', 'lng',
    'diet', 'geogscale', 'regionalsection', 'life_habit',
]

# Take an explicit copy of each selection so later edits to the working frames
# never write back into (or warn about) the raw frames.
df_chile_new = chile_df.loc[:, columns].copy()
df_argentina_new = argentina_df.loc[:, columns].copy()
df_antartica_new = antartica_df.loc[:, columns].copy()

In [139]:
# Show the Chile working frame; as a bare last expression it is rendered as the
# cell's output table.
df_chile_new

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,regionalsection,life_habit
0,ceciliana,16417,Leionucula,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculida,Nuculidae,Leionucula,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,,infaunal
1,amuriensis,16418,Nuculana,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Nuculanidae,Nuculana,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,,infaunal
2,cuneiformis,16419,Nuculana,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Nuculanidae,Nuculana,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,,infaunal
3,levitestata,16420,Yoldia,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Yoldiidae,Yoldia,-37.000000,-73.000000,deposit feeder,basin,,infaunal
4,pencana,16421,Neiloides,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculida,NO_FAMILY_SPECIFIED,Neiloides,-37.000000,-73.000000,deposit feeder,basin,,infaunal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9476,sp.,1659402,Ostrea,genus,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Ostreida,Ostreidae,Ostrea,-50.884998,-72.625000,suspension feeder,,,epifaunal
9477,antarctica,1659403,Cucullaea antarctica,species,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Arcida,Cucullaeidae,Cucullaea,-50.884998,-72.625000,suspension feeder,,,infaunal
9478,sp.,1659404,Nucula,genus,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Nuculida,Nuculidae,Nucula,-50.884998,-72.625000,"deposit feeder, suspension feeder",,,infaunal
9479,saladensis,1660104,Micrauchenia saladensis,species,Tortonian,Messinian,11.63,5.333,Chordata,Mammalia,Panameriungulata,Macraucheniidae,Micrauchenia,-27.684723,-70.949997,herbivore,,,scansorial


In [140]:
# Boolean mask of rows in the Chile frame that are exact copies of another row.
# keep=False flags every member of a duplicate group, not just the repeats.
# NOTE(review): the comparison includes occurrence_no, which looks like a
# per-row identifier; if it is unique this check can never match. Consider
# duplicated(subset=...) without it. Confirm before relying on the empty result.
obtener_duplicados_chile = df_chile_new.duplicated(keep=False)

# Rows selected by the mask; shown as the cell output.
datos_duplicados_chile = df_chile_new.loc[obtener_duplicados_chile]
datos_duplicados_chile

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,regionalsection,life_habit


In [141]:
# Boolean mask of rows in the Argentina frame that are exact copies of another
# row. keep=False flags every member of a duplicate group, not just the repeats.
# NOTE(review): the comparison includes occurrence_no, which looks like a
# per-row identifier; if it is unique this check can never match. Consider
# duplicated(subset=...) without it. Confirm before relying on the empty result.
obtener_duplicados_argentina = df_argentina_new.duplicated(keep=False)

# Rows selected by the mask; shown as the cell output.
datos_duplicados_argentina = df_argentina_new.loc[obtener_duplicados_argentina]
datos_duplicados_argentina

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,regionalsection,life_habit


In [142]:
# Boolean mask of rows in the Antarctica frame that are exact copies of another
# row. keep=False flags every member of a duplicate group, not just the repeats.
# NOTE(review): the comparison includes occurrence_no, which looks like a
# per-row identifier; if it is unique this check can never match. Consider
# duplicated(subset=...) without it. Confirm before relying on the empty result.
obtener_duplicados_antartica = df_antartica_new.duplicated(keep=False)

# Rows selected by the mask; shown as the cell output.
datos_duplicados_antartica = df_antartica_new.loc[obtener_duplicados_antartica]
datos_duplicados_antartica

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,regionalsection,life_habit


In [143]:
# Print the per-column non-null counts and dtypes of the Chile working frame to
# see which columns have missing values.
df_chile_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9481 entries, 0 to 9480
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   species_name     9481 non-null   object 
 1   occurrence_no    9481 non-null   int64  
 2   accepted_name    9343 non-null   object 
 3   accepted_rank    9343 non-null   object 
 4   early_interval   9481 non-null   object 
 5   late_interval    1605 non-null   object 
 6   max_ma           9481 non-null   float64
 7   min_ma           9481 non-null   float64
 8   phylum           9337 non-null   object 
 9   class            9292 non-null   object 
 10  order            9123 non-null   object 
 11  family           8914 non-null   object 
 12  genus            8574 non-null   object 
 13  lat              9481 non-null   float64
 14  lng              9481 non-null   float64
 15  diet             8749 non-null   object 
 16  geogscale        6426 non-null   object 
 17  regionalsectio

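Como podemos ver, la columna 17 (`regionalsection`) aparece vacía en las filas mostradas, por lo que la eliminamos de los tres DataFrames en las celdas siguientes.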

Además, todas las columnas de tipo float e int están completas (9481 valores no nulos cada una en el DataFrame de Chile).

In [146]:
# Remove the regionalsection column from the Chile working frame.
# errors="ignore" makes this self-reassigning cell safe to re-run: without it,
# a second execution raises KeyError because the column is already gone.
df_chile_new = df_chile_new.drop(columns=["regionalsection"], errors="ignore")

In [147]:
# Show the Chile working frame again to confirm regionalsection is no longer
# among its columns.
df_chile_new

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,life_habit
0,ceciliana,16417,Leionucula,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculida,Nuculidae,Leionucula,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,infaunal
1,amuriensis,16418,Nuculana,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Nuculanidae,Nuculana,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,infaunal
2,cuneiformis,16419,Nuculana,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Nuculanidae,Nuculana,-37.000000,-73.000000,"deposit feeder, suspension feeder",basin,infaunal
3,levitestata,16420,Yoldia,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculanida,Yoldiidae,Yoldia,-37.000000,-73.000000,deposit feeder,basin,infaunal
4,pencana,16421,Neiloides,genus,Maastrichtian,,72.10,66.000,Mollusca,Bivalvia,Nuculida,NO_FAMILY_SPECIFIED,Neiloides,-37.000000,-73.000000,deposit feeder,basin,infaunal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9476,sp.,1659402,Ostrea,genus,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Ostreida,Ostreidae,Ostrea,-50.884998,-72.625000,suspension feeder,,epifaunal
9477,antarctica,1659403,Cucullaea antarctica,species,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Arcida,Cucullaeidae,Cucullaea,-50.884998,-72.625000,suspension feeder,,infaunal
9478,sp.,1659404,Nucula,genus,Turonian,Campanian,93.90,72.100,Mollusca,Bivalvia,Nuculida,Nuculidae,Nucula,-50.884998,-72.625000,"deposit feeder, suspension feeder",,infaunal
9479,saladensis,1660104,Micrauchenia saladensis,species,Tortonian,Messinian,11.63,5.333,Chordata,Mammalia,Panameriungulata,Macraucheniidae,Micrauchenia,-27.684723,-70.949997,herbivore,,scansorial


In [148]:
# Remove the regionalsection column from the Argentina working frame.
# errors="ignore" makes this self-reassigning cell safe to re-run: without it,
# a second execution raises KeyError because the column is already gone.
df_argentina_new = df_argentina_new.drop(columns=["regionalsection"], errors="ignore")

In [149]:
# Show the Argentina working frame to confirm regionalsection is no longer
# among its columns.
df_argentina_new

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,life_habit
0,sp.,17744,Nuculana,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Nuculanida,Nuculanidae,Nuculana,-39.000000,-69.000000,"deposit feeder, suspension feeder",basin,infaunal
1,rocana,17745,Cucullaea,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Arcida,Cucullaeidae,Cucullaea,-39.000000,-69.000000,suspension feeder,basin,infaunal
2,feruglioli,17746,Glycymerita,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Arcida,Glycymerididae,Glycymerita,-39.000000,-69.000000,suspension feeder,basin,semi-infaunal
3,sp.,17747,Glycymerita,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Arcida,Glycymerididae,Glycymerita,-39.000000,-69.000000,suspension feeder,basin,semi-infaunal
4,mechanquilensis,17748,Inoceramus,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Myalinida,Inoceramidae,Inoceramus,-39.000000,-69.000000,suspension feeder,basin,epifaunal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31876,cretacea,1664301,Guembelitria cretacea,species,Maastrichtian,,72.1,66.0,Foraminifera,NO_CLASS_SPECIFIED,Foraminifera,Guembelitriidae,Guembelitria,-38.099998,-68.383331,omnivore,,semi-infaunal
31877,macrocephala,1664302,Rugoglobigerina macrocephala,species,Maastrichtian,,72.1,66.0,Foraminifera,NO_CLASS_SPECIFIED,Foraminifera,Globotruncanidae,Rugoglobigerina,-38.099998,-68.383331,omnivore,,semi-infaunal
31878,excellens,1664303,Cythereis excellens,species,Maastrichtian,,72.1,66.0,Arthropoda,Ostracoda,Podocopida,Trachyleberididae,Cythereis,-38.099998,-68.383331,"detritivore, grazer",,epifaunal
31879,venusta,1664304,Petalocythereis venusta,species,Maastrichtian,,72.1,66.0,Arthropoda,Ostracoda,Podocopida,Trachyleberididae,Petalocythereis,-38.099998,-68.383331,"detritivore, grazer",,epifaunal


In [150]:
# Remove the regionalsection column from the Antarctica working frame.
# errors="ignore" makes this self-reassigning cell safe to re-run: without it,
# a second execution raises KeyError because the column is already gone.
df_antartica_new = df_antartica_new.drop(columns=["regionalsection"], errors="ignore")

In [158]:
# Show the Antarctica working frame to confirm regionalsection is no longer
# among its columns.
df_antartica_new

Unnamed: 0,species_name,occurrence_no,accepted_name,accepted_rank,early_interval,late_interval,max_ma,min_ma,phylum,class,order,family,genus,lat,lng,diet,geogscale,life_habit
0,rossiana,16484,Solemya,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Solemyida,Solemyidae,Solemya,-64.000000,-57.000000,"deposit feeder, chemosymbiotic",basin,deep infaunal
1,suboblonga,16485,Nucula (Leionucula) suboblonga,species,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Nuculida,Nuculidae,Leionucula,-64.000000,-57.000000,"deposit feeder, suspension feeder",basin,infaunal
2,nordenskjoldi,16486,Nordenskjoeldia,genus,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Arcida,Parallelodontidae,Nordenskjoeldia,-64.000000,-57.000000,suspension feeder,basin,low-level epifaunal
3,oliveroi,16487,,,Maastrichtian,,72.1,66.0,,,,,,-64.000000,-57.000000,,basin,
4,antarctica,16488,Cucullaea antarctica,species,Maastrichtian,,72.1,66.0,Mollusca,Bivalvia,Arcida,Cucullaeidae,Cucullaea,-64.000000,-57.000000,suspension feeder,basin,infaunal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11306,indet.,1662164,Mosasauridae,family,Late Maastrichtian,,72.1,66.0,Chordata,Reptilia,Squamata,Mosasauridae,,-64.286942,-56.836945,carnivore,,aquatic
11307,sp.,1662165,Moanasaurus,genus,Late Maastrichtian,,72.1,66.0,Chordata,Reptilia,Squamata,Mosasauridae,Moanasaurus,-64.292221,-56.800835,carnivore,,aquatic
11308,indet.,1662166,Mosasaurinae,subfamily,Late Maastrichtian,,72.1,66.0,Chordata,Reptilia,Squamata,Mosasauridae,,-64.292221,-56.800835,carnivore,,aquatic
11309,sp.,1662168,Tylosaurinae,subfamily,Late Maastrichtian,,72.1,66.0,Chordata,Reptilia,Squamata,Mosasauridae,,-64.286942,-56.836945,carnivore,,aquatic
