# EDA Marine Galapagos (Microplastics) 🌊

Este csv es una ampliación de Marine_Microplastics

## Importación de librerías y datos 📁

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed

In [None]:
# Load the Galapagos microplastics CSV. The file is semicolon-separated and is
# parsed with the Python engine; malformed rows raise a warning instead of
# aborting the load ('skip' would ignore them without a warning).
opciones_lectura = {
    "sep": ";",
    "engine": "python",
    "encoding": "utf-8",
    "quotechar": '"',
    "on_bad_lines": "warn",
}
df_gal = pd.read_csv("../files/Marine_Galapagos.csv", **opciones_lectura)

# Preview the first rows as the cell output.
df_gal.head()

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,SubRegions,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link,x,y
0,6191,04/12/2001,0.07,-897.9,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-9995377078,7792366
1,6192,05/12/2001,-0.51,-900.8,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1002765973,-5677369
2,6193,08/12/2001,-0.87,-902.6,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1004769724,-96851679
3,6194,09/12/2001,-121.0,-904.3,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1006662155,-134706597
4,6195,09/12/2001,-107.0,-908.4,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1011226254,-119118779


## Primera exploración 🔎

In [3]:
# Null/duplicate summary report. Next step --> ETL

def nulos_duplicados(df_gal):
    """Print a report with the null percentage per column and the duplicate count.

    Parameters
    ----------
    df_gal : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Share of missing values per column, as a percentage of the row count.
    porcentaje_nulos = df_gal.isna().sum() / df_gal.shape[0] * 100

    # Number of rows that repeat an earlier row.
    duplicados = df_gal.duplicated().sum()
    if duplicados == 0:
        mensaje_duplicados = "No hay duplicados"
    else:
        mensaje_duplicados = f"Hay {duplicados} duplicados"

    # Series.to_string() spans several lines, but only its first line would
    # inherit the template's 4-space indent; re-indent the continuation lines
    # so the whole table lines up with the rest of the report.
    tabla_nulos = porcentaje_nulos.to_string().replace("\n", "\n    ")

    reporte = f"""
    ===================== Informe de Datos =====================
    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    {tabla_nulos}
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------------------------------------------
    {mensaje_duplicados}
    
    ============================================================
    """

    # Emit the report directly; nothing is returned.
    print(reporte)

# Usage example
# df_mp = pd.DataFrame(...)

# Run the null/duplicate report on the loaded frame
nulos_duplicados(df_gal)



    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    OBJECTID                                 0.0
Date                                     0.0
Latitude                                 0.0
Longitude                                0.0
Oceans                                   0.0
Regions                                100.0
SubRegions                             100.0
Microplastics Measurement (density)      0.0
Unit                                     0.0
Density Class Range                      0.0
Concentration Class                      0.0
Sampling Method                          0.0
Short Reference                          0.0
Long Reference                           0.0
DOI                                      0.0
Organization                             0.0
Keywords                                 0.0
NCEI Accession Number                    0.0
NCEI Accession Link                      0.0
x                                

## Transformaciones 💻

In [4]:
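# Corregimos la longitud dividiéndola entre 10 (en el CSV aparece como -897.9 en lugar de -89.79).
# OJO: algunas latitudes de la tabla anterior (p. ej. -121.0 y -107.0) también quedan fuera del rango válido [-90, 90] y habría que revisarlas.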

In [5]:
def corregir_longitud(longitud):
    """Divide a longitude value by 10 and render it with four decimals.

    The result is a string (e.g. -897.9 -> "-89.7900"), not a number.
    NOTE(review): applying this to a column turns it into text; confirm that
    a string column is what downstream steps expect.
    """
    escalada = longitud / 10
    return f"{escalada:.4f}"

# Rescale every value of the 'Longitude' column element by element.
df_gal['Longitude'] = df_gal['Longitude'].map(corregir_longitud)


In [6]:
# Display the frame to check the corrected 'Longitude' values.
df_gal

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,SubRegions,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link,x,y
0,6191,04/12/2001,0.07,-89.79,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-9995377078,7792366
1,6192,05/12/2001,-0.51,-90.08,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1002765973,-5677369
2,6193,08/12/2001,-0.87,-90.26,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1004769724,-96851679
3,6194,09/12/2001,-121.0,-90.43,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1006662155,-134706597
4,6195,09/12/2001,-107.0,-90.84,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1011226254,-119118779
5,6196,10/12/2001,-113.0,-91.96,Pacific Ocean,,,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,-1023694037,-12579918
6,8982,24/11/2014,-0.7485,-90.3131,Pacific Ocean,,,0.001,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-100536083,-83325009
7,9371,06/02/2015,0.2891,-90.5589,Pacific Ocean,,,0.003,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-1008097064,32182601
8,9373,07/02/2015,0.314,-89.9471,Pacific Ocean,,,0.002,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-1001286537,34954495
9,9806,25/04/2016,-0.57,-90.57,Pacific Ocean,,,0.003,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-1008220628,-63453156


In [7]:
# Columna de media de density

def eliminar_menor_igual(rango):
    """Strip the '>=' and '>' markers from a range label and trim whitespace.

    NOTE: despite the name ("remove less-or-equal"), the symbols removed are
    the greater-than variants.
    """
    # '>=' must go first; removing '>' first would leave a stray '=' behind.
    for simbolo in ('>=', '>'):
        rango = rango.replace(simbolo, '')
    return rango.strip()

# Clean the '>=' / '>' markers out of every 'Density Class Range' label.
df_gal['Density Class Range'] = df_gal['Density Class Range'].map(eliminar_menor_igual)

In [8]:
# Numeric centre of a density label
def calcular_densidad_central(rango):
    """Return the numeric centre of a density label.

    A label without a hyphen is parsed as a single number; a label of the
    form "low-high" yields the arithmetic mean of both bounds.
    """
    # Guard clause: no hyphen means the label is a lone number.
    if '-' not in rango:
        return float(rango.strip())

    # Range label: drop inner spaces, split on the hyphen and average the bounds.
    sin_espacios = rango.replace(' ', '')
    limite_inferior, limite_superior = (float(parte) for parte in sin_espacios.split('-'))
    return (limite_inferior + limite_superior) / 2

# Derive the new 'Density Center' column from the 'Density Class Range' labels.
df_gal['Density Center'] = df_gal['Density Class Range'].map(calcular_densidad_central)

In [9]:
# Label every row as Galápagos: 'Location' and 'Regions' both receive the
# same constant value.
for columna in ('Location', 'Regions'):
    df_gal[columna] = 'Galápagos'

# Drop 'SubRegions' together with the 'x' and 'y' columns.
df_gal = df_gal.drop(['SubRegions', 'x', 'y'], axis='columns')


In [11]:
# Shorten the range column name: 'Density Class Range' -> 'Density Range'.
df_gal = df_gal.rename({'Density Class Range': 'Density Range'}, axis='columns')

In [12]:
# Keep only the columns wanted in the cleaned dataset, in this order.
# NOTE(review): 'Density Center' is not in this list, so the column computed
# earlier in the notebook is discarded here; confirm that is intended.
columnas_finales = [
    'OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
    'Location', 'Microplastics Measurement (density)', 'Unit',
    'Density Range', 'Concentration Class', 'Sampling Method',
    'Organization',
]
df_gal = df_gal[columnas_finales]

In [None]:
#Guardar csv limpio
# df_gal.to_csv('../files/transition_files/Microplastics_por_zonas/Galapagoscolum.csv', index=False)