# EDA Marine Great Barrier Reef (Australia) (Microplastics) 🌊

Este CSV es una ampliación del conjunto de datos Marine_Microplastics.

## Importación de librerías y datos 📁

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Display configuration
# -----------------------------------------------------------------------
pd.options.display.max_columns = None  # no column limit, so every DataFrame column is shown

In [2]:
# Load the semicolon-delimited export; text fields are wrapped in double quotes.
ruta_csv = "files/Marine Microplastic Concentrations_Gran_reef_coral.csv"
df_aus = pd.read_csv(
    ruta_csv,
    sep=";",
    quotechar='"',
    encoding="utf-8",
    engine="python",
    on_bad_lines="warn",  # 'skip' can be used instead to ignore bad lines
)

df_aus.head()

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link
0,10262,26/07/2012,-165.766,1.457.434,Pacific Ocean,Coral Sea,Great Barrier Reef,0.023068,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
1,10263,26/07/2012,-165.617,1.457.530,Pacific Ocean,Coral Sea,Great Barrier Reef,0.063437,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
2,10264,26/07/2012,-165.429,1.457.642,Pacific Ocean,Coral Sea,Great Barrier Reef,0.046136,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
3,10265,27/07/2012,-150.275,1.453.907,Pacific Ocean,Coral Sea,Great Barrier Reef,0.027107,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
4,10266,27/07/2012,-150.086,1.453.855,Pacific Ocean,Coral Sea,Great Barrier Reef,0.060851,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...


## Primera exploración 🔎

In [3]:
# Report of nulls and duplicates for a DataFrame. Next step --> ETL

def nulos_duplicados(df_aus):
    """Print a report with the null percentage per column and the duplicate-row count.

    Parameters
    ----------
    df_aus : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    sangria = "    "
    separador = "------------------------------------------------------------"

    # Fraction of missing rows per column, expressed as a percentage
    porcentaje_nulos = df_aus.isna().mean() * 100

    # Fully duplicated rows (all columns equal to an earlier row)
    duplicados = df_aus.duplicated().sum()
    mensaje_duplicados = (
        "No hay duplicados" if duplicados == 0 else f"Hay {duplicados} duplicados"
    )

    # Build the report line by line so that EVERY row of the multi-line
    # percentage table receives the same indentation; interpolating the table
    # into an indented template only indents its first row.
    lineas = [
        "===================== Informe de Datos =====================",
        "",
        "Porcentaje de Nulos por Columna:",
        separador,
        *porcentaje_nulos.to_string().splitlines(),
        "",
        separador,
        "Duplicados:",
        separador,
        mensaje_duplicados,
        "",
        "============================================================",
        "",
    ]
    reporte = "\n" + "\n".join(sangria + linea for linea in lineas)

    print(reporte)

# Example usage
# df_mp = pd.DataFrame(...)

# Call the function directly; it prints the report for the loaded frame
nulos_duplicados(df_aus)



    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    OBJECTID                               0.0
Date                                   0.0
Latitude                               0.0
Longitude                              0.0
Oceans                                 0.0
Regions                                0.0
Location                               0.0
Microplastics Measurement (density)    0.0
Unit                                   0.0
Density Class Range                    0.0
Concentration Class                    0.0
Sampling Method                        0.0
Short Reference                        0.0
Long Reference                         0.0
DOI                                    0.0
Organization                           0.0
Keywords                               0.0
NCEI Accession Number                  0.0
NCEI Accession Link                    0.0
    
    ------------------------------------------------------------
 

## Transformaciones 💻

In [4]:
# Function that rescales the number
def transformar_numero(numero):
    """Rescale a value by 1/10,000 and return it as text with four decimals.

    Strings have every '.' removed, the remaining digits are read as an
    integer and divided by 10,000 (e.g. "1.457.434" -> "145.7434").
    Numeric inputs are divided by 10,000 directly.

    Parameters
    ----------
    numero : str, int, float or NumPy scalar
        Value to convert.

    Returns
    -------
    str, float or None
        The formatted string. NaN is returned unchanged so the value is still
        recognised as missing; any unsupported type (including None) yields
        None.

    Raises
    ------
    ValueError
        If a string does not contain an integer once its dots are removed.
    """
    if isinstance(numero, str):
        # Dots are misplaced separators: drop them and rescale the digits
        return "{:.4f}".format(int(numero.replace(".", "")) / 10000)

    if isinstance(numero, (int, float, np.integer, np.floating)):
        if numero != numero:
            # NaN: keep it as a real missing value instead of the text "nan"
            return numero
        # No ',' grouping flag: the result must stay parseable as a number
        return "{:.4f}".format(numero / 10000)

    # Unsupported type (None included): nothing to convert
    return None

# Apply the function to every value of the 'Longitude' column
df_aus['Longitude'] = df_aus['Longitude'].apply(transformar_numero)

In [5]:
def transformar_latitud(latitud):
    """Divide a numeric value by 10 and return it as text with four decimals.

    Parameters
    ----------
    latitud : int or float
        Value to rescale (e.g. -165.766 -> "-16.5766").

    Returns
    -------
    str, float or None
        The formatted string. Missing values (None or NaN) are returned
        unchanged so they are still recognised as missing, not turned into
        the text "nan".

    Raises
    ------
    TypeError
        If the value does not support division by 10 (for example a string).
    """
    # NaN is the only value that differs from itself
    if latitud is None or latitud != latitud:
        return latitud
    return "{:.4f}".format(latitud / 10)

# Apply the function to every value of the 'Latitude' column
df_aus['Latitude'] = df_aus['Latitude'].apply(transformar_latitud)

In [6]:
# Con estos dos pasos se queda corregida la latitud y la longitud

In [7]:
# Display the frame to check the rescaled Latitude and Longitude columns
df_aus

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Class Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,Keywords,NCEI Accession Number,NCEI Accession Link
0,10262,26/07/2012,-16.5766,145.7434,Pacific Ocean,Coral Sea,Great Barrier Reef,0.023068,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
1,10263,26/07/2012,-16.5617,145.7530,Pacific Ocean,Coral Sea,Great Barrier Reef,0.063437,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
2,10264,26/07/2012,-16.5429,145.7642,Pacific Ocean,Coral Sea,Great Barrier Reef,0.046136,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
3,10265,27/07/2012,-15.0275,145.3907,Pacific Ocean,Coral Sea,Great Barrier Reef,0.027107,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
4,10266,27/07/2012,-15.0086,145.3855,Pacific Ocean,Coral Sea,Great Barrier Reef,0.060851,pieces/m3,0.005-1,Medium,Manta net,Reisser et al.2013,"Reisser, J., J. Shaw, C. Wilcox, B.D. Hardesty...",https://doi.org/10.1371/journal.pone.0080466,University of Western Australia,RV Southern Surveyor/RV Solander,252260,https://www.ncei.noaa.gov/access/metadata/land...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,21481,22/03/2020,-20.2514,148.9329,Pacific Ocean,Coral Sea,Great Barrier Reef,0.027810,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...
61,21482,23/03/2020,-20.2765,148.9170,Pacific Ocean,Coral Sea,Great Barrier Reef,0.043204,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...
62,21483,23/03/2020,-20.2281,148.7833,Pacific Ocean,Coral Sea,Great Barrier Reef,0.034875,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...
63,21484,24/03/2020,-20.3456,148.8388,Pacific Ocean,Coral Sea,Great Barrier Reef,0.080216,pieces/m3,0.005-1,Medium,Manta net,Carbery et al. 2022,"Carbery, M., F. Herb, J. Reynes, C. K. Pham, W...",https://doi.org/10.1016/j.marpolbul.2022.114179,"University of Newcastle, Australia",Sail and Explore Association,279321,https://www.ncei.noaa.gov/access/metadata/land...


In [8]:
# Mean-density column

# Function that removes the '>=' and '>' symbols
def eliminar_menor_igual(rango):
    """Return ``rango`` without any '>=' or '>' and with outer whitespace trimmed."""
    limpio = rango
    # '>=' is removed before '>' so that no stray '=' is left behind
    for simbolo in ('>=', '>'):
        limpio = limpio.replace(simbolo, '')
    return limpio.strip()

# Apply the function to strip '>=' and '>' from the 'Density Class Range' column
df_aus['Density Class Range'] = df_aus['Density Class Range'].apply(eliminar_menor_igual)

In [9]:
# Function that extracts the numeric values and computes the central value
def calcular_densidad_central(rango):
    """Return the midpoint of a 'min-max' range string, or the number itself.

    A text without '-' is converted to float as is. Otherwise spaces are
    removed, the text is split on '-' and the two bounds are averaged.
    A ValueError is raised if a part is not numeric or if the split does not
    yield exactly two parts.
    """
    # Single value: no hyphen, so there is nothing to average
    if '-' not in rango:
        return float(rango.strip())

    # Range: convert every bound, then unpack exactly two of them
    extremos = [float(parte) for parte in rango.replace(' ', '').split('-')]
    inferior, superior = extremos
    return (inferior + superior) / 2

# Apply the function to 'Density Class Range' and store the result in the new 'Density Center' column
df_aus['Density Center'] = df_aus['Density Class Range'].apply(calcular_densidad_central)

In [11]:
# Rename 'Density Class Range' to the shorter 'Density Range'
df_aus = df_aus.rename(columns={'Density Class Range': 'Density Range'})

In [12]:
# Columns kept in the cleaned frame, in output order.
# NOTE(review): 'Density Center', created earlier in this notebook, is not
# listed here and is therefore dropped by this selection - confirm intended.
columnas_finales = [
    'OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
    'Location', 'Microplastics Measurement (density)', 'Unit',
    'Density Range', 'Concentration Class', 'Sampling Method',
    'Organization',
]
df_aus = df_aus[columnas_finales]

In [None]:
#Guardar csv limpio
# df_aus.to_csv('../files/transition_files/Microplastics_por_zonas/Great_Barrier_Reefcolum.csv', index=False)