# EDA Marine South_China_Sea (Microplastics) 🌊

Este CSV es una ampliación del conjunto de datos Marine_Microplastics.

## Importación de librerías y datos 📁

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed

In [None]:
# Load the semicolon-delimited South China Sea microplastics file.
df_chi = pd.read_csv(
    "../files/Marine Microplastic Concentrations_south_CHINA.csv",
    # File layout
    sep=";",
    quotechar='"',
    encoding="utf-8",
    # Parser behaviour
    engine="python",
    on_bad_lines='warn',  # malformed rows are skipped with a warning; 'skip' would drop them silently
)

# Preview the first rows
df_chi.head()

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,NCEI Accession Number,NCEI Accession Link
0,9376,14/02/2015,100.888,998.253,Pacific Ocean,Gulf of Thailand,South China Sea,0.004,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
1,9395,04/03/2015,71.221,994.243,Pacific Ocean,Malacca Strait,South China Sea,0.0,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
2,9757,14/03/2016,35.438,1.002.277,Pacific Ocean,Malacca Strait,South China Sea,0.077,pieces/m3,0.005-1,Medium,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
3,9784,28/12/2015,17.45,1.104.930,Pacific Ocean,South China Sea,South China Sea,0.0,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
4,9785,29/12/2015,18.1,1.103.250,Pacific Ocean,South China Sea,South China Sea,0.0,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...


## Primera exploración 🔎

In [3]:
# Report of nulls and duplicates in a DataFrame. Next step --> ETL

def nulos_duplicados(df_chi):
    """Print the percentage of nulls per column and the number of duplicated rows.

    Parameters
    ----------
    df_chi : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The report is written to stdout with ``print``.
    """
    # Percentage of missing values per column
    porcentaje_nulos = df_chi.isna().sum() / df_chi.shape[0] * 100

    # Count of rows that repeat an earlier row
    duplicados = df_chi.duplicated().sum()
    if duplicados == 0:
        mensaje_duplicados = "No hay duplicados"
    else:
        mensaje_duplicados = f"Hay {duplicados} duplicados"

    # The Series renders as several lines; interpolating it directly into the
    # indented template only indents its first row. Re-join the rows with the
    # template's indentation so the whole table lines up.
    tabla_nulos = "\n    ".join(porcentaje_nulos.to_string().splitlines())

    reporte = f"""
    ===================== Informe de Datos =====================
    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    {tabla_nulos}
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------------------------------------------
    {mensaje_duplicados}
    
    ============================================================
    """

    print(reporte)

# Example usage with any DataFrame:
# df_mp = pd.DataFrame(...)

# Print the null/duplicate report for df_chi
nulos_duplicados(df_chi)


    
    Porcentaje de Nulos por Columna:
    ------------------------------------------------------------
    OBJECTID                               0.0
Date                                   0.0
Latitude                               0.0
Longitude                              0.0
Oceans                                 0.0
Regions                                0.0
Location                               0.0
Microplastics Measurement (density)    0.0
Unit                                   0.0
Density Range                          0.0
Concentration Class                    0.0
Sampling Method                        0.0
Short Reference                        0.0
Long Reference                         0.0
DOI                                    0.0
Organization                           0.0
NCEI Accession Number                  0.0
NCEI Accession Link                    0.0
    
    ------------------------------------------------------------
    Duplicados:
    ------------------------

## Transformaciones 💻

In [4]:
# Re-scale a raw value such as "1.002.277" into "100.2277"
def transformar_numero(numero):
    """Divide a value by 10,000 and format it with four decimal places.

    Strings have every ``.`` removed before being parsed as an integer
    (``"1.002.277"`` -> ``1002277`` -> ``"100.2277"``); numeric values are
    divided as they are.

    Parameters
    ----------
    numero : str, int, float or NumPy numeric scalar
        Raw value to transform.

    Returns
    -------
    str or None
        The value divided by 10,000 rendered with ``"{:,.4f}"``. ``None`` is
        returned for any other input type.

    Raises
    ------
    ValueError
        If a string is not a valid integer once its dots are removed.
    """
    if isinstance(numero, str):
        # Dots are treated as stray separators, not as a decimal point
        return "{:,.4f}".format(int(numero.replace(".", "")) / 10000)
    # NumPy integer scalars are not subclasses of int, so they are listed
    # explicitly; otherwise they would fall through and silently yield None.
    if isinstance(numero, (int, float, np.integer, np.floating)):
        return "{:,.4f}".format(numero / 10000)
    # Unsupported types keep producing None, now explicitly
    return None

# Apply the function to every value; the 'Longitude' column is overwritten with the formatted strings
df_chi['Longitude'] = df_chi['Longitude'].apply(transformar_numero)

In [6]:
def transformar_latitud(latitud):
    """Return ``latitud`` divided by 10 as a string with four decimal places.

    ``latitud`` must support division by an integer (e.g. ``int`` or ``float``).
    """
    escalada = latitud / 10
    return f"{escalada:.4f}"

# Apply the function to every value; the 'Latitude' column is overwritten with the formatted strings.
# NOTE: transformar_latitud returns str, so running this cell a second time fails on `str / 10`.
df_chi['Latitude'] = df_chi['Latitude'].apply(transformar_latitud)

In [7]:
df_chi

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,NCEI Accession Number,NCEI Accession Link
0,9376,14/02/2015,10.0888,99.8253,Pacific Ocean,Gulf of Thailand,South China Sea,0.004000,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
1,9395,04/03/2015,7.1221,99.4243,Pacific Ocean,Malacca Strait,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
2,9757,14/03/2016,3.5438,100.2277,Pacific Ocean,Malacca Strait,South China Sea,0.077000,pieces/m3,0.005-1,Medium,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
3,9784,28/12/2015,1.7450,110.4930,Pacific Ocean,South China Sea,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
4,9785,29/12/2015,1.8100,110.3250,Pacific Ocean,South China Sea,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,21672,11/10/2017,21.7495,108.5672,Pacific Ocean,South China Sea,South China Sea,9.533.333.333,pieces/m3,>=10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...
178,21673,11/10/2017,21.7835,108.5466,Pacific Ocean,South China Sea,South China Sea,2.266.666.667,pieces/m3,>=10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...
179,21674,11/10/2017,21.7968,108.5517,Pacific Ocean,South China Sea,South China Sea,1.600.000.000,pieces/m3,>=10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...
180,21675,11/10/2017,21.8168,108.5710,Pacific Ocean,South China Sea,South China Sea,3.400.000.000,pieces/m3,>=10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...


In [8]:
# Columna de media de density

# Strip the '>=' and '>' markers from a range label
def eliminar_menor_igual(rango):
    """Remove every ``>=`` and ``>`` from ``rango`` and trim surrounding whitespace.

    Parameters
    ----------
    rango : str
        Range label such as ``">=10"`` or ``"0.005-1"``.

    Returns
    -------
    str
        The label without the comparison markers.
    """
    # '>=' goes first so its '=' does not survive the removal of the bare '>'
    sin_mayor_igual = rango.replace('>=', '')
    sin_mayor = sin_mayor_igual.replace('>', '')
    return sin_mayor.strip()

# Strip the '>=' / '>' markers from 'Density Range'; the column is overwritten in place
df_chi['Density Range'] = df_chi['Density Range'].apply(eliminar_menor_igual)

In [9]:
# Turn a range label into a single number: the midpoint of its two bounds
def calcular_densidad_central(rango):
    """Return the midpoint of a ``"min-max"`` label, or the number itself if there is no dash.

    Parameters
    ----------
    rango : str
        Either a single number (``"10"``) or two numbers joined by ``-``
        (``"0.005-1"``). Spaces inside a range are ignored.

    Returns
    -------
    float
        ``(min + max) / 2`` for a range, otherwise ``float(rango)``.

    Raises
    ------
    ValueError
        If a part is not a valid float or the label does not split into
        exactly two parts.
    """
    if '-' in rango:
        # Range: drop spaces, split on the dash and average both bounds
        partes = rango.replace(' ', '').split('-')
        min_val, max_val = (float(parte) for parte in partes)
        return (min_val + max_val) / 2

    # No dash: the label is a single number
    return float(rango.strip())

# Apply the function to 'Density Range' and store the result in the new 'Density Center' column
df_chi['Density Center'] = df_chi['Density Range'].apply(calcular_densidad_central)

In [10]:
df_chi

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Range,Concentration Class,Sampling Method,Short Reference,Long Reference,DOI,Organization,NCEI Accession Number,NCEI Accession Link,Density Center
0,9376,14/02/2015,10.0888,99.8253,Pacific Ocean,Gulf of Thailand,South China Sea,0.004000,pieces/m3,0.0005-0.005,Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00275
1,9395,04/03/2015,7.1221,99.4243,Pacific Ocean,Malacca Strait,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
2,9757,14/03/2016,3.5438,100.2277,Pacific Ocean,Malacca Strait,South China Sea,0.077000,pieces/m3,0.005-1,Medium,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.50250
3,9784,28/12/2015,1.7450,110.4930,Pacific Ocean,South China Sea,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
4,9785,29/12/2015,1.8100,110.3250,Pacific Ocean,South China Sea,South China Sea,0.000000,pieces/m3,0-0.0005,Very Low,Grab sample,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,211009,https://www.ncei.noaa.gov/access/metadata/land...,0.00025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,21672,11/10/2017,21.7495,108.5672,Pacific Ocean,South China Sea,South China Sea,9.533.333.333,pieces/m3,10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...,10.00000
178,21673,11/10/2017,21.7835,108.5466,Pacific Ocean,South China Sea,South China Sea,2.266.666.667,pieces/m3,10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...,10.00000
179,21674,11/10/2017,21.7968,108.5517,Pacific Ocean,South China Sea,South China Sea,1.600.000.000,pieces/m3,10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...,10.00000
180,21675,11/10/2017,21.8168,108.5710,Pacific Ocean,South China Sea,South China Sea,3.400.000.000,pieces/m3,10,Very High,stainless-steel sampler,Zhu et al. 2019,"Zhu, J., Q. Zhang, Y. Li, S. Tan, Z. Kang, X. ...",https://doi.org/10.1016/j.scitotenv.2018.12.192,"Beibu Gulf University, China",280675,https://www.ncei.noaa.gov/access/metadata/land...,10.00000


In [11]:
# Keep only the listed columns, in this order.
# NOTE(review): 'Density Center' is not in this list, so the column computed in
# In [9] is discarded here - confirm that is intended.
df_chi = df_chi.loc[:, [
    'OBJECTID',
    'Date',
    'Latitude',
    'Longitude',
    'Oceans',
    'Regions',
    'Location',
    'Microplastics Measurement (density)',
    'Unit',
    'Density Range',
    'Concentration Class',
    'Sampling Method',
    'Organization',
]]

In [None]:
# Previous export location, kept for reference:
# df_chi.to_csv('../files/transition_files/Microplastics_por_zonas/South_China_Seacolum.csv', index=False)

# Export the frame without its index.
# - The statement starts at column 0: a top-level line with leading spaces
#   raises IndentationError.
# - Forward slashes replace the backslash: '\c' is an invalid escape sequence
#   and a backslash is only a path separator on Windows, whereas '/' works on
#   every platform.
# NOTE(review): the input is read from '../files/...' while this path is
# relative to the working directory - confirm the target folder.
df_chi.to_csv('files/clean_files(galapa, china, granreef)/South_China_Seacolum.csv', index=False)
