# Procesos de unión CSV --> Para gestión conjunta 🙌🏻

## Importación de librerías y datos 📁

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np


In [None]:
df_marine = pd.read_csv("../files/transition_files/Marine_microplastics/01_Marine_Microplastics_all_columns_EDA.csv")
df_marine.head()

Unnamed: 0,OBJECTID,Oceans,Regions,Sampling Method,Measurement,Unit,Density Range,Density Class,Short Reference,Long Reference,DOI,Organization,Keywords,Accession Number,Accession Link,Latitude,Longitude,Date,GlobalID,Density_Center
0,9676,Atlantic Ocean,south atlantic ocean,Grab sample,0.018,pieces/m3,0.005-1,Medium,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-31.696,-48.56,8/11/2015 12:00:00 AM,a77121b2-e113-444e-82d9-7af11d62fdd2,0.5025
1,6427,Pacific Ocean,north pacific ocean,Neuston net,0.0,pieces/m3,0-0.0005,Very Low,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",https://doi.org/10.1021/es4053076,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,6.35,-121.85,12/18/2002 12:00:00 AM,be27c450-02ca-4261-8d89-cae21108e6cc,0.00025
2,10672,Pacific Ocean,north pacific ocean,Manta net,0.013,pieces/m3,0.005-1,Medium,Goldstein et al.2013,"Goldstein, M.C., A.J. Titmus, M. Ford. 2013. S...",https://doi.org/10.1371/journal.pone.0080020,Scripps Institution of Oceanography-University...,Great Pacific Garbage Patch/SEAPLEX,253448,https://www.ncei.noaa.gov/access/metadata/land...,0.5,-95.35,10/17/2006 12:00:00 AM,23effcdd-35b7-4e1e-adb4-390693a287d3,0.5025
3,13921,Atlantic Ocean,north atlantic ocean,Aluminum bucket,1368.0,pieces/m3,10,Very High,Queiroz et al.2022,"Queiroz, A.F.dos S., A.S. da Conceição, D. Che...",https://doi.org/10.1016/j.scitotenv.2022.156259,"Federal University of Pará, Brazil",Amazon Continental Shelf,276482,https://www.ncei.noaa.gov/access/metadata/land...,0.631825,-45.398158,10/17/2018 12:00:00 AM,16d77822-0533-4116-97b9-0bdb592f3d6e,10.0
4,9344,Pacific Ocean,north pacific ocean,Grab sample,0.001,pieces/m3,0.0005-0.005,Low,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",https://doi.org/10.1016/j.envpol.2018.02.062,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,16.623,-99.6978,1/3/2015 12:00:00 AM,b9e435e3-9e86-4143-8b51-877e5dcdc7a6,0.00275


## Modificaciones previas ⚙️

In [4]:
# Vamos a crear la columna Location

df_marine['Location']=None

In [5]:
df_marine.shape

(20425, 21)

In [6]:
# Rellenamos [Location] con las Regiones:

# Localizaciones: Red Sea, Great Barrier Reef, Caribbean Sea, Galápagos, South China Sea, Maldives,Hawaiian Islands

# Red Sea: Red Sea
# Caribbean Sea:
# Maldives:
# Hawaiian Islands:
# Great Barrier Reef, South China Sea, y Galápagos (EN LOS OTROS CSV)


In [7]:
region_to_location = {
    "red sea":"red sea",
    "caribbean sea":[
"gulf of mexico",
"coastal waters of florida",
"florida keys national marine sanctuary",
"new york bight",
"gulf of st. lawrence",
"stellwagen bank national marine sanctuary",
"greater farallones national marine sanctuary",
"channel islands national marine sanctuary",
"monterey bay National marine sanctuary",
"gulf of california","caribbean sea"],
"galápagos":"south pacific ocean",
"maldives":"indian ocean",
"hawaiian islands":
["papahānaumokuākea marine national monument",
"hawaiian islands humpback whale national marine sanctuary"]}

In [8]:
# Creamos un nuevo diccionario vacío
region_to_location_expanded = {}

# Recorremos el diccionario original SIN modificarlo
for main_location, regions in region_to_location.items():
    if isinstance(regions, list):  # Si es una lista
        for r in regions:
            region_to_location_expanded[r.lower()] = main_location
    else:  # Si es solo un string
        region_to_location_expanded[regions.lower()] = main_location

# Ahora ya podemos mapear en el DataFrame
df_marine['Location'] = df_marine['Regions'].str.lower().map(region_to_location_expanded).fillna('Other')


In [9]:
df_marine['Location'].unique()

array(['Other', 'caribbean sea', 'galápagos', 'maldives', 'red sea',
       'hawaiian islands'], dtype=object)

In [10]:
df_marine = df_marine.rename(columns={'Measurement': 'Microplastics Measurement (density)'})

In [11]:
df_marine = df_marine.rename(columns={'Density Class':'Concentration Class'})

In [12]:
# Separar la columna 'Date' en fecha y hora por el espacio
# En caso de que alguna fila no tenga espacio, la función llenará con NaN
df_marine[['Date_Only', 'Time_Only']] = df_marine['Date'].str.split(' ', expand=True, n=1)

# Verificar el resultado
print(df_marine[['Date_Only', 'Time_Only']].head())

    Date_Only    Time_Only
0   8/11/2015  12:00:00 AM
1  12/18/2002  12:00:00 AM
2  10/17/2006  12:00:00 AM
3  10/17/2018  12:00:00 AM
4    1/3/2015  12:00:00 AM


In [13]:
df_marine = df_marine.drop(columns=["Time_Only", "Date"])

In [14]:
df_marine

Unnamed: 0,OBJECTID,Oceans,Regions,Sampling Method,Microplastics Measurement (density),Unit,Density Range,Concentration Class,Short Reference,Long Reference,...,Organization,Keywords,Accession Number,Accession Link,Latitude,Longitude,GlobalID,Density_Center,Location,Date_Only
0,9676,Atlantic Ocean,south atlantic ocean,Grab sample,0.0180,pieces/m3,0.005-1,Medium,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",...,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,-31.696000,-48.560000,a77121b2-e113-444e-82d9-7af11d62fdd2,0.50250,Other,8/11/2015
1,6427,Pacific Ocean,north pacific ocean,Neuston net,0.0000,pieces/m3,0-0.0005,Very Low,Law et al.2014,"Law, K.L, S.K. Morét-Ferguson, D.S. Goodwin, E...",...,Sea Education Association,SEA,211008,https://www.ncei.noaa.gov/access/metadata/land...,6.350000,-121.850000,be27c450-02ca-4261-8d89-cae21108e6cc,0.00025,Other,12/18/2002
2,10672,Pacific Ocean,north pacific ocean,Manta net,0.0130,pieces/m3,0.005-1,Medium,Goldstein et al.2013,"Goldstein, M.C., A.J. Titmus, M. Ford. 2013. S...",...,Scripps Institution of Oceanography-University...,Great Pacific Garbage Patch/SEAPLEX,253448,https://www.ncei.noaa.gov/access/metadata/land...,0.500000,-95.350000,23effcdd-35b7-4e1e-adb4-390693a287d3,0.50250,Other,10/17/2006
3,13921,Atlantic Ocean,north atlantic ocean,Aluminum bucket,1368.0000,pieces/m3,10,Very High,Queiroz et al.2022,"Queiroz, A.F.dos S., A.S. da Conceição, D. Che...",...,"Federal University of Pará, Brazil",Amazon Continental Shelf,276482,https://www.ncei.noaa.gov/access/metadata/land...,0.631825,-45.398158,16d77822-0533-4116-97b9-0bdb592f3d6e,10.00000,Other,10/17/2018
4,9344,Pacific Ocean,north pacific ocean,Grab sample,0.0010,pieces/m3,0.0005-0.005,Low,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",...,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,16.623000,-99.697800,b9e435e3-9e86-4143-8b51-877e5dcdc7a6,0.00275,Other,1/3/2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20420,9366,Atlantic Ocean,north atlantic ocean,Grab sample,0.0110,pieces/m3,0.005-1,Medium,Barrows et al.2018,"Barrows, A.P.W., S.E. Cathey, C.W. Petersen. 2...",...,Adventure Scientist,Adventure Scientist/Citizen Science,211009,https://www.ncei.noaa.gov/access/metadata/land...,17.100000,-47.550000,fd4db6d6-aed7-48b1-a22b-402557112e2f,0.50250,Other,2/1/2015
20421,4683,Atlantic Ocean,caribbean sea,Neuston net,0.0216,pieces/m3,0.005-1,Medium,Law et al.2010,"Law, K.L., S. Morét-Ferguson, N.A. Maximenko, ...",...,Sea Education Association,SEA,211007,https://www.ncei.noaa.gov/access/metadata/land...,21.390000,-81.960000,83a8208e-2d06-4ad9-bcd9-81e29d57e0aa,0.50250,caribbean sea,3/14/2001
20422,6043,Atlantic Ocean,north atlantic ocean,Neuston net,0.0000,pieces/m3,0-0.0005,Very Low,Law et al.2010,"Law, K.L., S. Morét-Ferguson, N.A. Maximenko, ...",...,Sea Education Association,SEA,211007,https://www.ncei.noaa.gov/access/metadata/land...,24.140000,-81.980000,0fd72d83-cd27-42e1-b63a-b015c23b28d2,0.00025,Other,5/1/2008
20423,20169,Atlantic Ocean,north atlantic ocean,Hand picking,,pieces/10 mins,40-200,High,Tunnell et al. 2020,"Tunnell, Jace W.; Dunning, Kelly H.; Scheef, L...",...,University of Texas Marine Science Institute,Nurdle Patrol,259486,https://www.ncei.noaa.gov/access/metadata/land...,39.924800,-75.127200,fc84069e-c782-4c3d-bf6b-265535969d8e,120.00000,Other,10/28/2021


In [15]:
df_marine = df_marine.rename(columns={'Date_Only': 'Date'})

In [None]:
#df_marine.to_csv('../files/transition_files/Marine_microplastics/01_Marine_Microplastics_all_columns_EDA.csv')

## Unión de CSVs listos 👉🏻👈🏻

In [None]:
df1 = pd.read_csv("../files/transition_files/Microplastics_por_zonas/Galapagoscolum.csv")
df2 = pd.read_csv("../files/transition_files/Microplastics_por_zonas/Great_Barrier_Reefcolum.csv")
df3 = pd.read_csv("../files/transition_files/Microplastics_por_zonas/South_China_Seacolum.csv")
df4 = pd.read_csv("../files/transition_files/Marine_microplastics/01_Marine_Microplastics_all_columns_EDA.csv")

In [19]:
df1.columns = df1.columns.str.strip()
df2.columns = df2.columns.str.strip()
df3.columns = df3.columns.str.strip()
df4.columns = df4.columns.str.strip()


In [20]:
df1=df1[['OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
       'Location', 'Microplastics Measurement (density)', 'Unit',
       'Density Range', 'Concentration Class', 'Sampling Method',
       'Organization']]

df2=df2[['OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
       'Location', 'Microplastics Measurement (density)', 'Unit',
       'Density Range', 'Concentration Class', 'Sampling Method',
       'Organization']]

df3=df3[['OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
       'Location', 'Microplastics Measurement (density)', 'Unit',
       'Density Range', 'Concentration Class', 'Sampling Method',
       'Organization']]

df4=df4[['OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
       'Location', 'Microplastics Measurement (density)', 'Unit',
       'Density Range', 'Concentration Class', 'Sampling Method',
       'Organization']]

In [21]:
df2.columns

Index(['OBJECTID', 'Date', 'Latitude', 'Longitude', 'Oceans', 'Regions',
       'Location', 'Microplastics Measurement (density)', 'Unit',
       'Density Range', 'Concentration Class', 'Sampling Method',
       'Organization'],
      dtype='object')

In [22]:
columnas_finales = df2.columns


In [23]:
df1 = df1.reindex(columns=columnas_finales)
df2 = df2.reindex(columns=columnas_finales)
df3 = df3.reindex(columns=columnas_finales)
df4 = df4.reindex(columns=columnas_finales)

In [24]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)

(24, 13)
(65, 13)
(182, 13)
(20425, 13)


In [25]:
combined_df = pd.concat ([df1,df2, df3, df4], ignore_index=True)

In [None]:
combined_df.to_csv('../files/transition_files/Marine_microplastics/02_Microplastic_marine_withLocation_Date.csv', index=False)

In [None]:
df_unido = pd.read_csv("../files/transition_files/Marine_microplastics/02_Microplastic_marine_withLocation_Date.csv", sep=",")

In [None]:
# Función para corregir el formato de fecha
def corregir_fecha(fecha):
    try:
        # Intentar convertir a formato DD/MM/YYYY
        fecha = pd.to_datetime(fecha, format='%d/%m/%Y', errors='raise')
    except:
        try:
            # Si no se puede, intentar convertir a formato MM/DD/YYYY
            fecha = pd.to_datetime(fecha, format='%m/%d/%Y', errors='raise')
            # Cambiar la fecha de formato MM/DD/YYYY a DD/MM/YYYY
            fecha = fecha.strftime('%d/%m/%Y')
        except:
            fecha = None  # Si no se puede convertir, asignar None
    return fecha

# Aplicar la corrección de fecha a la columna 'Date' de df_llorar
df_unido['Date'] = df_unido['Date'].apply(corregir_fecha)

# Verificar si algunas fechas siguen siendo nulas y ver el error
print(df_unido[df_unido['Date'].isnull()])

# Guardar el archivo CSV con las fechas corregidas (si necesitas guardarlo)
# df_unido.to_csv('files/clean_files/Marine_actualizado.csv', index=False)

Empty DataFrame
Columns: [OBJECTID, Date, Latitude, Longitude, Oceans, Regions, Location, Microplastics Measurement (density), Unit, Density Range, Concentration Class, Sampling Method, Organization]
Index: []


In [None]:
df_unido

Unnamed: 0,OBJECTID,Date,Latitude,Longitude,Oceans,Regions,Location,Microplastics Measurement (density),Unit,Density Range,Concentration Class,Sampling Method,Organization
0,6191,2001-12-04 00:00:00,0.0700,-89.790000,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
1,6192,2001-12-05 00:00:00,-0.5100,-90.080000,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
2,6193,2001-12-08 00:00:00,-0.8700,-90.260000,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
3,6194,2001-12-09 00:00:00,-121,-90.430000,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
4,6195,2001-12-09 00:00:00,-107,-90.840000,Pacific Ocean,Galápagos,Galápagos,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20691,9366,2015-01-02 00:00:00,17.1,-47.550000,Atlantic Ocean,north atlantic ocean,Other,0.011,pieces/m3,0.005-1,Medium,Grab sample,Adventure Scientist
20692,4683,14/03/2001,21.39,-81.960000,Atlantic Ocean,caribbean sea,caribbean sea,0.0216,pieces/m3,0.005-1,Medium,Neuston net,Sea Education Association
20693,6043,2008-01-05 00:00:00,24.14,-81.980000,Atlantic Ocean,north atlantic ocean,Other,0.0,pieces/m3,0-0.0005,Very Low,Neuston net,Sea Education Association
20694,20169,28/10/2021,39.9248,-75.127200,Atlantic Ocean,north atlantic ocean,Other,,pieces/10 mins,40-200,High,Hand picking,University of Texas Marine Science Institute


## Unión final: 🌊+🪸

In [None]:
#Algunas modificaciones previas
df5 = pd.read_csv("../files/clean_files/realistic_ocean_climate_dataset.csv")

In [None]:
df5.columns

Index(['Date', 'Location', 'Latitude', 'Longitude', 'SST (°C)', 'pH Level',
       'Bleaching Severity', 'Species Observed', 'Marine Heatwave'],
      dtype='object')

In [None]:
df5 = df5.rename(columns={'Date': 'Date_coral'})

In [None]:
df5.columns

Index(['Date_coral', 'Location', 'Latitude', 'Longitude', 'SST (°C)',
       'pH Level', 'Bleaching Severity', 'Species Observed',
       'Marine Heatwave'],
      dtype='object')

In [None]:
# Cargar los CSV
uni1 = pd.read_csv('../files/transition_files/Marine_microplastics/02_Microplastic_marine_withLocation_Date.csv')
uni2 = pd.read_csv("../files/clean_files/realistic_ocean_climate_dataset.csv")

# Limpiar 'Location': minúsculas y sin espacios extra
uni1['Location'] = uni1['Location'].str.strip().str.lower()
uni2['Location'] = uni2['Location'].str.strip().str.lower()

# Extraer año
uni1['Year'] = pd.to_datetime(uni1['Date'], errors='coerce').dt.year
uni2['Year'] = pd.to_datetime(uni2['Date'], errors='coerce').dt.year

# Hacer el LEFT JOIN usando uni2 como base
df_unido2 = pd.merge(uni2, uni1, on=['Location', 'Year'], how='left')

# Guardar el resultado
#df_unido2.to_csv('files/clean_files/arrecife_mas_info.csv', index=False)

# Mostrar un resumen
print(df_unido2.head())


       Date_x            Location  Latitude_x  Longitude_x  SST (°C)  \
0  2015-01-01             red sea     20.0248      38.4931     29.47   
1  2015-01-07  great barrier reef    -18.2988     147.7782     29.65   
2  2015-01-14       caribbean sea     14.9768     -75.0233     28.86   
3  2015-01-14       caribbean sea     14.9768     -75.0233     28.86   
4  2015-01-14       caribbean sea     14.9768     -75.0233     28.86   

   pH Level Bleaching Severity  Species Observed  Marine Heatwave  Year  ...  \
0     8.107                NaN               106            False  2015  ...   
1     8.004               High               116            False  2015  ...   
2     7.947               High                90            False  2015  ...   
3     7.947               High                90            False  2015  ...   
4     7.947               High                90            False  2015  ...   

   Latitude_y Longitude_y          Oceans               Regions  \
0         NaN      