# **Bajada de datos de AERONET**

In [14]:
import subprocess
import pandas as pd

# Defino los argumentos que toma el script aeraod.py
args = {'site_name'    : "CEILAP-BA", # ie. "CEILAP-BA"   # Specify the site/station name
        'date_initial' : "2020-09-08", # ie. "2022-09-08"  # Specify the start date in 'YYYY-MM-DD' format
        'date_final'   : "2022-09-08", # ie. "2022-09-08"  # Specify the end date in 'YYYY-MM-DD' format
        'data_type'    : "Aerosol Optical Depth (AOD) with Precipitable Water and Angstrom Parameter", # ie. "Aerosol Optical Depth (AOD) with Precipitable Water and Angstrom Parameter" or "Total Optical Depth based on AOD Level*"
        'data_format'  : "All points", # ie. "All points"  # Choose the data format: 'All points', 'daily averages', or 'monthly averages'
        'aod_level'    : "15", # ie. "10"          #  Choose the AOD level: '10' for 1.0, '15' for 1.5, or '20' for 2.0
        'download_folder' : "/content/"  # Specify the folder to save downloaded data
         }

# Utilizando el modulo subprocess, corro el .py con los argumentos definidos arriba.
# Es como correrlo desde terminal (!python aeraod.py arg1 arg2 arg3 ...)
result = subprocess.run(["python", "/content/aeraod.py",
                args['site_name'], args['date_initial'],
                args['date_final'], args['data_type'],
                args['data_format'], args['aod_level'],
                args['download_folder']],
                capture_output=True, text=True, # Esto se le agrega para ver lo que pasa mientras corre
                )

# Para poder ver si hay errores en la ejecución
print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)
print("Return code:", result.returncode)
print("\n ---------------------")

# Abro y leo las primeras lineas del archivo descargado:
# Armo el nombre del archivo como lo arman en aeraod.py
name_txt = f"{args['site_name']}{args['date_initial']}_to{args['date_final']}AOD{args['aod_level']}{args['data_format'].replace(' ', '_')}.txt"
# Le agrego la ruta de la carpeta donde se descargó (definida en args)
file = f"{args['download_folder']}{name_txt}"

# Leo las primeras 15 lineas del archivo (Para verlo todo, se puede hacer doble click en el archivo)
print('Archivo descargado: \n')
with open(file, "r", encoding="utf-8", errors="replace") as f:
    for i in range(15):
        print(f.readline().strip())

STDOUT:
 
Downloading data...

Downloaded data to: /content/CEILAP-BA_2020-09-08_to_2022-09-08_AOD15_All_points.txt

STDERR:
 
Return code: 0

 ---------------------
Archivo descargado: 



FileNotFoundError: [Errno 2] No such file or directory: '/content/CEILAP-BA2020-09-08_to2022-09-08AOD15All_points.txt'

In [None]:
# Otra forma de cargar como modulo la funcion dentro de aeraod.py
import sys
sys.path.append("/content/")  # ruta donde este aeraod.py
import aeraod

aeraod.download_aeronet_aod(
    site="CEILAP-BA",
    start_date="2022-09-08",
    end_date="2022-09-08",
    data_type="Aerosol Optical Depth (AOD) with Precipitable Water and Angstrom Parameter",
    data_format="All points",
    aod_level="10",
    download_folder="/content/data_down/"
)

## **Limpieza de datos**

In [15]:
# Leer el archivo
import os
import numpy as np

os.chdir("/content/")
files = os.listdir()
file = np.where([file.endswith(".txt") for file in files])[0][0]
#print(f"{files[file]}")

file_path = "/content/" + files[file]
df = pd.read_csv(file_path, skiprows=5,encoding='utf-8' )  # salteamos las primeras filas de metadata


In [16]:
## Procesar columna Date(dd:mm:yyyy)
df[['dd', 'mm', 'yyyy']] = df['Date(dd:mm:yyyy)'].str.split(":", expand=True).astype(int)
# Insertarlas en posiciones 1, 2, 3
df.insert(1, 'dd', df.pop('dd'))
df.insert(2, 'mm', df.pop('mm'))
df.insert(3, 'yyyy', df.pop('yyyy'))
# Borrar columna Date
df = df.drop(columns=['Date(dd:mm:yyyy)'])

In [None]:
## reemplazo los -999 por Nan y elimino las columnas cuyos datos son todos NaN
cols_to_drop = []
cols_with_nan = {}

for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:  # sólo columnas numéricas
        vals = df[col].unique()
        if np.all((vals == -999) | (vals == -999.000000)):
            cols_to_drop.append(col)
        else:
            mask = (df[col] == -999) | (df[col] == -999.000000)
            if mask.sum() > 0:
                df.loc[mask, col] = np.nan
                cols_with_nan[col] = mask.sum()

# Eliminar columnas que son solo -999
df.drop(columns=cols_to_drop, inplace=True)

# Mostrar resumen de columnas afectadas
if cols_with_nan:
    print("Columnas con reemplazo de -999/NaN:")
    for col, count in cols_with_nan.items():
        print(f"- {col}: {count} valores reemplazados por NaN")

if cols_to_drop:
    print("\nColumnas eliminadas por ser solo -999:")
    print(cols_to_drop)



Columnas con reemplazo de -999/NaN:
- AOD_1640nm: 16686 valores reemplazados por NaN
- AOD_1020nm: 65 valores reemplazados por NaN
- AOD_870nm: 34 valores reemplazados por NaN
- AOD_675nm: 30 valores reemplazados por NaN
- AOD_500nm: 27 valores reemplazados por NaN
- AOD_440nm: 33 valores reemplazados por NaN
- AOD_380nm: 88 valores reemplazados por NaN
- AOD_340nm: 318 valores reemplazados por NaN
- Precipitable_Water(cm): 663 valores reemplazados por NaN
- Triplet_Variability_1640: 16686 valores reemplazados por NaN
- Triplet_Variability_1020: 66 valores reemplazados por NaN
- Triplet_Variability_870: 35 valores reemplazados por NaN
- Triplet_Variability_675: 31 valores reemplazados por NaN
- Triplet_Variability_500: 28 valores reemplazados por NaN
- Triplet_Variability_440: 34 valores reemplazados por NaN
- Triplet_Variability_380: 89 valores reemplazados por NaN
- Triplet_Variability_340: 319 valores reemplazados por NaN
- Triplet_Variability_Precipitable_Water(cm): 664 valores ree

In [17]:
## Muestra las primeras 5 filas
print("\nPrimeras 10 filas del DataFrame limpio:")
display(df[((df.dd==15) & (df.mm==5))])

## Guarda csv
df.to_csv("/content/data_limpia.csv", index=False)


Primeras 10 filas del DataFrame limpio:


Unnamed: 0,AERONET_Site,dd,mm,yyyy,Time(hh:mm:ss),Day_of_Year,Day_of_Year(Fraction),AOD_1640nm,AOD_1020nm,AOD_870nm,...,Exact_Wavelengths_of_AOD(um)_380nm,Exact_Wavelengths_of_AOD(um)_340nm,Exact_Wavelengths_of_PW(um)_935nm,Exact_Wavelengths_of_AOD(um)_681nm,Exact_Wavelengths_of_AOD(um)_709nm,Exact_Wavelengths_of_AOD(um)_Empty,Exact_Wavelengths_of_AOD(um)_Empty.1,Exact_Wavelengths_of_AOD(um)_Empty.2,Exact_Wavelengths_of_AOD(um)_Empty.3,Exact_Wavelengths_of_AOD(um)_Empty.4
16190,CEILAP-BA,15,5,2021,12:51:20,135,135.535648,-999.000000,0.296064,0.296506,...,0.3797,0.3403,-999.0000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
16191,CEILAP-BA,15,5,2021,15:09:59,135,135.631933,-999.000000,0.571373,0.576501,...,0.3797,0.3403,0.9363,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36262,CEILAP-BA,15,5,2022,11:58:06,135,135.498681,0.044595,0.057462,0.053403,...,-999.0000,-999.0000,-999.0000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36263,CEILAP-BA,15,5,2022,12:01:07,135,135.500775,0.007735,0.015588,0.017940,...,0.3791,-999.0000,0.9377,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36264,CEILAP-BA,15,5,2022,12:05:43,135,135.503970,0.011870,0.020710,0.019436,...,-999.0000,-999.0000,-999.0000,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36336,CEILAP-BA,15,5,2022,19:54:12,135,135.829306,0.038510,0.053374,0.045199,...,0.3791,0.3410,0.9377,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36337,CEILAP-BA,15,5,2022,20:00:19,135,135.833553,0.028500,0.037656,0.035104,...,0.3791,0.3410,0.9377,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36338,CEILAP-BA,15,5,2022,20:05:16,135,135.836991,0.032583,0.041566,0.039316,...,0.3791,0.3410,0.9377,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
36339,CEILAP-BA,15,5,2022,20:09:26,135,135.839884,0.031855,0.038361,0.038741,...,0.3791,0.3410,0.9377,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [18]:
## Calcula promedios
# Convertir fecha y hora
df['datetime'] = pd.to_datetime(df[['yyyy','mm','dd']].astype(str).agg('-'.join, axis=1) + ' ' + df['Time(hh:mm:ss)'])

# Promedios horarios
df_hourly = df.set_index('datetime').resample('H').mean(numeric_only=True)

# Promedios diarios
df_daily = df.set_index('datetime').resample('D').mean(numeric_only=True)

# Promedios mensuales
df_monthly = df.set_index('datetime').resample('M').mean(numeric_only=True)

# Guardar cada DataFrame
df_hourly.to_csv("/content/promedios_horarios.csv")
df_daily.to_csv("/content/promedios_diarios.csv")
df_monthly.to_csv("/content/promedios_mensuales.csv")

  df_hourly = df.set_index('datetime').resample('H').mean(numeric_only=True)
  df_monthly = df.set_index('datetime').resample('M').mean(numeric_only=True)


In [19]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Crear listas de opciones
columnas_numericas = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
tipos_grafico = ["Línea", "Dispersión", "Histograma", "Boxplot"]

# Widgets interactivos
grafico_dropdown = widgets.Dropdown(options=tipos_grafico, description="Gráfico:")
x_dropdown = widgets.Dropdown(options=columnas_numericas, description="X (indep):")
y_dropdown = widgets.Dropdown(options=columnas_numericas, description="Y (dep):")

# Función para graficar según selección
def graficar(grafico, x_var, y_var):
    plt.figure(figsize=(10,5))

    if grafico == "Línea":
        plt.plot(df[x_var], df[y_var], marker='o')
        plt.xlabel(x_var)
        plt.ylabel(y_var)
        plt.title(f"{y_var} vs {x_var} (Línea)")

    elif grafico == "Dispersión":
        plt.scatter(df[x_var], df[y_var])
        plt.xlabel(x_var)
        plt.ylabel(y_var)
        plt.title(f"{y_var} vs {x_var} (Dispersión)")

    elif grafico == "Histograma":
        plt.hist(df[y_var].dropna(), bins=30, edgecolor="black")
        plt.xlabel(y_var)
        plt.ylabel("Frecuencia")
        plt.title(f"Histograma de {y_var}")

    elif grafico == "Boxplot":
      ## grupos = df.groupby(var_y)[x_var].apply(list)  # lista de datos por grupo
        ##plt.boxplot(grupos, labels=grupos.index, vert=True)

        plt.boxplot(df[x_var].dropna(), vert=True)
        plt.xlabel(x_var)
        plt.title(f"Boxplot de {x_var}")

    plt.grid(True, alpha=0.3)
    plt.show()

# Interfaz interactiva
out = widgets.interactive_output(
    graficar,
    {"grafico": grafico_dropdown, "x_var": x_dropdown, "y_var": y_dropdown}
)

display(grafico_dropdown, x_dropdown, y_dropdown, out)

Dropdown(description='Gráfico:', options=('Línea', 'Dispersión', 'Histograma', 'Boxplot'), value='Línea')

Dropdown(description='X (indep):', options=('dd', 'mm', 'yyyy', 'Day_of_Year', 'Day_of_Year(Fraction)', 'AOD_1…

Dropdown(description='Y (dep):', options=('dd', 'mm', 'yyyy', 'Day_of_Year', 'Day_of_Year(Fraction)', 'AOD_164…

Output()

In [20]:
## Grafica
# Convertir fecha y hora
df['datetime'] = pd.to_datetime(df[['yyyy','mm','dd']].astype(str).agg('-'.join, axis=1) + ' ' + df['Time(hh:mm:ss)'])

# Promedios horarios
df_hourly = df.set_index('datetime').resample('H').mean(numeric_only=True)

# Promedios diarios
df_daily = df.set_index('datetime').resample('D').mean(numeric_only=True)

# Promedios mensuales
df_monthly = df.set_index('datetime').resample('M').mean(numeric_only=True)

# Guardar cada DataFrame
df_hourly.to_csv("/content/promedios_horarios.csv")
df_daily.to_csv("/content/promedios_diarios.csv")
df_monthly.to_csv("/content/promedios_mensuales.csv")

  df_hourly = df.set_index('datetime').resample('H').mean(numeric_only=True)
  df_monthly = df.set_index('datetime').resample('M').mean(numeric_only=True)
