####                ---  STATISTICAL FUNCTIONS  --- ######

 OBJETIVO: Definir una función que calcule distintos estadísticos de comparación (modelo vs. observación) para reutilizarla en otros códigos


In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats


In [2]:
def _valid_pairs(m, o):
    """Return ``m`` and ``o`` restricted to positions where neither is NaN."""
    ok = ~(np.isnan(m) | np.isnan(o))
    return m[ok], o[ok]


def _linear_fit(m, o):
    """Fit ``m = slope * o + intercept`` using only the NaN-free pairs.

    Returns ``(slope, intercept, r_value, n)`` where ``n`` is the number of
    valid pairs. The first three values are NaN when fewer than two valid
    pairs exist, because a line cannot be fitted in that case.
    """
    m_ok, o_ok = _valid_pairs(m, o)
    n = len(o_ok)
    if n < 2:
        return np.nan, np.nan, np.nan, n
    slope, intercept, r_value, _, _ = stats.linregress(o_ok, m_ok)
    return slope, intercept, r_value, n


def funcion_estadisticas(m, o, type):
    """Compute a single comparison statistic between two paired series.

    Parameters
    ----------
    m : array-like
        First series (the "mod" / modelled values); converted to a float
        array.
    o : array-like
        Second series (the "obs" / observed values); converted to a float
        array.
    type : str
        Name of the statistic to compute (the name shadows the builtin but
        is kept so existing keyword callers keep working). Supported values:

        - ``"bias"``: mean of ``m - o``.
        - ``"rmse"``: root mean square of ``m - o``.
        - ``"nrmse_mm"``, ``"nrmse_sd"``, ``"nrmse_mean"``: RMSE divided by
          the range, standard deviation or mean of ``o``.
        - ``"nmbe_mm"``, ``"nmbe_mean"``: bias divided by the range or mean
          of ``o``.
        - ``"mean_obs"``, ``"mean_mod"``: NaN-ignoring means.
        - ``"long_o"``, ``"long_m"``: lengths of the arrays (NaNs included).
        - ``"r"``: Pearson correlation over the NaN-free pairs.
        - ``"r2_aj"``: adjusted R^2 of the regression of ``m`` on ``o``.
        - ``"sd_mod"``, ``"sd_obs"``: NaN-ignoring standard deviations
          (not rounded).
        - ``"crmsd"``: centred RMS difference divided by the mean of ``o``
          (rounded to 4 decimals).
        - ``"intercep"``, ``"slope"``: regression coefficients of ``m`` on
          ``o`` (not rounded).

    Returns
    -------
    float or int
        The requested statistic, rounded to 5 decimals unless noted above.
        Correlation and regression statistics are NaN when too few valid
        pairs are available.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names.
    """
    m = np.array(m, dtype=float)
    o = np.array(o, dtype=float)

    # Lengths are answered before any element-wise arithmetic so they keep
    # working even if the two series differ in size.
    if type == "long_o":
        return len(o)
    if type == "long_m":
        return len(m)

    if type == "mean_obs":
        return round(np.nanmean(o), 5)
    if type == "mean_mod":
        return round(np.nanmean(m), 5)

    if type == "sd_mod":
        return np.nanstd(m)
    if type == "sd_obs":
        return np.nanstd(o)

    if type in ("bias", "nmbe_mm", "nmbe_mean"):
        # NaN in either series makes the difference NaN, so nanmean already
        # restricts the bias to valid pairs.
        bias = np.nanmean(m - o)
        if type == "nmbe_mm":
            bias = bias / (np.nanmax(o) - np.nanmin(o))
        elif type == "nmbe_mean":
            bias = bias / np.nanmean(o)
        return round(bias, 5)

    if type in ("rmse", "nrmse_mm", "nrmse_sd", "nrmse_mean"):
        rmse = np.sqrt(np.nanmean((m - o) ** 2))
        if type == "nrmse_mm":
            rmse = rmse / (np.nanmax(o) - np.nanmin(o))
        elif type == "nrmse_sd":
            rmse = rmse / np.nanstd(o)
        elif type == "nrmse_mean":
            rmse = rmse / np.nanmean(o)
        return round(rmse, 5)

    if type == "crmsd":
        m_mean = np.nanmean(m)
        o_mean = np.nanmean(o)
        crmsd = np.sqrt(np.nanmean(((m - m_mean) - (o - o_mean)) ** 2)) / o_mean
        return round(crmsd, 4)

    if type == "r":
        # Mask NaNs in BOTH series; masking only one side lets a NaN in the
        # other side turn the whole correlation into NaN.
        m_ok, o_ok = _valid_pairs(m, o)
        if len(o_ok) < 2:
            return np.nan
        r, _ = stats.pearsonr(m_ok, o_ok)
        return round(r, 5)

    if type == "r2_aj":
        _, _, r_value, n = _linear_fit(m, o)
        # The adjustment divides by (n - 2); it is undefined below 3 pairs.
        if n < 3:
            return np.nan
        r2 = r_value ** 2
        r2_adj = 1 - (1 - r2) * (n - 1) / (n - 2)
        return round(r2_adj, 5)

    if type == "intercep":
        _, intercept, _, _ = _linear_fit(m, o)
        return intercept

    if type == "slope":
        slope, _, _, _ = _linear_fit(m, o)
        return slope

    # Previously an unknown name fell through and returned None silently.
    raise ValueError(f"Unknown statistic {type!r}")


In [None]:
def tabla_estadisticas(m, o):
    """Collect a fixed set of comparison statistics into a one-row table.

    Parameters
    ----------
    m : array-like
        Series passed as the first argument of ``funcion_estadisticas``.
    o : array-like
        Series passed as the second argument of ``funcion_estadisticas``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per statistic, in the order
        listed below.
    """
    # (output column label, statistic key understood by funcion_estadisticas)
    columnas = (
        ("mean modelado", "mean_mod"),
        ("mean obs", "mean_obs"),
        ("len obs", "long_o"),
        ("len mod", "long_m"),
        ("rmse", "rmse"),
        ("nrmse mean", "nrmse_mean"),
        ("bias", "bias"),
        ("nbias mean", "nmbe_mean"),
        ("r", "r"),
        ("R^2 aj", "r2_aj"),
        ("sd_mod", "sd_mod"),
        ("sd_obs", "sd_obs"),
        ("crmsd", "crmsd"),
        ("intercep", "intercep"),
        ("slope", "slope"),
    )

    fila = {}
    for etiqueta, clave in columnas:
        fila[etiqueta] = funcion_estadisticas(m, o, clave)

    return pd.DataFrame([fila])


In [None]:
def funcion_estadisticas_carpeta(dire, col_mod="AOD_550_maiac", col_obs="AOD_550_AER_mean"):
    """Compute the statistics table for every CSV file in a folder.

    Parameters
    ----------
    dire : str
        Folder to scan; only entries whose name ends in ``.csv`` are read.
    col_mod : str, optional
        Column passed to ``tabla_estadisticas`` as ``m``. Defaults to the
        column name this function previously hard-coded.
    col_obs : str, optional
        Column passed to ``tabla_estadisticas`` as ``o``. Defaults to the
        column name this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file, with the file name in a leading ``name``
        column. An empty frame is returned when the folder has no CSV files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    files = sorted(f for f in os.listdir(dire) if f.endswith(".csv"))

    # pd.concat raises on an empty list, so answer the empty case explicitly.
    if not files:
        return pd.DataFrame()

    tablas = []
    for i, file in enumerate(files, start=1):
        print(i)  # progress counter
        data = pd.read_csv(os.path.join(dire, file))

        info = tabla_estadisticas(m=data[col_mod], o=data[col_obs])

        info.insert(0, "name", file)
        tablas.append(info)

    return pd.concat(tablas, ignore_index=True)


In [None]:
# Region whose merged folders are processed.
region = "USA"

# Common parent folder of the per-buffer subfolders for this region.
carpeta_base = f"D:/Josefina/paper_git/paper_maiac/datasets/V03/processed/merge_AER-MAIAC/{region}_C61/dia"

# One input folder per buffer label; each path ends with a trailing slash.
direcciones = {
    etiqueta: f"{carpeta_base}/{etiqueta}/"
    for etiqueta in ("1km", "3km", "5km", "15km", "25km")
}

# Statistics for every folder, stacked into a single table.
df_tot = pd.concat(
    list(map(funcion_estadisticas_carpeta, direcciones.values())),
    ignore_index=True,
)

# Metadata columns taken from fixed character positions of the file name.
# NOTE(review): the slices assume a fixed file-name layout - confirm against
# the actual names in the folders.
df_tot["ciudad"] = df_tot["name"].str.slice(2, 4)
df_tot["buffer"] = df_tot["name"].str.slice(6, 9)
df_tot["temp"] = df_tot["name"].str.slice(17, 19)


In [None]:
# Write the combined table next to the inputs. The path is built from
# `region` so that changing the region does not overwrite the USA output
# with another region's statistics.
df_tot.to_csv(
    f"D:/Josefina/paper_git/paper_maiac/datasets/V03/processed/merge_AER-MAIAC/{region}_C61/estadisticas_{region}-MAIAC_C61-AER-V03-dia.csv",
    index=False
)
