Where we convert the raw data into training and validation sets.
Manila Observatory will be the "test" set.

`aerosol_type` is the response variable

In [12]:
import pandas as pd
import numpy as np
import os

In [2]:
REF_CLUSTER_DIR = "raw_data/reference_sites/"

site2type = {
    "Solar_Village" : "MD",
    "Beijing" : "PD",
    "Mongu" : "BBD",
    "Alta_Floresta": "BBW",
    "GSFC" : "UI",
    "Chen-Kung_Univ" : "UID",
}

site_min_month = {
    "Solar_Village" : 3,
    "Beijing" : 1,
    "Mongu" : 8,
    "Alta_Floresta": 8,
    "GSFC" : 6,
    "Chen-Kung_Univ" : 1,
}

site_max_month = {
    "Solar_Village" : 7,
    "Beijing" : 12,
    "Mongu" : 11,
    "Alta_Floresta": 10,
    "GSFC" : 9,
    "Chen-Kung_Univ" : 12,
}



cols_to_keep = [
    'Site',
    'Date(dd:mm:yyyy)',
    'Angstrom_Exponent_440-870nm_from_Coincident_Input_AOD',
    'Extinction_Angstrom_Exponent_440-870nm-Total',
    'Single_Scattering_Albedo[440nm]',
    'Single_Scattering_Albedo[675nm]',
    'Single_Scattering_Albedo[870nm]',
    'Single_Scattering_Albedo[1020nm]',
    'Absorption_Angstrom_Exponent_440-870nm',
    'Refractive_Index-Real_Part[440nm]',
    'Refractive_Index-Real_Part[675nm]',
    'Refractive_Index-Real_Part[870nm]',
    'Refractive_Index-Real_Part[1020nm]',
    'Refractive_Index-Imaginary_Part[440nm]',
    'Refractive_Index-Imaginary_Part[675nm]',
    'Refractive_Index-Imaginary_Part[870nm]',
    'Refractive_Index-Imaginary_Part[1020nm]',
    'Asymmetry_Factor-Total[440nm]',
    'Asymmetry_Factor-Total[675nm]',
    'Asymmetry_Factor-Total[870nm]',
    'Asymmetry_Factor-Total[1020nm]',
    'Asymmetry_Factor-Fine[440nm]',
    'Asymmetry_Factor-Fine[675nm]',
    'Asymmetry_Factor-Fine[870nm]',
    'Asymmetry_Factor-Fine[1020nm]',
    'Asymmetry_Factor-Coarse[440nm]',
    'Asymmetry_Factor-Coarse[675nm]',
    'Asymmetry_Factor-Coarse[870nm]',
    'Asymmetry_Factor-Coarse[1020nm]',
    'Sphericity_Factor(%)',
    'Lidar_Ratio[440nm]',
    'Lidar_Ratio[675nm]',
    'Lidar_Ratio[870nm]',
    'Lidar_Ratio[1020nm]',
    'Depolarization_Ratio[440nm]',
    'Depolarization_Ratio[675nm]',
    'Depolarization_Ratio[870nm]',
    'Depolarization_Ratio[1020nm]',
]

In [3]:
df = pd.concat([
    pd.read_csv(
        REF_CLUSTER_DIR  + f, skiprows=6, na_values=-999
    )[cols_to_keep].assign(
        date = lambda x: pd.to_datetime(x["Date(dd:mm:yyyy)"], format=r"%d:%m:%Y"),
        month = lambda x: x.date.dt.month,
        aerosol_type = lambda x: x.Site.map(site2type),
        start_month = lambda x: x.Site.map(site_min_month),
        end_month = lambda x: x.Site.map(site_max_month),
    ).query("start_month <= month <= end_month")\
    .drop(columns=["Site", "Date(dd:mm:yyyy)", "month", "start_month", "end_month"])
    for f in os.listdir(REF_CLUSTER_DIR)
])

In [4]:
df.aerosol_type.value_counts()

MD     6162
UI     3093
PD     3088
BBD    1984
UID    1550
BBW    1060
Name: aerosol_type, dtype: int64

In [7]:
df["Single_Scattering_Albedo[440nm]"].isna().mean()

0.4823758634941253

# Train-Validation Split

Data must be split before doing imputation in order to avoid data leakage. To keep things simple, we'll stick with a simple 70-30 split.

In [26]:
np.random.seed(2023)
rand_idx = np.random.permutation(df.index)
train_idx = rand_idx[:int(0.7*len(rand_idx))]
valid_idx = rand_idx[int(0.7*len(rand_idx)):]

In [27]:
train_df = df.loc[train_idx]
valid_df = df.loc[valid_idx]

# Test Set (Manila Observatory)

In [9]:
pd.read_csv(
    "./raw_data/20090101_20221231_Manila_Observatory.all", skiprows=6, na_values=-999
)[cols_to_keep].assign(
    date = lambda x: pd.to_datetime(x["Date(dd:mm:yyyy)"], format=r"%d:%m:%Y"),
).drop(columns=["Site", "Date(dd:mm:yyyy)"])

Unnamed: 0,Angstrom_Exponent_440-870nm_from_Coincident_Input_AOD,Extinction_Angstrom_Exponent_440-870nm-Total,Single_Scattering_Albedo[440nm],Single_Scattering_Albedo[675nm],Single_Scattering_Albedo[870nm],Single_Scattering_Albedo[1020nm],Absorption_Angstrom_Exponent_440-870nm,Refractive_Index-Real_Part[440nm],Refractive_Index-Real_Part[675nm],Refractive_Index-Real_Part[870nm],...,Sphericity_Factor(%),Lidar_Ratio[440nm],Lidar_Ratio[675nm],Lidar_Ratio[870nm],Lidar_Ratio[1020nm],Depolarization_Ratio[440nm],Depolarization_Ratio[675nm],Depolarization_Ratio[870nm],Depolarization_Ratio[1020nm],date
0,1.190813,1.205538,,,,,,,,,...,,,,,,,,,,2009-01-29
1,1.226127,1.243955,,,,,,,,,...,,,,,,,,,,2009-01-29
2,0.718492,0.730107,,,,,,,,,...,,,,,,,,,,2009-02-02
3,0.789545,0.820188,,,,,,,,,...,,,,,,,,,,2009-02-02
4,0.969575,0.979558,0.8107,0.8243,0.8162,0.8114,1.035931,1.4987,1.5113,1.5349,...,,84.974,51.694,43.923,45.294,0.022468,0.030086,0.030258,0.023158,2009-02-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,1.783046,1.798098,0.9526,0.9384,0.9232,0.9128,1.098563,1.3689,1.3795,1.3822,...,,92.095,61.056,46.398,39.482,0.002604,0.010199,0.010238,0.002815,2021-05-30
1599,1.747102,1.748290,0.9471,0.9453,0.9359,0.9267,1.488420,1.3904,1.3966,1.4019,...,,95.060,68.715,51.525,42.424,0.002912,0.009879,0.009690,0.002039,2021-08-13
1600,1.323658,1.335902,,,,,,,,,...,,,,,,,,,,2021-08-16
1601,1.347535,1.362961,,,,,,,,,...,,,,,,,,,,2021-08-16


In [10]:
df

Unnamed: 0,Angstrom_Exponent_440-870nm_from_Coincident_Input_AOD,Extinction_Angstrom_Exponent_440-870nm-Total,Single_Scattering_Albedo[440nm],Single_Scattering_Albedo[675nm],Single_Scattering_Albedo[870nm],Single_Scattering_Albedo[1020nm],Absorption_Angstrom_Exponent_440-870nm,Refractive_Index-Real_Part[440nm],Refractive_Index-Real_Part[675nm],Refractive_Index-Real_Part[870nm],...,Lidar_Ratio[440nm],Lidar_Ratio[675nm],Lidar_Ratio[870nm],Lidar_Ratio[1020nm],Depolarization_Ratio[440nm],Depolarization_Ratio[675nm],Depolarization_Ratio[870nm],Depolarization_Ratio[1020nm],date,aerosol_type
2,1.747441,1.733596,0.9519,0.9456,0.9312,0.9227,1.232544,1.3749,1.3931,1.4036,...,80.851,48.047,37.443,33.702,0.002826,0.010646,0.010374,0.003461,1993-06-17,UI
3,1.716517,1.713547,0.9564,0.9454,0.9283,0.9180,1.004454,1.4716,1.4357,1.4236,...,64.663,68.423,57.520,49.716,0.002544,0.009993,0.010518,0.003286,1993-06-22,UI
4,1.099792,1.099257,,,,,,,,,...,,,,,,,,,1993-06-23,UI
5,0.775823,0.754045,,,,,,,,,...,,,,,,,,,1993-06-23,UI
6,1.192937,1.183126,,,,,,,,,...,,,,,,,,,1993-06-24,UI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,1.386883,1.390236,0.9920,0.9914,0.9905,0.9898,1.157125,1.4621,1.4564,1.4554,...,70.602,59.035,49.682,46.283,0.095438,0.076798,0.071499,0.064686,2014-12-14,UID
1546,1.411037,1.413937,0.9882,0.9860,0.9837,0.9821,0.947446,1.4463,1.4435,1.4452,...,71.111,60.703,50.261,45.458,0.059294,0.050430,0.048061,0.041710,2014-12-14,UID
1547,1.425489,1.431481,0.9588,0.9450,0.9342,0.9314,0.743527,1.4851,1.4792,1.4812,...,63.575,54.004,43.119,39.476,0.028433,0.029694,0.029841,0.025043,2014-12-22,UID
1548,1.378596,1.381993,0.9501,0.9404,0.9265,0.9249,0.827961,1.4886,1.4834,1.4838,...,81.988,62.130,49.840,45.440,0.094746,0.076861,0.077290,0.078949,2014-12-22,UID
