In [4]:
# These libraries are available in the Python 3 kernel
import pandas as pd
import numpy as np
#import dill

# These libraries are not available in the Python 3 kernel but are in py-gordon_2_0_0
#import matplotlib.pyplot as plt
#import cartopy.crs as ccrs
#import sklearn as skl
#from sklearn.model_selection import train_test_split
#from sklearn.gaussian_process import GaussianProcessRegressor

# Collect simulation data sets

## Lists of parameters

In [5]:
variable_names = {
        "dust" : "atmosphere_optical_thickness_due_to_dust_ambient_aerosol",
        "soluble_aitken_mode" : "atmosphere_optical_thickness_due_to_soluble_aitken_mode_ambient_aerosol",
        "soluble_accumulation_mode" : "atmosphere_optical_thickness_due_to_soluble_accumulation_mode_ambient_aerosol",
        "soluble_coarse_mode" : "atmosphere_optical_thickness_due_to_soluble_coarse_mode_ambient_aerosol",
        "insoluble_aitken_mode" : "atmosphere_optical_thickness_due_to_insoluble_aitken_mode_ambient_aerosol",
        "insoluble_accumulation_mode" : "atmosphere_optical_thickness_due_to_insoluble_accumulation_mode_ambient_aerosol",
        "insoluble_coarse_mode" : "atmosphere_optical_thickness_due_to_insoluble_coarse_mode_ambient_aerosol"
}

dates = ["0" + str(int("0801") + k*int("0001"))
         for k in range(0, 31)] + ["0" + str(int("0901") + k*int("0001"))
                                   for k in range(0, 30)]

times = ['3A', '6A', '9A', '12P', '3P', '6P', '9P', '12A']

ensemble = ["0"*(3-len(str(int("000") + k*int("001")))) + str(int("000") + k*int("001")) for k in range(0, 221)]

# Obtain a comma-separated table of simulation ensemble members' parameter combinations
csv_table_metric = c3.SimulationModelParameters.fetch().objs.toJson()

# Save this table instead as a pandas dataframe (221 ensemble members x 64 parameters)
dfparams = pd.DataFrame(csv_table_metric)
dfparams["member"] = dfparams.id.str.split('_', 3, expand=True)[3].apply(lambda x: x.zfill(3))
dfparams_of_interest = dfparams.iloc[:, 5:65]

## Methods for compiling datasets

In [6]:
def open_sat_sim(date = "0826", member = "000"):
    """
    The variable atmosphere optical depth (AOD) as found in the simulated satellite data sets are indexed by the pseudo 
    level (the wavelength for which the AOD was calculated), time (which 3X-hour mark in the day when the simulated data 
    are meant to occur), latitude, and longitude.
    
    Parameters

    date : Day of 2017 of form "mmdd"
    member : Ensemble member, from 000-220

    Value

    netCDF data frame
    """

    file_name = "azure://monthly-mean-simulations/bs714a.pb2017" + date + ".pp_" + member + ".nc"

    return(c3.NetCDFUtil.openFile(file_name))


def flatten_variable(df, name):
    """
    Helper function for 'select_sim_data'

    Parameters

    df : netCDF data frame
    name : string, the name of a variable 
    
    Value
    
    list, a 1-d vector of elements in the chosen array
    """

    var = df[name][:][2, :, :, :]

    return(np.array(var).flatten())


def select_sim_data(df, member = "000"):
    """
    Unpack the netCDF file as a Pandas dataframe

    Parameters

    df : netCDF data frame
    times : list of strings, the time periods at which the simulated data are meant to occur

    Value

    Pandas DataFrame with columns the variables in df
    """

    # First produce a table of all time-latitude-longitude combinations (in a carefully managed order!) Note: We rotate
    # longitude values by 180 degrees so that they fall between -180 and +180, just as the data are plotted
    lat = df["latitude"][:]
    lon = [x*(x < 180) + (x - 360)*(x >= 180) for x in df["longitude"][:]]

    data = pd.DataFrame()
    data["time"] = [t for t in times for n in range(0, len(lat)*len(lon))]
    data["latitude"] = [l for l in lat for n in range(0, len(lon))]*len(times)
    data["longitude"] = [l for l in lon]*len(times)*len(lat)
    data["member"] = member

    # Enter each aerosol into the data frame by flattening the 4-d arrays from df
    for var in variable_names.items():
        data[var[0]] = flatten_variable(df, var[1])

    data = pd.merge(data,
                    dfparams_of_interest,
                    on="member",
                    how="inner")

    return(data)


def compile_output(ensem_membs=["000"], region=[-180, 180, -90, 90], one_day=1, date='0826', one_time=1, time='9A'):
    """
    Parameters

    num_ensem_mems : integer
        For speed, sometimes only a subset of the ensemble are desired for compiling
    region : list of integers
        The region of the globe whose data is desired
    one_day : integer
        If zero, compile all days available. Else, only obtain data from the given day.
    day : string
        Of the form 'mmdd', the day for which data is desired.
    time : string
        Of the form 'HA' or 'HP', where 'H' is the hour (a multiple of 3) and 'A' or 'P' designates AM or PM

    Value

    Pandas DataFrame
        All of the data satisfying the request specified by the parameters
    """
    data = pd.DataFrame()

    # Decide which ensemble members to compile
    my_members = ensem_membs

    # Decide which days to compile
    if one_day != 0:
        my_days = [date]
    else:
        my_days = dates

    # Decide which days to compile
    if one_time != 0:
        my_times = [time]
    else:
        my_times = times

    # Loop through all of my_members, my_days and append them to the data
    for memb in my_members:
        for day in my_days:
            for tm in my_times:
                # Obtain data frame for given member, day, time combo
                dataset = open_sat_sim(date = day, member = memb)
                sample = select_sim_data(dataset)

                # Subset the sample data frame accordingly
                sample = sample[
                    (sample.time == tm) &
                    (sample.longitude >= region[0]) &
                    (sample.longitude <= region[1]) &
                    (sample.latitude >= region[2]) &
                    (sample.latitude <= region[3])
                ]

                sample["member"] = memb
                sample["date"] = day

                data = data.append(sample)

    return(data)

# Perform GP regression

### Example: Ensemble at (0.625, -29.0625)

In [7]:
# Preferably we don't wait 6+ minutes by accidentally running the following line again unless necessary
aug_26 = compile_output(ensem_membs=ensemble, region=[-30, -29, 0, 1])

In [8]:
print(aug_26)

      time  latitude  longitude member      dust  soluble_aitken_mode  \
69296   9A     0.625   -29.0625    000  0.003138             0.006927   
69296   9A     0.625   -29.0625    001  0.002556             0.005135   
69296   9A     0.625   -29.0625    002  0.002134             0.008307   
69296   9A     0.625   -29.0625    003  0.005573             0.002312   
69296   9A     0.625   -29.0625    004  0.003040             0.046211   
...    ...       ...        ...    ...       ...                  ...   
69296   9A     0.625   -29.0625    216  0.001786             0.009026   
69296   9A     0.625   -29.0625    217  0.004037             0.004525   
69296   9A     0.625   -29.0625    218  0.003354             0.004862   
69296   9A     0.625   -29.0625    219  0.003561             0.003441   
69296   9A     0.625   -29.0625    220  0.002575             0.005384   

       soluble_accumulation_mode  soluble_coarse_mode  insoluble_aitken_mode  \
69296                   0.186872           

In [10]:
aug_26

Unnamed: 0,time,latitude,longitude,member,dust,soluble_aitken_mode,soluble_accumulation_mode,soluble_coarse_mode,insoluble_aitken_mode,insoluble_accumulation_mode,...,bparam,two_d_fsd_factor,c_r_correl,acure_autoconv_exp_lwp,acure_autoconv_exp_nd,dbsdtbs_turb_0,ai,m_ci,a_ent_1_rp,date
69296,9A,0.625,-29.0625,000,0.003138,0.006927,0.186872,0.030580,0.002284,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,001,0.002556,0.005135,0.137844,0.036069,0.003615,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,002,0.002134,0.008307,0.025207,0.015324,0.004603,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,003,0.005573,0.002312,0.065641,0.029111,0.000874,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,004,0.003040,0.046211,0.113608,0.016555,0.002697,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69296,9A,0.625,-29.0625,216,0.001786,0.009026,0.118728,0.068868,0.005794,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,217,0.004037,0.004525,0.245055,0.062533,0.012304,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,218,0.003354,0.004862,0.198963,0.010503,0.002857,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826
69296,9A,0.625,-29.0625,219,0.003561,0.003441,0.187917,0.099713,0.007170,0.0,...,0.5,0.4,0.9,0.275862,0.605,0.15,0.514,0.333333,0.46,0826


# Unpacking in

It is unaceptable that obtaining a 221x71 data frame takes ~6min. What's taking so long?

In [11]:
# get parameters
csv_table_metric = c3.SimulationModelParameters.fetch().objs.toJson()

dfparams = pd.DataFrame(csv_table_metric)
dfparams["member"] = dfparams.id.str.split('_', 3, expand=True)[3].apply(lambda x: x.zfill(3))
dfparams_of_interest = dfparams.iloc[:, 5:65]

It is not getting the parameters (~0.5s). Let's dive in.

In [17]:
# defining parameters that go in compile output

ensemble = ["0"*(3-len(str(int("000") + k*int("001")))) + str(int("000") + k*int("001")) for k in range(0, 221)]
region = [-30, -29, 0, 1]
date = "0826"
time = "9A"

In [19]:
for sample in ensemble:
    dataset = open_sat_sim(date = date, member = sample)

`open_sat_sim` is where the bulk of calculation time is going (~ 5.5min). 

What's happening is that everytime we call this function, it goes to the Azure blob storage, downloads the target file to the current local server, then finally opens the netCDF file. Let's see how long it takes from one of them to be loaded:

In [21]:
dataset = open_sat_sim(date = date, member = "200")

In fact, the average time from the loop above is 1.5s, which is reasonable for a 35.5 MB file (23.7 MB/s bandwidth).

In [22]:
dataset

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    source: Data from Met Office Unified Model
    um_version: 11.1
    Conventions: CF-1.7
    dimensions(sizes): pseudo_level(6), time(8), latitude(144), longitude(192)
    variables(dimensions): float32 atmosphere_optical_thickness_due_to_dust_ambient_aerosol(pseudo_level, time, latitude, longitude), int32 latitude_longitude(), int32 pseudo_level(pseudo_level), float64 time(time), float32 latitude(latitude), float32 longitude(longitude), int64 jobn(), float32 atmosphere_optical_thickness_due_to_soluble_aitken_mode_ambient_aerosol(pseudo_level, time, latitude, longitude), float32 atmosphere_optical_thickness_due_to_soluble_accumulation_mode_ambient_aerosol(pseudo_level, time, latitude, longitude), float32 atmosphere_optical_thickness_due_to_soluble_coarse_mode_ambient_aerosol(pseudo_level, time, latitude, longitude), float32 atmosphere_optical_thickness_due_to_insoluble_aitken_mode_ambient_aerosol(p

# Back on track

In [103]:
list1 = list(dfparams_of_interest.columns[0:59])
X = np.array(aug_26[list1])
y = np.array(aug_26["soluble_accumulation_mode"])

datasets = train_test_split(X, y, test_size=0.1, random_state=42)

# cast into c3 Datasets
X_train = c3.Dataset.fromPython(datasets[0])
X_test = c3.Dataset.fromPython(datasets[1])
y_train = c3.Dataset.fromPython(datasets[2])
y_test = c3.Dataset.fromPython(datasets[3])

In [92]:
print(variable_names)

{'dust': 'atmosphere_optical_thickness_due_to_dust_ambient_aerosol', 'soluble_aitken_mode': 'atmosphere_optical_thickness_due_to_soluble_aitken_mode_ambient_aerosol', 'soluble_accumulation_mode': 'atmosphere_optical_thickness_due_to_soluble_accumulation_mode_ambient_aerosol', 'soluble_coarse_mode': 'atmosphere_optical_thickness_due_to_soluble_coarse_mode_ambient_aerosol', 'insoluble_aitken_mode': 'atmosphere_optical_thickness_due_to_insoluble_aitken_mode_ambient_aerosol', 'insoluble_accumulation_mode': 'atmosphere_optical_thickness_due_to_insoluble_accumulation_mode_ambient_aerosol', 'insoluble_coarse_mode': 'atmosphere_optical_thickness_due_to_insoluble_coarse_mode_ambient_aerosol'}


In [104]:
# create kernel
GPR_kernel = c3.SklearnGPRKernelConstant(constantValue=1.0).build().kernel

# define technique
GPReg_technique = c3.GaussianProcessRegressionTechnique(
                    randomState=42,
                    kernel = GPR_kernel
                )

# create pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(technique=GPReg_technique)

# train it
trained_GPReg_pipe = GPReg_pipe.train(input=X_train, targetOutput=y_train)
y_test_preds = trained_GPReg_pipe.process(input=X_test)
y_test_preds = c3.Dataset.toPandas(dataset=y_test_preds)
print(y_test_preds)
# print(np.mean((y_test_preds - y_test)^2))

         0
0   0.0625
1   0.0625
2   0.0625
3   0.0625
4   0.0625
5   0.0625
6   0.0625
7   0.0625
8   0.0625
9   0.0625
10  0.0625
11  0.0625
12  0.0625
13  0.0625
14  0.0625
15  0.0625
16  0.0625
17  0.0625
18  0.0625
19  0.0625
20  0.3750
21  0.3750
22  0.3750


In [87]:
X_train = datasets[0]
X_test = datasets[1]
y_train = datasets[2]
y_test = datasets[3]

GPReg_technique_noC3 = GaussianProcessRegressor()
GPReg_pipe_noC3 = GPReg_technique_noC3.fit(X_train, y_train)
GPReg_pipe_noC3.score(X_test, y_test)

0.17787995626841183