# Get data

The functions below fetch the simulated data sets. In summary, we take the simulated output at a GeoSurfaceTimePoint for every member of the full ensemble and then join that output with our input set (i.e. the SourceSimulationModelParametersMap).

In [1]:
def get_GSTP(start_lat=-0.625, end_lat=-0.625,
             start_lon=-0.9375, end_lon=-0.9375,
             start_time="2017-07-01T06:00:00.000", end_time="2017-07-01T07:00:00.000"):
    """
    Fetch the GeoSurfaceTimePoint instances inside a latitude/longitude/time box.

    Arguments

    start_lat, end_lat, start_lon, end_lon : float
        Lower and upper bounds (both inclusive) on latitude and longitude
    start_time, end_time : string
        Lower and upper bounds (both inclusive) on time, e.g. "2017-07-01T06:00:00.000"

    Value

    Result of `c3.GeoSurfaceTimePoint.fetch`
        Fetch result for all points matching the filter (no limit applied)
    """
    # (field, lower bound, upper bound), listed in the order the conditions are chained
    bounds = (
        ("latitude", start_lat, end_lat),
        ("longitude", start_lon, end_lon),
        ("time", start_time, end_time),
    )

    # build `lower <= field <= upper` for every field, AND-ed together
    box_filter = c3.Filter()
    for position, (field, lower, upper) in enumerate(bounds):
        if position:
            box_filter = box_filter.and_()
        box_filter = box_filter.ge(field, lower).and_().le(field, upper)

    # limit of -1 is passed so that the fetch is not truncated
    return c3.GeoSurfaceTimePoint.fetch({"filter": box_filter, "limit": -1})


def get_inputs():
    """
    Build the table of candidate inputs for GP regression.

    Columns assumed a priori to be irrelevant are removed: any column whose name contains
    'carb' or 'ems', the descriptive columns 'type', 'meta', 'version' and 'ensemble', and
    'acure_anth_so2'. (Note: Maybe we want to make this selection step later, during GP
    emulation.)

    Arguments

    None

    Value

    pandas DataFrame
        Table of the retained parameters (inputs) and their values, as fetched from
        `c3.SimulationModelParameters`
    """
    import pandas as pd

    # name fragments and exact names that disqualify a column
    dropped_fragments = ('carb', 'ems')
    dropped_names = ('type', 'meta', 'version', 'ensemble', 'acure_anth_so2')

    # fetch inputs table and cast it into pandas
    fetched = c3.SimulationModelParameters.fetch().objs.toJson()
    table = pd.DataFrame(fetched)

    # keep every column that survives both exclusion rules
    kept = [
        str(name)
        for name in table.columns
        if not any(fragment in name for fragment in dropped_fragments)
        and name not in dropped_names
    ]

    return table[kept]


def get_outputs(start_lat=-0.625, end_lat=-0.625,
                start_lon=-0.9375, end_lon=-0.9375,
                start_day="01", start_month="07", start_year="2017", start_hour="06",
                end_day="01", end_month="07", end_year="2017", end_hour="07"):
    """
    Fetch the Simulation3HourlyAODOutput rows attached to the GeoSurfaceTimePoints selected
    by `get_GSTP` for the requested latitude-longitude-time range.

    Arguments

    start_lat, end_lat, start_lon, end_lon : float
        Starting and ending latitude and longitude coordinates of the region to fetch
    start_day, end_day, start_month, end_month, start_year, end_year, start_hour, end_hour : string
        Pieces of the start and end timestamps; they are concatenated as
        "<year>-<month>-<day>T<hour>:00:00.000"

    Value

    pandas DataFrame
        Fetched outputs without the 'type', 'version' and 'simulationSample' columns, and
        with an 'id' column holding the id of each row's simulationSample
    """

    import pandas as pd

    def _timestamp(year, month, day, hour):
        # assemble the timestamp string expected by get_GSTP
        return year + "-" + month + "-" + day + "T" + hour + ":00:00.000"

    # translate the user inputs into the matching GeoSurfaceTimePoint objects
    window_start = _timestamp(start_year, start_month, start_day, start_hour)
    window_end = _timestamp(end_year, end_month, end_day, end_hour)
    points = get_GSTP(start_lat=start_lat, end_lat=end_lat,
                      start_lon=start_lon, end_lon=end_lon,
                      start_time=window_start, end_time=window_end).objs

    # collect every output object attached to those points
    point_filter = c3.Filter().eq("geoSurfaceTimePoint", points)
    fetched = c3.Simulation3HourlyAODOutput.fetch({"filter": point_filter, "limit": -1})

    # cast it into pandas
    table = pd.DataFrame(fetched.objs.toJson())

    # key for joining with the inputs: the id of the simulation sample behind each row
    table["id"] = [sample['id'] for sample in table.simulationSample]

    # drop the descriptive columns which won't be used
    unused = ['type', 'version', 'simulationSample']
    kept = [name for name in table.columns if name not in unused]

    return table[kept]


def get_data(my_inputs, my_outputs, total_output=True):
    """
    Given a table of inputs and a table of outputs, join them so that the inputs for a certain
    ensemble member match with the outputs from that ensemble member.

    Neither argument is modified; the total column, when requested, is added to a copy of
    `my_outputs`.

    Arguments

    my_inputs : pandas DataFrame
        A data frame of the format outputted by the `get_inputs()` function; must have an
        'id' column
    my_outputs : pandas DataFrame
        A data frame of the format outputted by the `get_outputs()` function; must have an
        'id' column and a 'geoSurfaceTimePoint' column whose entries can be indexed by 'id'
    total_output : boolean
        If true, include a 'total' column which is the row-wise sum of every output column
        whose name contains 'Mode'

    Value

    pandas DataFrame
        Inner join of the two tables on 'id', plus a 'point' column holding the id of each
        row's geoSurfaceTimePoint
    """
    import pandas as pd

    # compute total AOD
    if total_output:
        mode_columns = [name for name in my_outputs.columns if 'Mode' in name]
        # assign() returns a new frame, so the caller's table does not grow a 'total' column
        my_outputs = my_outputs.assign(total=my_outputs[mode_columns].sum(axis=1))

    # join inputs and outputs; rows without a partner on the other side are dropped
    my_data = pd.merge(my_inputs,
                       my_outputs,
                       on="id",
                       how="inner")

    # flatten the nested geoSurfaceTimePoint reference to its id
    my_data['point'] = [gstp['id'] for gstp in my_data['geoSurfaceTimePoint']]

    return my_data

In [2]:
# Display the filtered input table returned by get_inputs() as a quick sanity check.
get_inputs()

Unnamed: 0,id,acure_bl_nuc,acure_ait_width,acure_cloud_ph,acure_prim_so4_diam,acure_sea_spray,acure_anth_so2_chi,acure_anth_so2_asi,acure_anth_so2_eur,acure_anth_so2_nam,...,acure_oxidants_o3,bparam,two_d_fsd_factor,c_r_correl,acure_autoconv_exp_lwp,acure_autoconv_exp_nd,dbsdtbs_turb_0,ai,m_ci,a_ent_1_rp
0,EnsNo_1_SimNo_0,0.500000,0.650000,0.396000,1.000000,0.500000,0.557493,0.557493,0.557493,0.557493,...,0.576175,0.500000,0.400000,0.900000,0.275862,0.605000,0.150000,0.514000,0.333333,0.460000
1,EnsNo_1_SimNo_1,0.470000,0.500000,0.500000,0.500000,0.530000,0.500000,0.500000,0.500000,0.500000,...,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000
2,EnsNo_1_SimNo_10,0.969888,0.083081,0.478474,0.516864,0.989837,0.717475,0.180696,0.034665,0.327155,...,0.017104,0.927093,0.833905,0.610920,0.993935,0.755788,0.774187,0.960911,0.988952,0.508725
3,EnsNo_1_SimNo_100,0.132847,0.445265,0.390414,0.792391,0.607072,0.139325,0.043223,0.837181,0.229608,...,0.010731,0.950732,0.902536,0.780157,0.267910,0.018570,0.106893,0.218308,0.163327,0.936031
4,EnsNo_1_SimNo_101,0.058261,0.630422,0.132292,0.490741,0.545087,0.945053,0.268063,0.476302,0.982620,...,0.779013,0.129769,0.712185,0.552866,0.328090,0.651008,0.613814,0.101666,0.254514,0.089525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,EnsNo_1_SimNo_95,0.591530,0.996801,0.170201,0.119761,0.745082,0.171170,0.064218,0.226830,0.389115,...,0.836703,0.515917,0.891746,0.418227,0.732156,0.900471,0.035621,0.175431,0.482366,0.400788
217,EnsNo_1_SimNo_96,0.774235,0.165151,0.881014,0.807389,0.340840,0.194492,0.694277,0.678455,0.086511,...,0.787844,0.786932,0.231686,0.840574,0.349984,0.433408,0.930975,0.119901,0.777433,0.116056
218,EnsNo_1_SimNo_97,0.227072,0.231834,0.185796,0.243118,0.500152,0.658270,0.007125,0.428821,0.998654,...,0.709293,0.204519,0.096402,0.147632,0.313749,0.741739,0.144573,0.488763,0.245905,0.686731
219,EnsNo_1_SimNo_98,0.047377,0.633909,0.721278,0.827818,0.062070,0.484378,0.324429,0.362036,0.114881,...,0.214451,0.303059,0.898960,0.114170,0.725976,0.766313,0.259877,0.872825,0.966951,0.826032


# Visualize

In [11]:
def scatter_AOD(lat=-0.625, lon=-0.9375,
                start_day="01", start_month="07",
                end_day="07", end_month="07",
                response="total", favorite_inputs=True):
    """
    This function is for diagnostic purposes. It simply plots a few scatterplots to check whether
    the joined datasets present expected trends.

    The joined data are averaged per ensemble member ('id') and one scatterplot of the averaged
    response against the averaged input is shown for each selected input.

    Arguments

    lat, lon : float
        Latitude and longitude of the single location to plot (used as both start and end bound)
    start_day, start_month, end_day, end_month : string
        Day and month of the start and end of the time range; the year is fixed to "2017", the
        start hour to "00" and the end hour to "21"
    response : string
        Name of the numeric output column plotted on the y-axis
    favorite_inputs : boolean
        If true, plot only a fixed short list of inputs; otherwise plot every input column
        that has a per-member average

    Value

    None
    """

    import matplotlib.pyplot as plt

    inputs = get_inputs()
    outputs = get_outputs(start_lat=lat, end_lat=lat, start_lon=lon, end_lon=lon,
                          start_day=start_day, start_month=start_month, start_year="2017", start_hour="00",
                          end_day=end_day, end_month=end_month, end_year="2017", end_hour="21")

    data = get_data(inputs, outputs)

    # The joined table also carries non-numeric columns (e.g. 'geoSurfaceTimePoint', 'point');
    # numeric_only=True keeps mean() from raising on them under pandas >= 2.0.
    avg_data = data.groupby(['id']).mean(numeric_only=True)

    if favorite_inputs:
        plot_inputs = ['acure_sea_spray', 'acure_dry_dep_acc', 'acure_dry_dep_so2',
                       'acure_bvoc_soa', 'acure_prim_so4_diam', 'm_ci']
    else:
        # 'id' is the grouping key, and non-numeric inputs have no average to plot
        plot_inputs = [name for name in inputs.columns
                       if name != 'id' and name in avg_data.columns]

    for an_input in plot_inputs:
        plt.scatter(avg_data[an_input], avg_data[response])
        plt.xlabel(an_input)
        plt.ylabel(response + 'aod')
        plt.show()