In [2]:
%run ../Get-data.ipynb

Exception: File `'../Get-data.ipynb.py'` not found.

In [3]:
# These libraries are available in the Python 3 kernel
import pandas as pd
import numpy as np

# These libraries are not available in the Python 3 kernel but are in py-gordon_2_0_0
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import sklearn as skl

# Constants

In [8]:
# Short aerosol label -> name of the corresponding optical-thickness variable looked up in the netCDF data
variable_names = {
    "dust": "atmosphere_optical_thickness_due_to_dust_ambient_aerosol",
    "soluble_aitken_mode": "atmosphere_optical_thickness_due_to_soluble_aitken_mode_ambient_aerosol",
    "soluble_accumulation_mode": "atmosphere_optical_thickness_due_to_soluble_accumulation_mode_ambient_aerosol",
    "soluble_coarse_mode": "atmosphere_optical_thickness_due_to_soluble_coarse_mode_ambient_aerosol",
    "insoluble_aitken_mode": "atmosphere_optical_thickness_due_to_insoluble_aitken_mode_ambient_aerosol",
    "insoluble_accumulation_mode": "atmosphere_optical_thickness_due_to_insoluble_accumulation_mode_ambient_aerosol",
    "insoluble_coarse_mode": "atmosphere_optical_thickness_due_to_insoluble_coarse_mode_ambient_aerosol",
}

# Every day of August (31 days) followed by every day of September (30 days), as "mmdd" strings
dates = (
    ["08" + str(day).zfill(2) for day in range(1, 32)]
    + ["09" + str(day).zfill(2) for day in range(1, 31)]
)

# Labels of the eight 3-hourly time slots, and each label's position in that list
times = ['3A', '6A', '9A', '12P', '3P', '6P', '9P', '12A']
times_dict = {label: position for position, label in enumerate(times)}

# Zero-padded three-digit ensemble member labels "000" ... "219"
# NOTE(review): this is 220 labels, while other notes in this notebook mention 221 members / "000-220" — confirm
ensemble = [str(k).zfill(3) for k in range(220)]

# Fetch the simulation ensemble members' parameter combinations and convert the result to JSON-style
# Python objects (presumably a list of records — TODO confirm against the c3 type)
csv_table_metric = c3.SimulationModelParameters.fetch().objs.toJson()

# Hold the table as a pandas DataFrame (per the original notes: 221 ensemble members x 64 parameters)
dfparams = pd.DataFrame(csv_table_metric)

# The member label is the fourth '_'-separated token of `id`, zero-padded to three digits.
# `n` and `expand` are passed by keyword because pandas >= 2.0 no longer accepts them positionally.
dfparams["member"] = dfparams["id"].str.split("_", n=3, expand=True)[3].str.zfill(3)

# Positional column slice that select_sim_data later merges on "member".
# NOTE(review): presumably columns 5..64 are the parameters plus the "member" column added above — confirm
dfparams_of_interest = dfparams.iloc[:, 5:65]

# Functions

In [9]:
def open_sat_sim(month = "jul", date = "0726", member = "000"):
    """
    Open the simulated satellite netCDF file for one day and one ensemble member.

    According to the original notes, the atmosphere optical depth (AOD) variables in these files are indexed by
    pseudo level (the wavelength for which the AOD was calculated), time (which 3X-hour mark in the day the simulated
    data are meant to occur), latitude, and longitude.

    Parameters

    month : Name of the storage folder the file is read from (e.g. "jul")
    date : Day of 2017 of form "mmdd"
    member : Ensemble member label, three digits (e.g. "000")

    Value

    Whatever c3.NetCDFUtil.openFile returns for the assembled URL (a netCDF data frame)
    """

    # An earlier location was "azure://monthly-mean-simulations/bs714a.pb2017<date>.pp_<member>.nc"
    url_pieces = ["azure://aod-3hourly/", month, "/bs714a.pb2017", date, ".pp_", member, ".nc"]

    return c3.NetCDFUtil.openFile("".join(url_pieces))


def open_sat_obs(url):
    """
    Open the netCDF file found at the given URL (presumably a satellite observation file, going by the name).

    Parameters

    url : Location of the file, passed unchanged to c3.NetCDFUtil.openFile

    Value

    Whatever c3.NetCDFUtil.openFile returns for that URL
    """
    opened = c3.NetCDFUtil.openFile(url)
    return opened

In [6]:
def flatten_variable(df, name, level = 2):
    """
    Helper function for 'select_sim_data'

    Reads the 4-d variable `name`, keeps a single slice along its first axis and flattens the remaining three axes in
    row-major order (last axis varying fastest).

    Parameters

    df : netCDF data frame (anything supporting df[name][:] that yields a 4-d array)
    name : string, the name of a variable
    level : integer, index along the first axis to keep. Defaults to 2, the slice this notebook has always used.
            NOTE(review): presumably the first axis is the pseudo level (wavelength) — confirm

    Value

    1-d numpy array of the elements in the chosen slice
    """

    chosen_slice = df[name][:][level, :, :, :]

    return np.array(chosen_slice).flatten()


def select_sim_data(df, member = "000"):
    """
    Unpack the netCDF file as a Pandas dataframe

    One row is produced per (time, latitude, longitude) combination, with time varying slowest and longitude fastest,
    so that the rows line up with the flattened variables returned by 'flatten_variable'. The time labels come from
    the module-level `times`, the variables from `variable_names`, and the result is inner-joined with
    `dfparams_of_interest` on "member" (so a member missing from that table yields an empty frame).

    Parameters

    df : netCDF data frame
    member : string, the ensemble member label stored in the "member" column and used as the join key

    Value

    Pandas DataFrame with columns time, latitude, longitude, member, one column per entry of `variable_names`, and the
    remaining columns of `dfparams_of_interest`
    """

    lat = np.asarray(df["latitude"][:])
    lon_raw = np.asarray(df["longitude"][:])

    # Rotate longitude values by 180 degrees so that they fall between -180 and +180, just as the data are plotted
    lon = np.where(lon_raw >= 180, lon_raw - 360, lon_raw)

    n_lat, n_lon, n_times = len(lat), len(lon), len(times)

    # Table of all time-latitude-longitude combinations, in the same order as a row-major flatten of
    # a (time, latitude, longitude) array: repeat = hold a value for a run, tile = cycle the whole sequence
    data = pd.DataFrame({
        "time": np.repeat(np.asarray(times, dtype=object), n_lat * n_lon),
        "latitude": np.tile(np.repeat(lat, n_lon), n_times),
        "longitude": np.tile(lon, n_times * n_lat),
    })
    data["member"] = member

    # Enter each aerosol into the data frame by flattening the 4-d arrays from df
    for short_name, netcdf_name in variable_names.items():
        data[short_name] = flatten_variable(df, netcdf_name)

    return pd.merge(data,
                    dfparams_of_interest,
                    on="member",
                    how="inner")

In [7]:
def compile_output(ensem_membs=["000"], region=[-180, 180, -90, 90], one_day=1, date='0826', one_time=1, time='9A'):
    """
    Compile the simulated data for the requested ensemble members, days, times and region into one data frame.

    Parameters

    ensem_membs : list of strings
        Labels of the ensemble members to compile (e.g. "000"). For speed, sometimes only a subset of the ensemble
        is desired for compiling
    region : list of numbers
        The region of the globe whose data is desired, as [min longitude, max longitude, min latitude, max latitude]
        (bounds inclusive)
    one_day : integer
        If zero, compile all days listed in the module-level `dates`. Else, only obtain data from the given day.
    date : string
        Of the form 'mmdd', the day for which data is desired.
    one_time : integer
        If zero, keep all times listed in the module-level `times`. Else, only keep rows for the given time.
    time : string
        Of the form 'HA' or 'HP', where 'H' is the hour (a multiple of 3) and 'A' or 'P' designates AM or PM

    Value

    Pandas DataFrame
        All of the data satisfying the request specified by the parameters (empty if nothing was compiled)
    """
    # Decide which ensemble members to compile
    my_members = ensem_membs

    # Decide which days to compile
    my_days = [date] if one_day != 0 else dates

    # Decide which times to keep
    my_times = [time] if one_time != 0 else times

    # Collect one frame per (member, day) and concatenate once at the end; growing a frame inside the loop is
    # quadratic, and DataFrame.append no longer exists in pandas >= 2.0
    samples = []

    # Loop through all of my_members, my_days. This runs for every value of one_time (it used to be nested under
    # the "all times" branch, so the default call compiled nothing and returned None).
    for memb in my_members:
        for day in my_days:

            # Obtain data frame for given member, day combo
            # NOTE(review): open_sat_sim is called without `month`, so its default folder is used for every
            # date — confirm that is where the files for these dates live
            dataset = open_sat_sim(date = day, member = memb)
            sample = select_sim_data(dataset, member = memb)

            # Keep only the requested times and the rows inside the requested region
            keep = (
                sample["time"].isin(my_times) &
                (sample["longitude"] >= region[0]) &
                (sample["longitude"] <= region[1]) &
                (sample["latitude"] >= region[2]) &
                (sample["latitude"] <= region[3])
            )

            # assign() returns a new frame, avoiding a write into a filtered slice of `sample`
            samples.append(sample[keep].assign(member=memb, date=day))

    if not samples:
        return pd.DataFrame()

    return pd.concat(samples)