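Please make sure `py-gordon_1_0_0` is installed before running this notebook. The cells also use the C3 `c3` object without importing it, so it must already be available in the kernel.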

In [1]:
import pandas as pd
import numpy as np
import datetime

# File data methods

In [2]:
def get_file_data(file_name):
    """
    Load one NetCDF file into a flat dataframe, one row per (time, latitude, longitude).

    Inputs:
        - file_name (str): URL of the file, handed unchanged to c3.NetCDFUtil.openFile

    Returns:
        - pandas.DataFrame: columns "time", "latitude" and "longitude" followed by one
          column per aerosol mode. Within the coordinate columns longitude varies
          fastest, then latitude, then time.
    """
    dataset = c3.NetCDFUtil.openFile(file_name)

    latitudes = dataset["latitude"][:]
    # Longitudes at or above 180 are shifted down by 360; smaller ones pass through.
    longitudes = [deg*(deg < 180) + (deg - 360)*(deg >= 180) for deg in dataset["longitude"][:]]
    hour_offsets = dataset["time"][:]

    # The time axis is read as a number of hours counted from 1970-01-01 00:00.
    epoch = datetime.datetime(1970, 1, 1, 0, 0)
    timestamps = [epoch + datetime.timedelta(hours=offset) for offset in hour_offsets]

    n_lat = len(latitudes)
    n_lon = len(longitudes)
    n_tim = len(timestamps)

    # Coordinate columns are spelled out row by row so they line up with the
    # flattened data arrays added below.
    file_df = pd.DataFrame()
    file_df["time"] = [stamp for stamp in timestamps for _ in range(n_lat * n_lon)]
    file_df["latitude"] = [value for value in latitudes for _ in range(n_lon)] * n_tim
    file_df["longitude"] = list(longitudes) * n_tim * n_lat

    # Output column name -> variable name inside the NetCDF file.
    variable_names = {
        "dust" : "atmosphere_optical_thickness_due_to_dust_ambient_aerosol",
        "soluble_aitken_mode" : "atmosphere_optical_thickness_due_to_soluble_aitken_mode_ambient_aerosol",
        "soluble_accumulation_mode" : "atmosphere_optical_thickness_due_to_soluble_accumulation_mode_ambient_aerosol",
        "soluble_coarse_mode" : "atmosphere_optical_thickness_due_to_soluble_coarse_mode_ambient_aerosol",
        "insoluble_aitken_mode" : "atmosphere_optical_thickness_due_to_insoluble_aitken_mode_ambient_aerosol",
        "insoluble_accumulation_mode" : "atmosphere_optical_thickness_due_to_insoluble_accumulation_mode_ambient_aerosol",
        "insoluble_coarse_mode" : "atmosphere_optical_thickness_due_to_insoluble_coarse_mode_ambient_aerosol"
    }
    for column, nc_name in variable_names.items():
        # Only index 2 along the first axis is kept.
        # NOTE(review): the meaning of that axis is not visible here, and the
        # remaining axes are assumed to be (time, latitude, longitude) -- confirm.
        selected = dataset[nc_name][:][2, :, :, :]
        file_df[column] = np.array(selected).flatten()

    return file_df

# Cassandra data methods

In [3]:
def cassandra_DynamicMapReduceFetch(gstpFilter, nBatches=200):
    """
    Start a DynMapReduce job over GeoSurfaceTimePoint that fetches the
    Simulation3HourlyAODOutput objects linked to the selected points.

    Inputs:
        - gstpFilter: passed as `filter` in the DynMapReduceSpec; selects the
          GeoSurfaceTimePoint objects the job runs over
        - nBatches (int): passed as `batchSize` in the spec (default 200).
          NOTE(review): despite its name this looks like a batch size rather
          than a number of batches -- confirm.

    Returns:
        - the job object returned by c3.DynMapReduce.startFromSpec. No status
          polling happens here; callers check job.status() themselves.
    """
    
    # Map step: for one batch of target objects, fetch every
    # Simulation3HourlyAODOutput whose geoSurfaceTimePoint.id is among the batch
    # ids. The fetch is restricted by point id only (no simulation filter).
    def cassandra_mapper(batch, objs, job):
        return {"result": c3.Simulation3HourlyAODOutput.fetch({"filter": c3.Filter().intersects("geoSurfaceTimePoint.id", [obj.id for obj in objs]).value}).objs}
    
    # Reduce step: concatenate the per-batch lists into a single flat list.
    def cassandra_reducer(key, interValues, job):
        values = []
        for iv in interValues:
            for val in iv:
                values.append(val)
        return values
    
    # Both local functions are wrapped as c3 lambdas before going into the spec.
    map_lambda = c3.Lambda.fromPython(cassandra_mapper)
    reduce_lambda = c3.Lambda.fromPython(cassandra_reducer)
    
    job = c3.DynMapReduce.startFromSpec(c3.DynMapReduceSpec(
        targetType="GeoSurfaceTimePoint",       
        filter=gstpFilter, 
        mapLambda=map_lambda,
        reduceLambda=reduce_lambda,
        batchSize=nBatches
        )
    )
    
    return job

In [4]:
def cassandra_DynamicMapReduceFetchWithSimContext(gstpFilter, simNumber, nBatches=200):
    """
    Start a DynMapReduce job over GeoSurfaceTimePoint that fetches the
    Simulation3HourlyAODOutput objects of a single simulation.

    Inputs:
        - gstpFilter: passed as `filter` in the DynMapReduceSpec; selects the
          GeoSurfaceTimePoint objects the job runs over
        - simNumber: appended via str() to "EnsNo_1_SimNo_" to form the id of the
          SimulationSample that is looked up and attached as the job context
        - nBatches (int): passed as `batchSize` in the spec (default 200).
          NOTE(review): despite its name this looks like a batch size rather
          than a number of batches -- confirm.

    Returns:
        - the job object returned by c3.DynMapReduce.startFromSpec. No status
          polling happens here; callers check job.status() themselves.
    """
    
    # Map step: for one batch of target objects, fetch the outputs whose
    # simulationSample.id equals job.context.id and whose geoSurfaceTimePoint.id
    # is among the batch ids.
    # NOTE(review): job.context is presumably the SimulationSample passed as
    # `context` in the spec below -- confirm.
    def cassandra_mapper(batch, objs, job):
        simId = job.context.id
        gstp = [obj.id for obj in objs]
        fetchFilter = c3.Filter().eq("simulationSample.id", simId).and_().intersects("geoSurfaceTimePoint.id", gstp)
        return {"result": c3.Simulation3HourlyAODOutput.fetch({"filter": fetchFilter.value}).objs}
    
    
    # Reduce step: concatenate the per-batch lists into a single flat list.
    def cassandra_reducer(key, interValues, job):
        values = []
        for iv in interValues:
            for val in iv:
                values.append(val)
        return values
    
    # Both local functions are wrapped as c3 lambdas before going into the spec.
    map_lambda = c3.Lambda.fromPython(cassandra_mapper)
    reduce_lambda = c3.Lambda.fromPython(cassandra_reducer)
    # Look up the simulation sample that the mapper reads back through job.context.
    simId = "EnsNo_1_SimNo_" + str(simNumber)
    simulationSample = c3.SimulationSample.get(simId)
    
    job = c3.DynMapReduce.startFromSpec(c3.DynMapReduceSpec(
        targetType="GeoSurfaceTimePoint",       
        filter=gstpFilter, 
        mapLambda=map_lambda,
        reduceLambda=reduce_lambda,
        batchSize=nBatches,
        context=simulationSample
        )
    )
    
    return job

In [5]:
def get_df_from_job_results(job):
    """
    Build a Pandas dataframe from the results of a completed DynMapReduce job.

    The "result" entry of job.results() supplies the rows. Three columns are then
    derived from each row's "id", which is split on "_":
        - field 0 -> "latitude" (float)
        - field 1 -> "longitude" (float)
        - field 2 -> "time" (datetime), parsed from the text before the first "#"
          with the format "%Y-%m-%dT%H:%M:%S"

    Returns the dataframe, or None (after printing a message) when the job
    status is anything other than "completed".
    """
    import pandas as pd
    import datetime as dt

    # Guard clause: nothing to format unless the job finished.
    if job.status().status != "completed":
        print("Job did not complete")
        return

    def id_field(c3id, position):
        # One "_"-separated piece of the id.
        return c3id.split("_")[position]

    def parse_time(c3id):
        # Drop anything from the first "#" onwards before parsing the timestamp.
        stamp = id_field(c3id, 2).split("#")[0]
        return dt.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S")

    frame = pd.DataFrame(job.results()['result'])
    ids = frame["id"]
    frame["latitude"] = ids.apply(lambda c3id: float(id_field(c3id, 0)))
    frame["longitude"] = ids.apply(lambda c3id: float(id_field(c3id, 1)))
    frame["time"] = ids.apply(parse_time)
    return frame

In [6]:
def get_simulation_from_df(df, simulationNumber):
    """
    Keep only the rows of df that belong to one simulation.

    A row is kept when the "id" entry of its "simulationSample" value equals
    "EnsNo_1_SimNo_" followed by str(simulationNumber). The selection is
    returned as a dataframe; df itself is not modified.
    """
    wanted_id = "EnsNo_1_SimNo_" + str(simulationNumber)
    # Boolean mask, one entry per row, True where the sample id matches.
    keep = df["simulationSample"].apply(lambda sample: bool(sample["id"] == wanted_id))
    return df[keep]

# Check data consistency methods

In [11]:
def check_consistency(date, simulationNumber, tolerance):
    """
    Checks if data from files and Cassandra are the same within tolerance.
    
    Inputs:
        - date (str): the target date in "MMDD" format, eg. "0721"
        - simulationNumber (str): the ensemble member with three digits, eg. "027"
        - tolerance (float): the largest acceptable mean absolute difference
          |file - cassandra| for any single compared column
        
    Returns:
        - bool: True if the check passed, False if it did not. False is also
          returned when the Cassandra job ends without completing, when the two
          sources do not hold the same non-zero number of rows, or when a value
          is missing on only one side.
    """
    import time
    
    # get file data
    print("Getting file data...", end=" ")
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    monthInt = int(date[0:2]) - 1
    file_name = "azure://aod-3hourly/" + months[monthInt] + "/bs714a.pb2017" + date + ".pp_" + simulationNumber + ".nc"
    file_df = get_file_data(file_name)
    print("done.")
    
    # get cassandra data
    print("Getting Cassandra data...", end=" ")
    target_date = "2017-" + date[0:2] + "-" + date[2:4]
    start_time = target_date + "T00:00:00.000"
    stop_time = target_date + "T23:59:59.999"
    gstpFilter = c3.Filter().ge("time", start_time).and_().le("time", stop_time)
    
    job = cassandra_DynamicMapReduceFetchWithSimContext(gstpFilter, int(simulationNumber))
    # Poll until the job completes, but stop if it ends in a failure state so the
    # notebook cannot hang forever.
    # NOTE(review): "failed" and "canceled" are assumed to be the platform's
    # terminal failure status names -- confirm against the DynMapReduce docs.
    failure_states = ("failed", "canceled")
    status = job.status().status
    while status != "completed":
        if status in failure_states:
            print(f"job ended with status '{status}'.")
            return False
        time.sleep(30)
        print("...", end=" ")
        status = job.status().status
    cass_df = get_df_from_job_results(job)
    print("done.")
    
    # compare
    print("Comparing data...")
    # Sorting both frames the same way and resetting the index lets the columns
    # be compared position by position.
    cass_df = cass_df.sort_values(by = ['latitude', 'longitude', 'time'], ignore_index=True)
    file_df = file_df.sort_values(by = ['latitude', 'longitude', 'time'], ignore_index=True)
    
    # Rows missing from either source would otherwise become NaN differences that
    # Series.sum() silently skips, and an empty comparison would pass vacuously.
    n_file = len(file_df)
    n_cass = len(cass_df)
    if n_file == 0 or n_file != n_cass:
        print(f" not consistent! row counts: file={n_file}, cassandra={n_cass}")
        return False
    
    # Cassandra column name -> file column name.
    # NOTE(review): only five of the seven aerosol modes loaded by get_file_data
    # are compared; the insoluble accumulation and coarse modes are skipped --
    # confirm whether that is intentional.
    variables_names = {
            "latitude": "latitude",
            "longitude": "longitude",
            "dust" : "dust",
            "solubleAitkenMode" : "soluble_aitken_mode",
            "solubleAccumulationMode" : "soluble_accumulation_mode",
            "solubleCoarseMode" : "soluble_coarse_mode",
            "insolubleAitkenMode" : "insoluble_aitken_mode"
            }
    
    for cass_name, file_col_name in variables_names.items():
        print(f"Comparing {cass_name}...", end=" ")
        file_col = file_df[file_col_name]
        cass_col = cass_df[cass_name]
        # A value missing on only one side is a real mismatch, not something to skip.
        if (file_col.isna() != cass_col.isna()).any():
            print(" not consistent! missing values differ")
            return False
        y = (file_col - cass_col).abs()
        # Mean absolute difference over all rows of this column.
        s = y.sum() / len(y)
        if (s > tolerance):
            print(f" not consistent! {s}")
            return False
        else:
            print("done.")
    print("Test passed.")
        
    return True

# Tests

## Random sampling files

In [13]:
import numpy as np

# Spot-check 20 randomly drawn (day, simulation) pairs.
for _ in range(20):
    # randint excludes the upper bound: days 1..7, simulations 0..220.
    day = np.random.randint(1, 8)
    sim = str(np.random.randint(0, 221)).zfill(3)
    date = "070" + str(day)
    print(f"Simulation {sim} for {date} check")
    passed = check_consistency(date, sim, 0.01)
    if not passed:
        print(f"Simulation {sim} for {date} is not consistent")

Simulation 023 for 0703 check
Getting file data... done.
Getting Cassandra data... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... done.
Comparing data...
Comparing latitude... done.
Comparing longitude... done.
Comparing dust... done.
Comparing solubleAitkenMode... done.
Comparing solubleAccumulationMode... done.
Comparing solubleCoarseMode... done.
Comparing insolubleAitkenMode... done.
Test passed.
Simulation 055 for 0701 check
Getting file data... done.
Getting Cassandra data... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... done.
Comparing data...
Comparing latitude... done.
Comparing longitude... done.
Comparing dust... done.
Comparing solubleAitkenMode... done.
Comparing solubleAccumulationMode... done.
Comparing solubleCoarseMode... done.
Comparing insolubleAitkenMode... done.
Test passed.
Simulation 126 for 0701 check
Getting file data... don