# Obtaining the Timeseries from Cassandra

This means that the NetCDF files have already been processed. All that needs to be done is normalization and aggregation.

In [1]:
import pandas as pd
import time
import numpy as np
from datetime import datetime, timedelta

In [21]:
start_time = time.time()

# build metrics
my_metric = c3.SimpleMetric(id = "AverageX_SimulationSample",
                            name = "AverageX_SimulationSample",
                            description = "Calculates average of property X for a given SimulationSample",
                            srcType = "SimulationSample",
                            path = "output",
                            expression = "avg(avg(normalized.data.propertyX))"
                           )

# build spec
my_spec = c3.EvalMetricsSpec(
            ids = ["EnsNo_1_SimNo_0"],
            expressions = ["AverageX_SimulationSample"],
            start = "2017-01-01T00:00:00.000",
            end = "2017-10-28T00:00:00.000",
            interval = "HOUR" 
            )

N = 100
times = np.zeros(N)

for i in range(N):
    start_time = time.time()
    # evaluate metrics
    evalMetricsResult = c3.SimulationSample.evalMetricsWithMetadata(
                                                        spec=my_spec,
                                                        overrideMetrics=[my_metric])
    # get a data frame
    df1 = c3.EvalMetricsResult.toPandas(result=evalMetricsResult)
    end_time = time.time()

    times[i] = end_time - start_time
    
print('Metrics evaluation takes', times.mean(), '+-', times.std()/N, 's')

df1['source'] = df1.index.str.split('_').str[0:-1]
df1['timestamp'] = pd.to_datetime(df1.index.str.split('_').str[-1],format="%Y-%m-%dT%H:%M:%S.%f")

Metrics evaluation takes 0.12593433141708374 +- 0.0003261852787188128 s


# Obtaining Timeseries directly from the NetCDF files

In [6]:
files = c3.SimulationOutputFile.fetch({'filter': "simulationSample.id == 'EnsNo_1_SimNo_0'"}).objs

zero_time = datetime(1970,1,1,0,0)

In [7]:
start_time = time.time()

df = pd.DataFrame()
for file in files:
    partial_df = pd.DataFrame()
    data = c3.NetCDFUtil.openFile(file.file.url)
    
    partial_df['time'] = data.variables['time'][:]
    transformed_times = []
    for t in partial_df['time']:
        target_time = zero_time + timedelta(hours=t)
        transformed_times.append(target_time)
    partial_df['datetime'] = transformed_times
    partial_df.drop(columns=['time'], inplace=True)
    
    partial_df['longitude'] = data.variables['longitude'][:]
    partial_df['latitude'] = data.variables['latitude'][:]
    partial_df['propertyX'] = data.variables['mass_fraction_of_black_carbon_in_soluble_accumulation_mode_dry_aerosol_in_air'][:]
    
    frames = [df, partial_df]
    df = pd.concat(frames)
    
end_time = time.time()
print('Reading from files takes', end_time - start_time, 's')



Reading from files takes 14.713778257369995 s


In [24]:
N = 100
times = np.zeros(N)

for i in range(N):
    start_time = time.time()
    
    # resample
    df2 = df.resample('H', on='datetime').mean()
    
    # NOW NEED TO NORMALIZE...
    
    end_time = time.time()

    times[i] = end_time - start_time
    
print('Pandas resampling takes', times.mean(), '+-', times.std()/N, 's')

Pandas resampling takes 0.005504114627838135 +- 8.607817607708349e-06 s


# Plot differences

In [18]:
x = np.array(df2['propertyX'])
y = np.array(df1['AverageX_SimulationSample'])