# Optimization Strategy - STEP 3
* IAEffectSampler - outer function call optimization
  - single call to sample_time_and_space for all counties
  - precalculation for predictions
  - use required dtype directly in precalculations instead of converting it afterwards


In [1]:
import numpy as np
import pandas as pd

# A - Read/Create Input Data
* output:
  - kw_data
  - day_data
  - time_by_day

## Read counties

In [2]:
import pickle as pkl
# Load the pickled `counties` object. Later cells index it by county id and
# read the "region" and "testpoints" entries of each county.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this must only ever be pointed at a trusted, locally produced pickle.
with open('../data/counties/counties.pkl', "rb") as f:
    counties = pkl.load(f)

## Read data

In [3]:
disease = "covid19"
prediction_region = "germany"
def load_daily_data(disease, prediction_region, counties, seperator=","):
    """
    Read the per-county table of one disease and restrict it to a region.

    The file "../data/diseases/<disease>.csv" is read with its first column
    as the index. A column labelled "99999" is discarded if present. Only
    columns whose county entry lists `prediction_region` in its "region"
    field are kept, and every index label is converted to a pd.Timestamp.

    Parameters:
        disease: file stem of the CSV to read.
        prediction_region: value that must be contained in
            counties[county_id]["region"] for a column to be kept.
        counties: mapping county_id -> record with a "region" entry.
        seperator: field separator handed to pd.read_csv.

    Returns:
        DataFrame with timestamps as index and the selected county ids as
        columns.
    """
    csv_path = "../data/diseases/{}.csv".format(disease)
    frame = pd.read_csv(csv_path, sep=seperator,
                        encoding='iso-8859-1', index_col=0)

    # Discard the "99999" column before the county lookup below.
    if "99999" in frame.columns:
        frame = frame.drop(columns="99999")

    selected = [cid for cid in frame.columns
                if prediction_region in counties[cid]["region"]]
    frame = frame.loc[:, selected]
    frame.index = [pd.Timestamp(label) for label in frame.index]

    return frame
indata = load_daily_data(disease, prediction_region, counties)
data = indata
data

Unnamed: 0,03159,09576,07334,06631,10046,01058,03459,05316,15089,04011,...,11003,11004,11006,11007,11008,11009,11010,11011,11012,11005
2020-01-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create times_by_day dictionary

In [4]:
import datetime
from collections import OrderedDict

# Seeded generator used by uniform_times_by_day, so that runs are repeatable.
rnd_tsel = np.random.Generator(np.random.PCG64(12345))


def uniform_times_by_day(days, n=10):
    """
    Draw n random timepoints inside each given day.

    For every day the span between datetime.time.min and datetime.time.max
    of that day is scaled by n uniform random fractions taken from the
    module-level generator `rnd_tsel`.

    Parameters:
        days: iterable of dates (e.g. pd.Timestamp objects).
        n: number of timepoints to draw per day.

    Returns:
        OrderedDict mapping each day to an array of n timepoints.
    """
    sampled = OrderedDict()
    for current_day in days:
        day_start = datetime.datetime.combine(current_day, datetime.time.min)
        day_end = datetime.datetime.combine(current_day, datetime.time.max)
        fractions = rnd_tsel.random(n)
        sampled[current_day] = fractions * (day_end - day_start) + day_start
    return sampled

times_by_day=uniform_times_by_day(data.index)
#times_by_day

## Create locations_by_county dictionary

In [5]:
from collections import OrderedDict

# Seeded generator used by uniform_locations_by_county (repeatable runs).
rnd_csel = np.random.Generator(np.random.PCG64(12345))


def uniform_locations_by_county(counties, n=5):
    """
    Pick n testpoints for every county.

    A county that has exactly n testpoints keeps its array unchanged.
    Otherwise n row indices are drawn from the module-level generator
    `rnd_csel`; the draw is with replacement only when the county has fewer
    than n testpoints.

    Parameters:
        counties: mapping county_id -> record with a "testpoints" array.
        n: number of locations to keep per county.

    Returns:
        OrderedDict mapping county_id to the selected testpoint rows.
    """
    picked = OrderedDict()
    for county_id, county in counties.items():
        testpoints = county["testpoints"]
        available = len(testpoints)
        if available == n:
            picked[county_id] = testpoints
            continue
        rows = rnd_csel.choice(testpoints.shape[0], n, replace=available < n)
        picked[county_id] = testpoints[rows]
    return picked

locations_by_county=uniform_locations_by_county(counties)
#locations_by_county

# B - Do the Sampling (the old way)

In [6]:
# set seed to check results
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))
rnd_time_pred = np.random.Generator(np.random.PCG64(12345))
rnd_loc_pred  = np.random.Generator(np.random.PCG64(12345))

# random generators:
# MT19937, PCG64, Philox, SFC64 - https://numpy.org/devdocs/reference/random/bit_generators/index.html

In [7]:
#%%timeit
# loop over all days of all counties
# and draw per day n-times a random time from times_by_day[day]

def sample_time_and_space(data, times_by_day, locations_by_county, rnd_t, rnd_l):
    """
    Draw one random time and one random location per counted case.

    Walks over all columns (counties) of `data` and, inside each column,
    over all rows (days). A cell holding the count n yields n samples: n
    times picked from times_by_day[day] and n locations picked from
    locations_by_county[county_id]. Indices are computed as
    floor(number_of_choices * uniform_random) instead of Generator.choice so
    that the random stream can be reproduced by the vectorised variant.

    Parameters:
        data: DataFrame of non-negative integer counts (index = days,
            columns = county ids).
        times_by_day: mapping day -> array of candidate times.
        locations_by_county: mapping county_id -> array with one (x, y)
            row per candidate location.
        rnd_t: numpy Generator used for the time indices.
        rnd_l: numpy Generator used for the location indices.

    Returns:
        (t_all, x_all): object array of shape (n_total,) with the drawn
        times and float array of shape (n_total, 2) with the drawn
        locations, ordered county by county and day by day.
    """
    # int() keeps np.empty happy even if the double sum comes back as a float.
    n_total = int(data.sum().sum())
    t_all = np.empty((n_total,), dtype=object)
    x_all = np.empty((n_total, 2))

    i = 0
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for county_id, series in data.items():
        for day, n in series.items():
            # draw n random times
            times = times_by_day[day]
            idx = np.floor(len(times) * rnd_t.random((n,))).astype("int32")
            t_all[i:i + n] = times[idx]

            # draw n random locations
            locs = locations_by_county[county_id]
            idx = np.floor(locs.shape[0] * rnd_l.random((n,))).astype("int32")
            x_all[i:i + n, :] = locs[idx, :]

            i += n

    return t_all, x_all

#num_features = len(temporal_bfs(tt.fmatrix("tmp"))) * len(spatial_bfs(tt.fmatrix("tmp")))
#res = np.zeros( (len(days), len(counties), num_features), dtype=np.float32)

# Accumulators holding every sample of the reference ("old") algorithm so the
# optimised variant can be compared against them afterwards.
t_data_0 = []
x_data_0 = []
t_pred_0 = []
x_pred_0 = []

num_tps=5
d_offs=0 # just to limit the time of test
c_offs=0 # just to limit the time of test
days = data.index[d_offs:d_offs+50]
# NOTE: this rebinds `counties` from the county mapping loaded at the top of
# the notebook to the selected column labels of `data`.
counties = data.columns[c_offs:c_offs+50]
for i, day in enumerate(days):
    # Rows of the five days preceding `day`; the mask does not depend on the
    # county, so it is computed once per day instead of once per county.
    idx = ((day - pd.Timedelta(days=5)) <= data.index) & (data.index < day)
    window = data.iloc[idx]

    for j, county in enumerate(counties):
        t_data, x_data = sample_time_and_space(window, times_by_day, locations_by_county, rnd_time, rnd_loc)
        t_pred, x_pred = sample_time_and_space(pd.DataFrame(num_tps, index=[day], columns=[county]), times_by_day, locations_by_county, rnd_time_pred, rnd_loc_pred)

        #res[i, j, :] = ia_bfs(_to_timestamp(t_pred), x_pred, _to_timestamp(t_data), x_data)

        # store all to compare with old algo
        # extend() appends in place; `lst = lst + new` copied the whole
        # accumulator on every iteration.
        t_data_0.extend(t_data.tolist())
        x_data_0.extend(x_data.tolist())
        t_pred_0.extend(t_pred.tolist())
        x_pred_0.extend(x_pred.tolist())

######## output ########
#display(t_data_0[:2])
#display(x_data_0[:2])
#display(t_pred_0[:2])
#display(x_pred_0[:2])

# C - Do the Sampling (the NEW way)

---
---
## C3 - COMPACT result
* requires (A) to be finished -> data, times_by_day

In [13]:
def sample_time_and_space__once(times_by_day, locations_by_county):
    """
    Convert the lookup dictionaries to arrays for faster access.

    Random access into times_by_day and locations_by_county is costly, so
    both are converted once into arrays that are addressed by position
    (day number / county number) instead of by key.

    Parameters:
        times_by_day: mapping day -> array of candidate times. The rows are
            laid out in the iteration order of the mapping.
            NOTE(review): presumably every day must offer the same number of
            times, otherwise the table gets padded with missing values that
            the float conversion cannot handle - confirm before relying on it.
        locations_by_county: mapping county_id -> array with one (x, y) row
            per candidate location, in the iteration order of the mapping.

    Returns:
        (times_by_day_np, locations_by_county_np):
        times_by_day_np[day_no][k] is the k-th time of that day as a float
        produced by datetime.datetime.timestamp;
        locations_by_county_np[county_no][k] is the k-th (x, y) location of
        that county. Counties with fewer locations than the largest one are
        padded with NaN rows.
    """
    # times_by_day_np[day-id] => times[n_times]
    times_by_day_np = pd.DataFrame.from_dict(times_by_day, orient='index').to_numpy(dtype='datetime64')

    # datetime64 -> pandas Timestamp -> float, element by element. The
    # datetime.datetime.timestamp function is called explicitly, exactly as
    # the comparison cells do for the reference results.
    to_pd_timestamp = np.frompyfunc(pd.Timestamp, 1, 1)
    to_epoch_seconds = np.frompyfunc(datetime.datetime.timestamp, 1, 1)
    times_by_day_np = to_epoch_seconds(to_pd_timestamp(times_by_day_np))

    # locations_by_county_np[county-id] => locs[m_locs[x,y]]
    max_coords = max((len(locs) for locs in locations_by_county.values()), default=0)
    locations_by_county_np = np.full([len(locations_by_county), max_coords, 2], np.nan, dtype='float64')
    for i, locs in enumerate(locations_by_county.values()):
        # Fill only the rows this county really has. Assigning the whole
        # slab failed whenever a county had fewer than max_coords points.
        if len(locs):
            locations_by_county_np[i, :len(locs)] = locs

    return (times_by_day_np, locations_by_county_np)

In [14]:
def sample_time_and_space__prep(times_by_day_np, locations_by_county_np, data, idx):
    """
    Precompute the per-sample helper lists for one fixed dataframe.

    Building these helpers is costly; as long as `data` does not change,
    the returned values can be reused for every sampling call.

    Samples are ordered column by column (county) and, inside a column, row
    by row (day); a cell holding the count n contributes n consecutive
    samples.

    Parameters:
        times_by_day_np: not used by this function (kept for the callers).
        locations_by_county_np: not used by this function (kept for the
            callers).
        data: DataFrame of non-negative integer counts (index = days,
            columns = county ids).
        idx: boolean mask over the rows of the full input table that
            selected `data`; only the position of its first True is used.

    Returns:
        Tuple (n_total, dayoffset, day_of_smpl, av_times_per_smpl,
        cnty_of_smpl, av_locs_per_smpl) where
        n_total is the sum of all counts,
        dayoffset is the position of the first True in idx,
        day_of_smpl / cnty_of_smpl list the row / column number inside
        `data` of every sample, and
        av_times_per_smpl / av_locs_per_smpl list how many times / locations
        are available for every sample.

    Raises:
        IndexError: if idx contains no True entry.

    Note:
        The numbers of available times and locations are read from the
        module-level dictionaries `times_by_day` and `locations_by_county`,
        not from the two array parameters.
    """
    # `data` is likely to skip the first days (rows) of the full input table,
    # but the time table covers the whole input, so an offset has to be added
    # when it is addressed by the row number inside `data`.
    dayoffset = np.flatnonzero(idx)[0]
    n_total = data.sum().sum()

    # number of samples per county-day, county after county (column-major)
    smpls_per_cntyday = np.asarray(data.values).flatten('F')

    n_days = len(data.index)
    n_cntys = len(data.columns)

    ######## t_all ########

    # day number of every county-day, then repeated once per sample
    day_of_cntyday = np.tile(np.arange(n_days), n_cntys)
    day_of_smpl = np.repeat(day_of_cntyday, smpls_per_cntyday).tolist()

    # number of available times for every sample
    time_of_days = data.index.tolist() # keep the pandas labels used as dictionary keys
    av_times_per_day = [len(times_by_day[d]) for d in time_of_days]
    av_times_per_smpl = np.repeat(np.tile(av_times_per_day, n_cntys), smpls_per_cntyday).tolist()

    ######## x_all ########

    # county number of every county-day, then repeated once per sample
    cnty_of_cntyday = np.repeat(np.arange(n_cntys), n_days)
    cnty_of_smpl = np.repeat(cnty_of_cntyday, smpls_per_cntyday).tolist()

    # number of available locations for every sample
    av_locs_per_cnty = [len(locations_by_county[c]) for c in data.columns]
    av_locs_per_smpl = np.repeat(np.repeat(av_locs_per_cnty, n_days), smpls_per_cntyday).tolist()

    return (n_total, dayoffset,
            day_of_smpl, av_times_per_smpl,
            cnty_of_smpl, av_locs_per_smpl)

In [15]:
def sample_time_and_space__pred(n_days, n_counties, d_offs, c_offs, num_tps, av_times_per_smpl, av_locs_per_smpl, rnd_time, rnd_loc):
    """
    Draw the prediction samples for all days and counties in one go.

    Every (day, county) cell receives num_tps samples. Samples are ordered
    day by day, inside a day county by county, and inside a cell by sample
    number. The times and locations are read from the module-level arrays
    `times_by_day_np` and `locations_by_county_np`.

    Parameters:
        n_days, n_counties: number of days / counties to cover.
        d_offs, c_offs: row offsets added to the day / county number when
            addressing the two module-level arrays.
        num_tps: number of samples per (day, county) cell.
        av_times_per_smpl: number of available times for every sample.
        av_locs_per_smpl: number of available locations for every sample.
        rnd_time, rnd_loc: numpy Generators for the time / location indices.

    Returns:
        (t_all, x_all): two lists with n_days * n_counties * num_tps entries
        each, holding the drawn time and the drawn location of every sample.
    """
    n_total = n_days * n_counties * num_tps

    ######## t_all ########

    # random time index for every sample
    rnd_timeid_per_smpl = np.floor( av_times_per_smpl * rnd_time.random( n_total ) ).astype("int32")

    # walk the samples in storage order; `pos` is the flat sample number
    t_all = []
    pos = 0
    for i in range(n_days):
        for j in range(n_counties):
            for x in range(num_tps):
                t_all.append(times_by_day_np[d_offs+i][rnd_timeid_per_smpl[pos]])
                pos += 1

    ######## x_all ########

    # random location index for every sample
    rnd_locid_per_smpl = np.floor( av_locs_per_smpl * rnd_loc.random((n_total,)) ).astype("int32")

    x_all = []
    pos = 0
    for i in range(n_days):
        for j in range(n_counties):
            for x in range(num_tps):
                x_all.append(locations_by_county_np[c_offs+j][rnd_locid_per_smpl[pos]])
                pos += 1

    return t_all, x_all

In [16]:
def sample_time_and_space(n_counties, n_total, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl, rnd_time, rnd_loc):
    """
    Draw the time and space samples of one dataframe n_counties times.

    One large array of random numbers is drawn per quantity and combined
    with the precomputed per-sample helpers to pick the samples. The result
    consists of n_counties consecutive repetitions (each with fresh random
    numbers) of the n_total samples described by the helpers. Times and
    locations are read from the module-level arrays `times_by_day_np` and
    `locations_by_county_np`.

    Parameters:
        n_counties: number of repetitions of the sample set.
        n_total: number of samples in one repetition.
        dayoffset: offset added to a day number when addressing
            `times_by_day_np`.
        day_of_smpl: day number of every sample.
        av_times_per_smpl: number of available times for every sample.
        cnty_of_smpl: county number of every sample.
        av_locs_per_smpl: number of available locations for every sample.
        rnd_time, rnd_loc: numpy Generators for the time / location indices.

    Returns:
        (t_all, x_all): two lists with the drawn time and the drawn location
        of every sample; repetition j occupies the entries
        j*n_total .. (j+1)*n_total - 1.
    """
    n_all = n_total * n_counties

    ######## t_all ########

    # random time index for every sample of every repetition
    time_choices = np.tile(av_times_per_smpl, n_counties)
    time_ids = np.floor( time_choices * rnd_time.random( (n_all,) ) ).astype("int32")

    t_all = []
    for j in range(n_counties):
        base = j*n_total
        for (i, day) in enumerate(day_of_smpl):
            t_all.append(times_by_day_np[day+dayoffset][time_ids[base+i]])

    ######## x_all ########

    # random location index for every sample of every repetition
    loc_choices = np.tile(av_locs_per_smpl, n_counties)
    loc_ids = np.floor( loc_choices * rnd_loc.random( (n_all,) ) ).astype("int32")

    x_all = []
    for j in range(n_counties):
        base = j*n_total
        for (i, cnty) in enumerate(cnty_of_smpl):
            x_all.append(locations_by_county_np[cnty][loc_ids[base+i]])

    return t_all, x_all
    

In [17]:
#%%timeit
# set seed to check results
# Parallel Random Number Generation - https://docs.scipy.org/doc/numpy/reference/random/parallel.html
# Multithreaded Generation - https://docs.scipy.org/doc/numpy/reference/random/multithreading.html
# Same seeds as for the reference run, so both runs draw identical numbers.
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))
rnd_time_pred = np.random.Generator(np.random.PCG64(12345))
rnd_loc_pred  = np.random.Generator(np.random.PCG64(12345))

# Convert dictionaries to arrays for faster access in sample_time_and_space().
(times_by_day_np, locations_by_county_np,) = sample_time_and_space__once(times_by_day, locations_by_county)

# Accumulators holding every sample of the optimised ("new") algorithm.
t_data_1 = []
x_data_1 = []
t_pred_1 = []
x_pred_1 = []

d_offs=0 # just to limit the time of test
c_offs=0 # just to limit the time of test
days = data.index[d_offs:d_offs+50]
counties = data.columns[c_offs:c_offs+50]

num_tps=5
n_days = len(days)
n_counties = len(counties)

# create dataframe with 'num_tps' in each cell
pred_data = pd.DataFrame(num_tps, index=days, columns=counties)
# all rows selected -> the prediction table starts at the first day
idx = np.ones(len(data.index), dtype='bool')

# precalculate prediction values
(n_total, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl,) = sample_time_and_space__prep(times_by_day_np, locations_by_county_np, pred_data, idx)
(t_pred_all, x_pred_all,) = sample_time_and_space__pred(n_days, n_counties, d_offs, c_offs, num_tps, av_times_per_smpl, av_locs_per_smpl, rnd_time_pred, rnd_loc_pred)

for i, day in enumerate(days):

    # calc which sub-table will be selected (the five days preceding `day`)
    idx = ((day - pd.Timedelta(days=5)) <= data.index) & (data.index < day)
    subdata = data.iloc[idx]
    has_history = subdata.size != 0

    if has_history:
        # Precalculations for the fixed dataframe `subdata`.
        (n_total, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl,) = sample_time_and_space__prep(times_by_day_np, locations_by_county_np, subdata, idx)

        # Calculate time and space samples for all counties at once
        (t_data_all, x_data_all,) = sample_time_and_space(len(counties), n_total, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl, rnd_time, rnd_loc)

    for j, county in enumerate(counties):

        if has_history:
            # get subarray for county==j
            t_data = t_data_all[j*n_total:(j+1)*n_total] # [county][smpl]
            x_data = x_data_all[j*n_total:(j+1)*n_total] # [county][smpl]
        else:
            t_data = []
            x_data = []

        # calcs only for the single DataFrame.cell[day][county]
        offs = (i*n_counties+j)*num_tps
        t_pred = t_pred_all[offs:offs+num_tps]
        x_pred = x_pred_all[offs:offs+num_tps]

        # define theano.function for day==i and county==j
        #res[i, j, :] = ia_bfs(t_pred, x_pred, t_data, x_data)

        # store all to compare with old algo
        # extend() appends in place; `lst = lst + new` copied the whole
        # accumulator on every iteration.
        t_data_1.extend(t_data)
        x_data_1.extend(x_data)
        t_pred_1.extend(t_pred)
        x_pred_1.extend(x_pred)

######## output ########
#display(t_data_1[:2])
#display(x_data_1[:2])
#display(t_pred_1[:2])
#display(x_pred_1[:2])

# D - Compare results (MUST be the same)
* requirements
  * t_res must be uncommented in both algorithms
  * replace 'rnd_time.choice' with np.floor(...) to use same random numbers

## 1 - print 'data' result and type of OLD algorithm

In [18]:
print("t_data_0 (type, size, value): ", type(t_data_0[0]), np.shape(t_data_0), t_data_0[0])
print("x_data_0 (type, size, value): ", type(x_data_0[0][0]), np.shape(x_data_0), x_data_0[0][0])

t_data_0 (type, size, value):  <class 'datetime.datetime'> (1572300,) 2020-01-28 19:08:12.375513
x_data_0 (type, size, value):  <class 'float'> (1572300, 2) 11.389656225805421


In [19]:
_to_timestamp_0 = np.frompyfunc(datetime.datetime.timestamp, 1, 1)
t_data_old = _to_timestamp_0( t_data_0 )
x_data_old = x_data_0
print("t_data_old (types, type1, size, value): ", type(t_data_old), type(t_data_old[0]), np.shape(t_data_old), t_data_old[0])
print("x_data_old (types, size, value)       : ", type(x_data_old), type(x_data_old[0]), type(x_data_old[0][0]), np.shape(x_data_old), x_data_old[0][0])
display(t_data_old[:10])
display(x_data_old[:10])

t_data_old (types, type1, size, value):  <class 'numpy.ndarray'> <class 'float'> (1572300,) 1580234892.375513
x_data_old (types, size, value)       :  <class 'list'> <class 'list'> <class 'float'> (1572300, 2) 11.389656225805421


array([1580234892.375513, 1580224428.403552, 1580182133.833636,
       1580217693.876309, 1580224428.403552, 1580224428.403552,
       1580194755.123367, 1580193367.920551, 1580217693.876309,
       1580247371.767558], dtype=object)

[[11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.129027578958399, 51.57865700695229],
 [7.129027578958399, 51.57865700695229],
 [11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.002386813736268, 51.62250254585719],
 [7.025599010880636, 51.524092548810025],
 [11.32335488551304, 47.92160103315107],
 [11.115394883162168, 48.04390714570248]]

## 2 - print 'data' result and type of NEW algorithm

In [20]:
t_data_new = t_data_1
x_data_new = x_data_1
print("t_data_new (types, type1, size, value): ", type(t_data_new), type(t_data_new[0]), np.shape(t_data_new), t_data_new[0])
print("x_data_new (types, size, value)       : ", type(x_data_new), type(x_data_new[0]), type(x_data_new[0][0]), np.shape(x_data_new), x_data_new[0][0])
display(t_data_new[:10])
display(x_data_new[:10])

t_data_new (types, type1, size, value):  <class 'list'> <class 'float'> (1572300,) 1580234892.375513
x_data_new (types, size, value)       :  <class 'list'> <class 'numpy.ndarray'> <class 'numpy.float64'> (1572300, 2) 11.389656225805421


[1580234892.375513,
 1580224428.403552,
 1580182133.833636,
 1580217693.876309,
 1580224428.403552,
 1580224428.403552,
 1580194755.123367,
 1580193367.920551,
 1580217693.876309,
 1580247371.767558]

[array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.12902758, 51.57865701]),
 array([ 7.12902758, 51.57865701]),
 array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.00238681, 51.62250255]),
 array([ 7.02559901, 51.52409255]),
 array([11.32335489, 47.92160103]),
 array([11.11539488, 48.04390715])]

## 3 - check if result is the same

In [21]:
np.array_equal(t_data_old, t_data_new)

True

In [22]:
np.array_equal(x_data_old, x_data_new)

True

## 4 - print 'pred' result and type of OLD algorithm

In [23]:
print("t_pred_0 (type, size, value): ", type(t_pred_0[0]), np.shape(t_pred_0), t_pred_0[0])
print("x_pred_0 (type, size, value): ", type(x_pred_0[0][0]), np.shape(x_pred_0), x_pred_0[0][0])

t_pred_0 (type, size, value):  <class 'datetime.datetime'> (12500,) 2020-01-28 19:08:12.375513
x_pred_0 (type, size, value):  <class 'float'> (12500, 2) 10.435944369180099


In [24]:
_to_timestamp_0 = np.frompyfunc(datetime.datetime.timestamp, 1, 1)
t_pred_old = _to_timestamp_0( t_pred_0 ).tolist()
x_pred_old = x_pred_0
print("t_pred_old (types, type1, size, value): ", type(t_pred_old), type(t_pred_old[0]), np.shape(t_pred_old), t_pred_old[0])
print("x_pred_old (types, size, value)       : ", type(x_pred_old), type(x_pred_old[0]), type(x_pred_old[0][0]), np.shape(x_pred_old), x_pred_old[0][0])
display(t_pred_old[24:30])
display(x_pred_old[:10])

t_pred_old (types, type1, size, value):  <class 'list'> <class 'float'> (12500,) 1580234892.375513
x_pred_old (types, size, value)       :  <class 'list'> <class 'list'> <class 'float'> (12500, 2) 10.435944369180099


[1580234892.375513,
 1580224126.122202,
 1580193367.920551,
 1580193367.920551,
 1580185641.832341,
 1580194755.123367]

[[10.435944369180099, 51.69958916804793],
 [10.435944369180099, 51.69958916804793],
 [10.134378974970323, 51.51153765399198],
 [10.134378974970323, 51.51153765399198],
 [10.435944369180099, 51.69958916804793],
 [10.97023632180951, 49.35209111265112],
 [11.18855933984982, 49.23071192781685],
 [11.163806617939409, 49.10176138258148],
 [11.170639882542568, 49.20834421943156],
 [11.129581675596537, 49.30549971169858]]

## 5 - print 'pred' result and type of NEW algorithm

In [25]:
t_pred_new = t_pred_1
x_pred_new = x_pred_1
print("t_pred_new (types, type1, size, value): ", type(t_pred_new), type(t_pred_new[0]), np.shape(t_pred_new), t_pred_new[0])
print("x_pred_new (types, size, value)       : ", type(x_pred_new), type(x_pred_new[0]), type(x_pred_new[0][0]), np.shape(x_pred_new), x_pred_new[0][0])
display(t_pred_new[24:30])
display(x_pred_new[:10])

t_pred_new (types, type1, size, value):  <class 'list'> <class 'float'> (12500,) 1580234892.375513
x_pred_new (types, size, value)       :  <class 'list'> <class 'numpy.ndarray'> <class 'numpy.float64'> (12500, 2) 10.435944369180099


[1580234892.375513,
 1580224126.122202,
 1580193367.920551,
 1580193367.920551,
 1580185641.832341,
 1580194755.123367]

[array([10.43594437, 51.69958917]),
 array([10.43594437, 51.69958917]),
 array([10.13437897, 51.51153765]),
 array([10.13437897, 51.51153765]),
 array([10.43594437, 51.69958917]),
 array([10.97023632, 49.35209111]),
 array([11.18855934, 49.23071193]),
 array([11.16380662, 49.10176138]),
 array([11.17063988, 49.20834422]),
 array([11.12958168, 49.30549971])]

## 6 - check if result is the same

In [26]:
np.array_equal(t_pred_old, t_pred_new)

True

In [27]:
np.array_equal(x_pred_old, x_pred_new)

True