# Optimization Strategy - STEP 2
* IAEffectSampler - outer function call optimization
  - precalculation in outer loop & reuse for whole inner loop
* other optimizations
  - no use of dictionaries in performance-relevant code
    - translate OrderedDicts times_by_day, locations_by_county to arrays
* FAILED optimization ideas
  - numpy everywhere
    - python lists are twice as fast in this scenario
  - np.fromiter for better preallocation of memory
    - it seems that creating the iterator is more costly than copying the list

In [1]:
import numpy as np
import pandas as pd

# A - Read/Create Input Data
* output:
  - kw_data
  - day_data
  - time_by_day

## Read counties

## Create times_by_day dictionary

In [2]:
import pickle as pkl
# load the county metadata; later cells read counties[county_id]["region"]
# and county["testpoints"] from it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open('../data/counties/counties.pkl', "rb") as f:
    counties = pkl.load(f)

## Read data

In [6]:
# name of the disease; selects the CSV file "../data/diseases/<disease>.csv"
disease = "covid19"
# only counties whose "region" entry contains this value are kept
prediction_region = "germany"
def load_daily_data(disease, prediction_region, counties, seperator=","):
    """
    Read the daily case table of one disease and restrict it to one region.

    The table is read from "../data/diseases/<disease>.csv"; its first column
    becomes the index, every other column is one county id.

    Args:
        disease: base name of the CSV file (without extension).
        prediction_region: value looked up (with `in`) in each county's
            "region" entry; only matching counties are kept.
        counties: mapping county-id -> dict with at least a "region" entry.
        seperator: column delimiter of the CSV file.

    Returns:
        DataFrame with one row per date (index converted to pd.Timestamp)
        and one column per selected county.
    """
    csv_path = "../data/diseases/{}.csv".format(disease)
    frame = pd.read_csv(csv_path, sep=seperator, encoding='iso-8859-1', index_col=0)

    # the column "99999" is not a county of interest and is discarded
    if "99999" in frame.columns:
        frame = frame.drop("99999", axis=1)

    selected = [
        cid for cid in frame.columns
        if prediction_region in counties[cid]["region"]
    ]
    frame = frame.loc[:, selected]

    # the CSV stores dates as text; downstream code expects pd.Timestamp labels
    frame.index = [pd.Timestamp(label) for label in frame.index]
    return frame
# load the full table once; `indata` keeps the original, `data` is the
# working alias used by the cells below
indata = load_daily_data(disease, prediction_region, counties)
data = indata
data

Unnamed: 0,03159,09576,07334,06631,10046,01058,03459,05316,15089,04011,...,11003,11004,11006,11007,11008,11009,11010,11011,11012,11005
2020-01-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import datetime
from collections import OrderedDict

# fixed seed, so the sampled time points are reproducible
rnd_tsel = np.random.Generator(np.random.PCG64(12345))


def uniform_times_by_day(days, n=10):
    """
    Sample n uniformly distributed time points within each given day.

    Args:
        days: iterable of day values accepted by datetime.datetime.combine
            (e.g. the pd.Timestamp labels of the data index).
        n: number of time points drawn per day.

    Returns:
        OrderedDict mapping each day (in input order) to an array of n time
        points between 00:00:00 and 23:59:59.999999 of that day.
        Draws come from the module-level generator `rnd_tsel`.
    """
    sampled = OrderedDict()
    first_time, last_time = datetime.time.min, datetime.time.max
    for day in days:
        day_start = datetime.datetime.combine(day, first_time)
        day_span = datetime.datetime.combine(day, last_time) - day_start
        # scale uniform draws from [0, 1) onto the day and shift to its start
        sampled[day] = rnd_tsel.random(n) * day_span + day_start
    return sampled

# one entry per row (day) of `data`, in index order
times_by_day=uniform_times_by_day(data.index)
#times_by_day

## Create locations_by_county dictionary

In [13]:
from collections import OrderedDict

# fixed seed, so the selected test points are reproducible
rnd_csel = np.random.Generator(np.random.PCG64(12345))


def uniform_locations_by_county(counties, n=5):
    """
    Select n test points for every county.

    Args:
        counties: mapping county-id -> dict whose "testpoints" entry is an
            array with one row per point.
        n: number of points to keep per county.

    Returns:
        OrderedDict county-id -> array of n points, in the iteration order of
        `counties`. A county that already has exactly n points is passed
        through unchanged; otherwise rows are drawn with `rnd_csel.choice`,
        with replacement only when the county has fewer than n points.
    """
    picked = OrderedDict()
    for county_id, county in counties.items():
        points = county["testpoints"]
        n_points = len(points)
        if n_points == n:
            picked[county_id] = points
            continue
        rows = rnd_csel.choice(points.shape[0], n, replace=n_points < n)
        picked[county_id] = points[rows]
    return picked

# one entry per county, in the iteration order of `counties`
locations_by_county=uniform_locations_by_county(counties)
#locations_by_county

# B - Do the Sampling (the old way)

In [106]:
# set seed to check results: both generators start from a fixed seed so the
# baseline output can be compared element-wise with the optimized variant
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))

# random generators:
# MT19937, PCG64, Philox, SFC64 - https://numpy.org/devdocs/reference/random/bit_generators/index.html

In [107]:
#%%timeit
# loop over all days of all counties
# and draw per day n-times a random time from times_by_day[day]

def sample_time_and_space(data, times_by_day, locations_by_county):
    """
    Baseline sampler: draw one time and one location for every reported case.

    Iterates over all counties (columns) and, within each county, over all
    days (rows). For a cell with n cases, n times are picked from
    times_by_day[day] and n locations from locations_by_county[county_id].
    Random numbers come from the module-level generators `rnd_time` and
    `rnd_loc`.

    This is the unoptimized reference implementation; the per-cell dictionary
    lookups are kept on purpose so it stays comparable with the optimized one.

    Args:
        data: DataFrame of case counts, rows = days, columns = county ids.
        times_by_day: mapping day -> array of candidate time points.
        locations_by_county: mapping county-id -> array of candidate points
            with two coordinates per row.

    Returns:
        (t_all, x_all): object array with one time per case and float array
        of shape (n_cases, 2) with one location per case, ordered
        county-major, day-minor.
    """
    n_total = int(data.sum().sum())
    t_all = np.empty((n_total,), dtype=object)
    x_all = np.empty((n_total, 2))

    i = 0
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for county_id, series in data.items():
        for day, n in series.items():
            # draw n random times: floor(len * u) with u in [0, 1) is used
            # instead of `rnd_time.choice` so the optimized variant can
            # consume the identical random number stream
            times = times_by_day[day]
            idx = np.floor(len(times) * rnd_time.random((n,))).astype("int32")
            t_all[i:i + n] = times[idx]

            # draw n random locations, same index construction as above
            locs = locations_by_county[county_id]
            idx = np.floor(locs.shape[0] * rnd_loc.random((n,))).astype("int32")
            x_all[i:i + n, :] = locs[idx, :]

            i += n

    return t_all, x_all


# baseline run: collects every sample of the old algorithm so it can be
# compared with the optimized one. Left unoptimized on purpose.
t_res_0 = []
x_res_0 = []
days = indata.index[:20]
# NOTE(review): this rebinds `counties` from the county dict to a column
# index; re-running the "locations_by_county" cell afterwards will fail.
counties = indata.columns[:20]
for i, day in enumerate(days):
    # `county` is not used in the body: the same 5-day window is sampled
    # once per county -- presumably to mimic the call pattern of the real
    # sampler; verify against IAEffectSampler.
    for j, county in enumerate(counties):
        # boolean mask of the 5 days preceding `day` (excluding `day` itself)
        idx = ((day - pd.Timedelta(days=5)) <= data.index) * (data.index < day)

        t_data, x_data = sample_time_and_space(data.iloc[idx], times_by_day, locations_by_county)

        # store all to compare with the new algo
        t_res_0 = t_res_0 + t_data.tolist()
        x_res_0 = x_res_0 + x_data.tolist()

######## output ########
display(t_res_0[:10])
display(x_res_0[:10])

[datetime.datetime(2020, 1, 28, 19, 8, 12, 375513),
 datetime.datetime(2020, 1, 28, 16, 13, 48, 403552),
 datetime.datetime(2020, 1, 28, 4, 28, 53, 833636),
 datetime.datetime(2020, 1, 28, 14, 21, 33, 876309),
 datetime.datetime(2020, 1, 28, 16, 13, 48, 403552),
 datetime.datetime(2020, 1, 28, 16, 13, 48, 403552),
 datetime.datetime(2020, 1, 28, 7, 59, 15, 123367),
 datetime.datetime(2020, 1, 28, 7, 36, 7, 920551),
 datetime.datetime(2020, 1, 28, 14, 21, 33, 876309),
 datetime.datetime(2020, 1, 28, 22, 36, 11, 767558)]

[[11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.129027578958399, 51.57865700695229],
 [7.129027578958399, 51.57865700695229],
 [11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.002386813736268, 51.62250254585719],
 [7.025599010880636, 51.524092548810025],
 [11.32335488551304, 47.92160103315107],
 [11.115394883162168, 48.04390714570248]]

# C - Do the Sampling (the NEW way)

---
---
## C3 - COMPACT result
* requires (A) to be finished -> data, times_by_day

In [108]:
def sample_time_and_space__once(times_by_day, locations_by_county):
    """
    Convert the lookup dictionaries to arrays for faster access in sample_time_and_space().

    Random access into times_by_day and locations_by_county is costly, so both
    are converted once and later accessed by position instead of by key.

    Args:
        times_by_day: ordered mapping day -> sequence of time points
            (anything pd.Timestamp accepts, e.g. datetime or datetime64).
        locations_by_county: ordered mapping county-id -> array of points
            with two coordinates per row.

    Returns:
        (times_by_day_np, locations_by_county_np)
        * times_by_day_np[day-id][time-id] is a POSIX timestamp (float) held
          in an object array; rows follow the key order of times_by_day.
          The conversion uses datetime.datetime.timestamp, which interprets
          naive times in the local timezone of the machine.
        * locations_by_county_np[county-id][loc-id] is an (x, y) pair in a
          float64 array; rows follow the key order of locations_by_county.
          Counties with fewer points than the largest county are padded with
          NaN rows (callers must only index below the county's own count).
    """
    def _to_posix(moment):
        # unbound datetime method on purpose: pd.Timestamp overrides
        # .timestamp(), and the baseline comparison uses the datetime variant
        return datetime.datetime.timestamp(pd.Timestamp(moment))

    # times_by_day_np[day-id] => times[n_times]
    time_rows = [[_to_posix(t) for t in times] for times in times_by_day.values()]
    if time_rows:
        times_by_day_np = np.array(time_rows, dtype=object)
    else:
        times_by_day_np = np.empty((0, 0), dtype=object)

    # locations_by_county_np[county-id] => locs[m_locs[x,y]]
    max_coords = max((len(points) for points in locations_by_county.values()), default=0)
    locations_by_county_np = np.full(
        (len(locations_by_county), max_coords, 2), np.nan, dtype='float64')
    # counties keep their dictionary order; slice assignment lets counties
    # with fewer points than max_coords fit without a broadcast error
    for i, points in enumerate(locations_by_county.values()):
        locations_by_county_np[i, :len(points)] = points

    return (times_by_day_np, locations_by_county_np)

#def test_sample_time_and_space__once(times_by_day, locations_by_county):
#    """
#    """
#    times_by_day_np, locations_by_county_np = sample_time_and_space__once(times_by_day, locations_by_county)
#
#    # times_by_day_np[day-id] => times[n_times]
#    for idx, t_np in enumerate(times_by_day_np):
#        t_dict = times_by_day[day(idx)]
#        # compare


In [109]:
def sample_time_and_space__prep(times_by_day_np, locations_by_county_np, data, idx):
    """
    Precalculations for one fixed sub-table, to be reused by sample_time_and_space().

    Building the helper lists is costly. As long as the sub-table does not
    change, the values returned here can be reused for every sampling call.

    Args:
        times_by_day_np: accepted for interface compatibility, not read here.
        locations_by_county_np: accepted for interface compatibility, not
            read here.
        data: sub-table of case counts (rows = days, columns = county ids).
        idx: boolean mask over the rows of the full table that selected `data`.

    Returns:
        Tuple (n_total, n_smpls, dayoffset, day_of_smpl, av_times_per_smpl,
        cnty_of_smpl, av_locs_per_smpl):
        * n_total: total number of samples (sum of all counts).
        * n_smpls: number of county-day cells of the sub-table.
        * dayoffset: row of the full table where the sub-table starts.
        * day_of_smpl: per sample, day position within the sub-table.
        * av_times_per_smpl: per sample, number of candidate times of its day.
        * cnty_of_smpl: per sample, position of its county in the key order
          of `locations_by_county` (i.e. its row in the location array built
          from that dictionary).
        * av_locs_per_smpl: per sample, number of candidate locations of its
          county.
        Samples are ordered county-major, day-minor.

    Note:
        Candidate counts and county positions are read from the module-level
        dictionaries `times_by_day` and `locations_by_county`.
    """

    # subdata 'data' of 'indata' is likely to skip a few first days (rows) in 'indata',
    # but as the time array represents the whole 'indata', an offset needs to be
    # considered when accessing it
    dayoffset = np.flatnonzero(idx)[0]

    n_total = data.sum().sum()

    # get number of samples per county-day (column-major: county after county)
    smpls_per_cntyday = np.array(data.values).flatten('F')
    n_smpls = smpls_per_cntyday.size

    n_days = len(data.index)
    n_cntys = len(data.columns)

    ######## t_all ########

    # day-id of every county-day, then repeated once per sample of that cell
    day_of_cntyday = np.tile(np.arange(n_days), n_cntys)
    day_of_smpl = list(np.repeat(day_of_cntyday, smpls_per_cntyday))

    # number of available times of every day, expanded to one entry per sample
    av_times_per_day = np.array(
        [len(times_by_day[d]) for d in data.index.tolist()], dtype=int)
    av_times_per_smpl = np.repeat(
        av_times_per_day[day_of_cntyday], smpls_per_cntyday).tolist()

    ######## x_all ########

    # the location array is ordered like the keys of locations_by_county, which
    # need not match the column order of 'data' -> translate label to row
    row_of_label = {label: row for row, label in enumerate(locations_by_county)}
    cnty_rows = np.array([row_of_label[c] for c in data.columns], dtype=int)

    # county row of every county-day, then repeated once per sample of that cell
    cnty_of_cntyday = np.repeat(cnty_rows, n_days)
    cnty_of_smpl = list(np.repeat(cnty_of_cntyday, smpls_per_cntyday))

    # number of available locations of every county, expanded to one entry per sample
    av_locs_per_cnty = np.array(
        [len(locations_by_county[c]) for c in data.columns], dtype=int)
    av_locs_per_smpl = np.repeat(
        np.repeat(av_locs_per_cnty, n_days), smpls_per_cntyday).tolist()

    return (n_total, n_smpls, dayoffset,
            day_of_smpl, av_times_per_smpl,
            cnty_of_smpl, av_locs_per_smpl)

In [110]:
def sample_time_and_space(n_total, n_smpls, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl, rnd_time, rnd_loc):
    """
    Draw one time and one location per sample from the precomputed arrays.

    One block of random numbers is generated per dimension and turned into
    indices; the actual values are then looked up in the module-level arrays
    `times_by_day_np` and `locations_by_county_np`.

    Args:
        n_total: number of samples to draw.
        n_smpls: not used by this function.
        dayoffset: row of the time array that corresponds to day-id 0.
        day_of_smpl: per sample, day-id relative to `dayoffset`.
        av_times_per_smpl: per sample, number of candidate times.
        cnty_of_smpl: per sample, row of its county in the location array.
        av_locs_per_smpl: per sample, number of candidate locations.
        rnd_time: generator used for the time indices.
        rnd_loc: generator used for the location indices.

    Returns:
        (t_all, x_all): list with one time per sample and list with one
        location (array row) per sample.
    """

    ######## t_all ########

    # floor(count * u) with u in [0, 1) gives a uniform index in [0, count)
    time_draws = rnd_time.random((n_total,))
    time_ids = np.floor(av_times_per_smpl * time_draws).astype("int32")

    # look up the drawn time of every sample in the row of its day
    t_all = [
        times_by_day_np[day + dayoffset][pick]
        for day, pick in zip(day_of_smpl, time_ids)
    ]

    ######## x_all ########

    # same index construction for the locations
    loc_draws = rnd_loc.random((n_total,))
    loc_ids = np.floor(av_locs_per_smpl * loc_draws).astype("int32")

    # look up the drawn location of every sample in the row of its county
    x_all = [
        locations_by_county_np[cnty][pick]
        for cnty, pick in zip(cnty_of_smpl, loc_ids)
    ]

    return t_all, x_all


In [111]:
#%%timeit
# set seed to check results
# Parallel Random Number Generation - https://docs.scipy.org/doc/numpy/reference/random/parallel.html
# Multithreaded Generation - https://docs.scipy.org/doc/numpy/reference/random/multithreading.html
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))

# Convert dictionaries to arrays for faster access in sample_time_and_space().
(times_by_day_np, locations_by_county_np,) = sample_time_and_space__once(times_by_day, locations_by_county)

t_res_1 = []
x_res_1 = []
days = data.index[:20]
# NOTE(review): this rebinds `counties` from the county dict to a column
# index; re-running the "locations_by_county" cell afterwards will fail.
counties = data.columns[:20]

for day in days:

    # calc which sub-table will be selected: the 5 days preceding `day`
    idx = ((day - pd.Timedelta(days=5)) <= data.index) & (data.index < day)
    subdata = data.iloc[idx]
    if subdata.size == 0:
        continue

    # Precalculations for the fixed sub-table, reused by every call below.
    (n_total, n_smpls, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl,) = sample_time_and_space__prep(times_by_day_np, locations_by_county_np, subdata, idx)

    # `county` is not used in the body: the same sub-table is sampled once
    # per county, matching the call pattern of the baseline loop
    for county in counties:

        # Calculate time and space samples
        (t_data, x_data,) = sample_time_and_space(n_total, n_smpls, dayoffset, day_of_smpl, av_times_per_smpl, cnty_of_smpl, av_locs_per_smpl, rnd_time, rnd_loc)

        # store all to compare with old algo; extend() appends in place
        # instead of copying the whole result list on every iteration
        t_res_1.extend(t_data)
        x_res_1.extend(x_data)

######## output ########
display(t_res_1[:10])
display(x_res_1[:10])

[1580234892.375513,
 1580224428.403552,
 1580182133.833636,
 1580217693.876309,
 1580224428.403552,
 1580224428.403552,
 1580194755.123367,
 1580193367.920551,
 1580217693.876309,
 1580247371.767558]

[array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.12902758, 51.57865701]),
 array([ 7.12902758, 51.57865701]),
 array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.00238681, 51.62250255]),
 array([ 7.02559901, 51.52409255]),
 array([11.32335489, 47.92160103]),
 array([11.11539488, 48.04390715])]

# D - Compare results (MUST be the same)
* requirements
  * t_res must be uncommented in both algorithms
  * replace 'rnd_time.choice' with np.floor(...) to use same random numbers

## 1 - print result and type of OLD algorithm

In [112]:
# element type, overall shape and first value of the baseline results
print("t_res_0 (type, size, value): ", type(t_res_0[0]), np.shape(t_res_0), t_res_0[0])
print("x_res_0 (type, size, value): ", type(x_res_0[0][0]), np.shape(x_res_0), x_res_0[0][0])

t_res_0 (type, size, value):  <class 'datetime.datetime'> (1980,) 2020-01-28 19:08:12.375513
x_res_0 (type, size, value):  <class 'float'> (1980, 2) 11.389656225805421


In [113]:
# convert the baseline datetimes to POSIX timestamps with the same function
# the optimized variant uses, so both results are directly comparable
# (datetime.datetime.timestamp treats naive datetimes as local time)
_to_timestamp_0 = np.frompyfunc(datetime.datetime.timestamp, 1, 1)
t_res_old = _to_timestamp_0( t_res_0 )
x_res_old = x_res_0
print("t_res_old (types, type1, size, value): ", type(t_res_old), type(t_res_old[0]), np.shape(t_res_old), t_res_old[0])
print("x_res_old (types, size, value)       : ", type(x_res_old), type(x_res_old[0]), type(x_res_old[0][0]), np.shape(x_res_old), x_res_old[0][0])
display(t_res_old[:10])
display(x_res_old[:10])

t_res_old (types, type1, size, value):  <class 'numpy.ndarray'> <class 'float'> (1980,) 1580234892.375513
x_res_old (types, size, value)       :  <class 'list'> <class 'list'> <class 'float'> (1980, 2) 11.389656225805421


array([1580234892.375513, 1580224428.403552, 1580182133.833636,
       1580217693.876309, 1580224428.403552, 1580224428.403552,
       1580194755.123367, 1580193367.920551, 1580217693.876309,
       1580247371.767558], dtype=object)

[[11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.129027578958399, 51.57865700695229],
 [7.129027578958399, 51.57865700695229],
 [11.389656225805421, 48.06570349652414],
 [11.061510398161968, 48.111771343898525],
 [7.002386813736268, 51.62250254585719],
 [7.025599010880636, 51.524092548810025],
 [11.32335488551304, 47.92160103315107],
 [11.115394883162168, 48.04390714570248]]

## 2 - print result and type of NEW algorithm

In [114]:
# the optimized variant already returns timestamps, no conversion needed
t_res_new = t_res_1
x_res_new = x_res_1
print("t_res_new (types, type1, size, value): ", type(t_res_new), type(t_res_new[0]), np.shape(t_res_new), t_res_new[0])
print("x_res_new (types, size, value)       : ", type(x_res_new), type(x_res_new[0]), type(x_res_new[0][0]), np.shape(x_res_new), x_res_new[0][0])
display(t_res_new[:10])
display(x_res_new[:10])

t_res_new (types, type1, size, value):  <class 'list'> <class 'float'> (1980,) 1580234892.375513
x_res_new (types, size, value)       :  <class 'list'> <class 'numpy.ndarray'> <class 'numpy.float64'> (1980, 2) 11.389656225805421


[1580234892.375513,
 1580224428.403552,
 1580182133.833636,
 1580217693.876309,
 1580224428.403552,
 1580224428.403552,
 1580194755.123367,
 1580193367.920551,
 1580217693.876309,
 1580247371.767558]

[array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.12902758, 51.57865701]),
 array([ 7.12902758, 51.57865701]),
 array([11.38965623, 48.0657035 ]),
 array([11.0615104 , 48.11177134]),
 array([ 7.00238681, 51.62250255]),
 array([ 7.02559901, 51.52409255]),
 array([11.32335489, 47.92160103]),
 array([11.11539488, 48.04390715])]

## 4 - check if result is the same

In [115]:
# sampled times of both variants must match element by element
np.array_equal(t_res_old, t_res_new)

True

In [116]:
# sampled locations of both variants must match element by element
np.array_equal(x_res_old, x_res_new)

True