# Optimization - STEP 1 
* sample_time_and_space() - inner function call optimization
  - no for-loops (just list-comprehension)
    - reduce calls to np.random functions drastically
      - replace rng.choice(l, n) by np.floor(l * rng.random((n,))).astype("int32")
  - no single-value-operations
    - arrays everywhere
  - separate precalculations from calculations

In [1]:
import numpy as np
import pandas as pd

# A - Read/Create Input Data
* output:
  - kw_data
  - day_data
  - time_by_day

## Read counties

In [2]:
import pickle as pkl
# Load the county metadata dictionary from a local pickle file; later cells
# read the "region" and "testpoints" entries of each county from it.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
with open('../data/counties/counties.pkl', "rb") as f:
    counties = pkl.load(f)

## Read data

In [3]:
disease = "covid19"
prediction_region = "germany"
def load_daily_data(disease, prediction_region, counties, seperator=","):
    """Load the daily table of one disease, restricted to one region.

    Reads ``../data/diseases/<disease>.csv`` with the first column as index,
    removes the column labelled ``"99999"`` when it exists, keeps only the
    columns whose entry in ``counties`` contains ``prediction_region`` in its
    ``"region"`` field, and converts every index label to ``pd.Timestamp``.

    Parameters
    ----------
    disease : str
        Base name of the CSV file (without extension).
    prediction_region : str
        Value looked up in ``counties[column]["region"]``.
    counties : mapping
        Maps each column label of the CSV to a dict with a ``"region"`` entry.
    seperator : str, optional
        Field delimiter handed to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The filtered table with a timestamp index.
    """
    csv_path = "../data/diseases/{}.csv".format(disease)
    table = pd.read_csv(csv_path, sep=seperator,
                        encoding='iso-8859-1', index_col=0)

    if "99999" in table.columns:
        table = table.drop("99999", axis=1)

    # keep a column only if its county belongs to the requested region
    wanted = [cid for cid in table.columns
              if prediction_region in counties[cid]["region"]]
    table = table.loc[:, wanted]
    table.index = [pd.Timestamp(label) for label in table.index]

    return table
indata = load_daily_data(disease, prediction_region, counties)
indata

Unnamed: 0,03159,09576,07334,06631,10046,01058,03459,05316,15089,04011,...,11003,11004,11006,11007,11008,11009,11010,11011,11012,11005
2020-01-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Select SubSet

In [4]:
# `day` is the (exclusive) end row of the sub-table; the mask marks the rows
# of the five days immediately before it.
day = indata.index[10]
idx = (indata.index >= day - pd.Timedelta(days=5)) & (indata.index < day)
idx

array([False, False, False, False, False,  True,  True,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [5]:
data = indata #.iloc[50:55,1:5] #idx]
display(data)

Unnamed: 0,03159,09576,07334,06631,10046,01058,03459,05316,15089,04011,...,11003,11004,11006,11007,11008,11009,11010,11011,11012,11005
2020-01-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-02-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create times_by_day dictionary

In [6]:
import datetime
from collections import OrderedDict

rnd_tsel = np.random.Generator(np.random.PCG64(12345))

def uniform_times_by_day(days, n=10):
    """Draw n uniformly distributed candidate timepoints inside every day.

    For each entry of ``days`` the interval from ``datetime.time.min`` to
    ``datetime.time.max`` of that day is scaled by ``n`` random numbers taken
    from the module-level generator ``rnd_tsel``.

    Returns an OrderedDict that maps each day (in input order) to an array
    holding the ``n`` resulting datetime objects.
    """
    first_instant = datetime.time.min
    last_instant = datetime.time.max
    sampled = OrderedDict()
    for current in days:
        day_start = datetime.datetime.combine(current, first_instant)
        day_span = datetime.datetime.combine(current, last_instant) - day_start
        fractions = rnd_tsel.random(n)  # one draw of n values per day
        sampled[current] = fractions * day_span + day_start
    return sampled

times_by_day=uniform_times_by_day(data.index)
#times_by_day

## Create locations_by_county dictionary

In [7]:
from collections import OrderedDict

rnd_csel = np.random.Generator(np.random.PCG64(12345))

def uniform_locations_by_county(counties, n=5):
    """Pick n candidate locations for every county.

    Each county's ``"testpoints"`` array is used unchanged when it already has
    exactly ``n`` rows. Otherwise ``n`` row indices are drawn with the
    module-level generator ``rnd_csel`` - without replacement when enough rows
    exist, with replacement when the county has fewer than ``n`` rows.

    Returns an OrderedDict mapping county id -> array of selected testpoints.
    """
    picked = OrderedDict()
    for key in counties:
        points = counties[key]["testpoints"]
        available = len(points)
        if available != n:
            rows = rnd_csel.choice(points.shape[0], n, replace=available < n)
            points = points[rows]
        picked[key] = points
    return picked

locations_by_county=uniform_locations_by_county(counties)
#locations_by_county

# B - Do the Sampling (the old way)

In [8]:
# Total number of samples to draw = sum over all entries of `data`.
n_total = data.sum().sum()
# Preallocated outputs of the reference implementation: one object slot per
# sample for the drawn time, two float columns per sample for the location.
t_all_old = np.empty((n_total,), dtype=object)
x_all_old = np.empty((n_total, 2))

# set seed to check results
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))

# random generators:
# MT19937, PCG64, Philox, SFC64 - https://numpy.org/devdocs/reference/random/bit_generators/index.html

In [9]:
#%%timeit
# loop over all days of all counties
# and draw per day n-times a random time from times_by_day[day]
# Reference implementation: visit every county (column) and every day (row)
# and draw, per county-day, n times and n locations, where n is the table entry.
i = 0
# .items() replaces .iteritems(), which was removed in pandas 2.0
for (county_id, series) in data.items():
    for (day, n) in series.items():
        # draw n random times out of the candidates prepared for this day;
        # floor(len * U[0, 1)) is used instead of rnd_time.choice(len(times), n)
        # so that the random stream is consumed exactly like in the optimized
        # version and both results can be compared
        times = times_by_day[day]
        idx = np.floor(len(times) * rnd_time.random((n,))).astype("int32")
        t_all_old[i:i + n] = times[idx]

        # draw n random locations out of the candidates of this county
        # (same replacement for rnd_loc.choice(locs.shape[0], n) as above)
        locs = locations_by_county[county_id]
        idx = np.floor(locs.shape[0] * rnd_loc.random((n,))).astype("int32")
        x_all_old[i:i + n, :] = locs[idx, :]

        i += n

display(t_all_old[:10])
display(x_all_old[:10])

array([datetime.datetime(2020, 3, 12, 21, 7, 49, 600516),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 12, 15, 43, 30, 954482),
       datetime.datetime(2020, 3, 12, 17, 29, 43, 252998),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 13, 8, 30, 29, 252590),
       datetime.datetime(2020, 3, 14, 0, 12, 46, 58058),
       datetime.datetime(2020, 3, 14, 23, 14, 47, 486243),
       datetime.datetime(2020, 3, 15, 20, 35, 56, 182337),
       datetime.datetime(2020, 3, 15, 6, 7, 16, 294807)], dtype=object)

array([[10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [10.22557886, 51.50146323],
       [10.22557886, 51.50146323],
       [10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [ 9.94686559, 51.56576792],
       [ 9.90706472, 51.52194244],
       [10.22557886, 51.50146323],
       [10.433864  , 51.71695713]])

# C - Do the Sampling (NEW and DETAILED explanation)

In [10]:
n_total = data.sum().sum()
n_total

243092

## C1 - Time Sampling

### 1) Erstelle einmalig(!) Liste, die jedem Sample eine Day-Id zuweist

---
#### smpls_per_cntyday (FIX)
* Erstellen einer List über alle __County-Days__  
mit der jeweiligen gewünschten __Sample-Anzahl__:

In [11]:
smpls_per_cntyday = np.array(data.values).flatten('F') # number of samples per countyday (flatten per column first)
smpls_per_cntyday

array([0, 0, 0, ..., 6, 0, 0], dtype=int64)

---
#### day_of_cntyday (FIX)
* Erstellen einer Liste über __County-Days__  
mit der jeweiligen __Day-Id__:

In [12]:
dayids = np.arange(len(data.index))
day_of_cntyday = np.tile(dayids, len(data.columns)) # list of day-ids of all countydays
day_of_cntyday

array([  0,   1,   2, ..., 216, 217, 218])

---
#### day_of_smpl (FIX) <- day_of_cntyday <- smpls_per_cntyday 
* Erstelle Liste über alle __Sample__  
mit dem jeweiligen __Day__:

In [13]:
#day_of_smpl = np.empty((n_total,), dtype=int)
#ii = 0
#for (i,smpls) in enumerate(smpl_per_cntyday):
#    dayid = dayids_of_cntydays[i] # get day-id of each cntyday
#    for x in range(smpls):
#        day_of_smpl[ii] = dayid
#        ii = ii+1
# Expand the day-ids from one entry per county-day to one entry per sample:
# np.repeat copies day_of_cntyday[k] exactly smpls_per_cntyday[k] times, which
# gives the same array as a nested Python loop but runs inside numpy.
day_of_smpl = np.repeat(day_of_cntyday, smpls_per_cntyday)
day_of_smpl

array([ 44,  44,  44, ..., 216, 216, 216])

---
### 2) Erstelle einmalige(!) Liste, die jeder Tag-Id einen Tag-Zeitstempel zuweist

#### time_of_days (FIX)
* Erstellen einer List über __Days__  
mit dem jeweiligen __Day-Zeitstempel__:
* um im Dictionary times_by_day die gewünschte List auswählen zu können

In [14]:
time_of_days = data.index.tolist() # list of timestamps of days
#time_of_days

### 3) Erstelle einmalig(!) eine Liste, die jedem Sample die Anzahl an möglichen Zeitstempeln zuweist, aus denen es gezogen wird

---
#### av_times_per_day (FIX)
* Erstellen einer List über die __Days__  
mit der jeweilige verfügbaren __Zeitstempel-Anzahl__:

In [15]:
#av_times_per_day = np.empty((len(data.index)), dtype=int)
#for i,d in enumerate(time_of_days):
#   av_times_per_day[i] = len(times_by_day[d])
av_times_per_day = np.array([len(times_by_day[d]) for d in time_of_days])
av_times_per_day

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

---
#### av_times_per_smpl (FIX)
* Erstellen einer Liste über alle __Sample__  
mit der jeweiligen verfügbaren __Zeitstempel-Anzahl__ für den entsprechenden __Day__

In [16]:
#av_times_per_smpl = np.empty((n_total,), dtype=int)
#print(n_total)
#ii = 0
#for (i,smpls) in enumerate(smpl_per_cntyday):
#    dayid = dayids_of_cntydays[i] # get day-id of each cntyday
#    for x in range(smpls):
#        av_times_per_smpl[ii] = av_times_per_day[dayid]
#        ii = ii+1
# Number of candidate timestamps for every sample: look up the count of each
# county-day's day, then repeat it once per sample of that county-day.
av_times_per_smpl = np.repeat(av_times_per_day[day_of_cntyday], smpls_per_cntyday)
av_times_per_smpl

array([10, 10, 10, ..., 10, 10, 10])

### 4) Ziehe neue Time-Ids für jedes Sample

---
#### smpl_rndtime_all (! performance relevant !)
* Frage nach Zufallszahlen für jedes Sample

In [17]:
rnd_time = np.random.Generator(np.random.PCG64(12345))

smpl_rndtime_all = rnd_time.random((n_total,))
smpl_rndtime_all

array([0.22733602, 0.31675834, 0.79736546, ..., 0.5892803 , 0.24731683,
       0.83689275])

---
#### rnd_timeid_per_smpl (! performance relevant !)
* Erstelle List über alle __Sample__  
mit der jeweiligen __Zufalls-ZeitstempelIds__:

In [18]:
rnd_timeid_per_smpl = av_times_per_smpl * smpl_rndtime_all
rnd_timeid_per_smpl = np.floor(rnd_timeid_per_smpl).astype("int32")
rnd_timeid_per_smpl

array([2, 3, 7, ..., 5, 2, 8])

---
### 5) Wähle zufälligen Zeitstempel für jedes __Sample__  
* beachte: der zufällige Zeitstempel muss aus den möglichen Zeitstempeln gezogen werden, die für den entsprechenden Tag festgelegt wurden

In [19]:
#t_all = np.empty((n_total,), dtype=object)
#for (i,day) in enumerate(day_of_smpl):
#    t_all[i] = times_by_day[time_of_days[day]][rnd_timeid_per_sample[i]]
# For every sample: take the candidate times of the sample's day and pick the
# entry selected by the sample's random time-id.
t_all_1 = np.array([
    times_by_day[time_of_days[day_id]][time_id]
    for day_id, time_id in zip(day_of_smpl, rnd_timeid_per_smpl)
])
display(t_all_1[:10])

array([datetime.datetime(2020, 3, 12, 21, 7, 49, 600516),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 12, 15, 43, 30, 954482),
       datetime.datetime(2020, 3, 12, 17, 29, 43, 252998),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 13, 8, 30, 29, 252590),
       datetime.datetime(2020, 3, 14, 0, 12, 46, 58058),
       datetime.datetime(2020, 3, 14, 23, 14, 47, 486243),
       datetime.datetime(2020, 3, 15, 20, 35, 56, 182337),
       datetime.datetime(2020, 3, 15, 6, 7, 16, 294807)], dtype=object)

## C2 - Location Sampling

### 1) Erstelle einmalig(!) Liste, die jedem Sample eine County-Id zuweist

---
#### cnty_of_cntyday (FIX)
* Erstellen einer Liste über __County-Days__  
mit der jeweiligen __County-Id__:

In [20]:
cntyids = np.arange(len(data.columns))
cntyids_1 = cntyids

In [21]:
cnty_of_cntyday = np.repeat(cntyids, len(data.index)) # list of county-ids of all countydays (column-major)
cnty_of_cntyday_1 = cnty_of_cntyday

---
#### cnty_of_smpl (FIX) <- cnty_of_cntyday <- smpls_per_cntyday 
* Erstelle Liste über alle __Sample__  
mit dem jeweiligen __County__:

In [22]:
#cnty_of_smpl = np.empty((n_total,), dtype=int)
#ii = 0
#for (i,smpls) in enumerate(smpl_per_cntyday):
#    cntyid = cnty_of_cntydays[i] # get county-id of each cntyday
#    for x in range(smpls):
#        day_of_smpl[ii] = cntyid
#        ii = ii+1
# Expand the county-ids from one entry per county-day to one entry per sample:
# np.repeat copies cnty_of_cntyday[k] exactly smpls_per_cntyday[k] times.
cnty_of_smpl = np.repeat(cnty_of_cntyday, smpls_per_cntyday)
cnty_of_smpl_1 = cnty_of_smpl

---
### 2) Erstelle einmalige(!) Liste, die jeder County-Id einen County-Label zuweist

#### label_of_cntys (FIX)
* Erstellen einer List über __Countys__  
mit dem jeweiligen __County-Label__:
* um im Dictionary locations_by_county die gewünschte List auswählen zu können

In [23]:
label_of_cntys = np.array(data.columns) # list of countys labels
label_of_cntys_1 = label_of_cntys

### 3) Erstelle einmalige(!) Liste, die jedem Sample die Anzahl an möglichen Ortskoordinaten zuweist, aus denen es gezogen wird

---
#### av_locs_per_county (FIX)
* Erstellen einer List über die __Countys__  
mit der jeweilige verfügbaren __Ortskoordinaten-Anzahl__:

In [24]:
#av_locs_per_county = np.empty((len(data.columns)), dtype=int)
#for c in label_of_cntys:
#   av_locs_per_county[i] = len(locations_by_county[c])
av_locs_per_cnty = np.array([len(locations_by_county[c]) for c in label_of_cntys])
av_locs_per_cnty_1 = av_locs_per_cnty

---
#### av_locs_per_smpl (FIX)
* Erstellen einer Liste über alle __Sample__  
mit der jeweiligen verfügbaren __Ortskoordinaten-Anzahl__ für den entsprechenden __County__

In [25]:
#av_times_per_smpl = np.empty((n_total,), dtype=int)
#print(n_total)
#ii = 0
#for (i,smpls) in enumerate(smpl_per_cntyday):
#    cntyid = cnty_of_cntydays[i] # get day-id of each cntyday
#    for x in range(smpls):
#        av_locs_per_smpl[ii] = av_locs_per_cnty[cntyid]
#        ii = ii+1
# Number of candidate locations for every sample: look up the count of each
# county-day's county, then repeat it once per sample of that county-day.
av_locs_per_smpl = np.repeat(av_locs_per_cnty[cnty_of_cntyday], smpls_per_cntyday)
av_locs_per_smpl_1 = av_locs_per_smpl

### 4) Ziehe neue Loc-Ids für jedes Sample

---
#### smpl_rndloc_all (! performance relevant !)
* Frage nach Zufallszahlen für jedes Sample

In [26]:
rnd_loc  = np.random.Generator(np.random.PCG64(12345))

smpl_rndloc_all = rnd_loc.random((n_total,))
smpl_rndloc_all_1 = smpl_rndloc_all

---
#### rnd_locid_per_smpl (! performance relevant !)
* Erstelle List über alle __Sample__  
mit der jeweiligen __Zufalls-Ortskoordinaten-Ids__:

In [27]:
rnd_locid_per_smpl = av_locs_per_smpl * smpl_rndloc_all
rnd_locid_per_smpl = np.floor(rnd_locid_per_smpl).astype("int32")
rnd_locid_per_smpl_1 = rnd_locid_per_smpl

---
### 5) Wähle zufälligen Ortskoordinate für jedes __Sample__  
* beachte: die zufällige Ortskoordinate muss aus den möglichen Ortskoordinaten gezogen werden, die für die entsprechende Region festgelegt wurden

In [28]:
#x_all = np.empty((n_total, 2))
#for (i,cnty) in enumerate(cnty_of_smpl):
#    x_all[i] = locations_by_county[label_of_cntys[cnty]][rnd_locid_per_smpl[i]]
# For every sample: take the candidate locations of the sample's county and
# pick the row selected by the sample's random location-id.
x_all_1 = np.array([
    locations_by_county[label_of_cntys[cnty_id]][loc_id]
    for cnty_id, loc_id in zip(cnty_of_smpl, rnd_locid_per_smpl)
])
display(x_all_1[:10])

array([[10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [10.22557886, 51.50146323],
       [10.22557886, 51.50146323],
       [10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [ 9.94686559, 51.56576792],
       [ 9.90706472, 51.52194244],
       [10.22557886, 51.50146323],
       [10.433864  , 51.71695713]])

---
---
## C3 - COMPACT result
* requires (A) to be finished -> data, times_by_day, locations_by_county

In [29]:
def sample_time_and_space__prep(data=None, times_by_day=None, locations_by_county=None):
    """
    Precalculations for sample_time_and_space() on a fixed dataframe.

    The helper arrays only depend on the table of sample counts and on the
    sizes of the candidate lists, not on the random numbers. As long as the
    dataframe does not change they can be computed once and reused.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Number of samples per day (rows) and county (columns).
    times_by_day : mapping, optional
        Maps every index label of ``data`` to its candidate times.
    locations_by_county : mapping, optional
        Maps every column label of ``data`` to its candidate locations.

    Any argument left as None is read from the module-level variable of the
    same name, so the former zero-argument call keeps working.

    Returns
    -------
    tuple
        ``(n_total, day_of_smpl, time_of_days, av_times_per_smpl,
        cnty_of_smpl, label_of_cntys, av_locs_per_smpl)`` where the
        ``*_of_smpl`` / ``*_per_smpl`` arrays hold one entry per sample,
        ordered county by county and, within a county, day by day.

    Raises
    ------
    ValueError
        If ``data`` contains a negative count.
    """
    # fall back to the notebook-level variables for omitted arguments
    if data is None:
        data = globals()["data"]
    if times_by_day is None:
        times_by_day = globals()["times_by_day"]
    if locations_by_county is None:
        locations_by_county = globals()["locations_by_county"]

    n_total = data.sum().sum()

    # number of samples per county-day, column by column ('F' order)
    smpls_per_cntyday = data.values.flatten('F')
    if (smpls_per_cntyday < 0).any():
        raise ValueError("sample counts must be non-negative")

    n_days = len(data.index)
    n_cntys = len(data.columns)

    ######## t_all ########

    # day-id of every county-day, then repeated once per sample
    day_of_cntyday = np.tile(np.arange(n_days), n_cntys)
    day_of_smpl = np.repeat(day_of_cntyday, smpls_per_cntyday)

    # number of candidate times of every sample's day
    time_of_days = data.index.tolist()  # kept as a list so the labels stay pandas timestamps
    av_times_per_day = np.array([len(times_by_day[d]) for d in time_of_days])
    av_times_per_smpl = av_times_per_day[day_of_smpl]

    ######## x_all ########

    # county-id of every county-day, then repeated once per sample
    cnty_of_cntyday = np.repeat(np.arange(n_cntys), n_days)
    cnty_of_smpl = np.repeat(cnty_of_cntyday, smpls_per_cntyday)

    # number of candidate locations of every sample's county
    label_of_cntys = np.array(data.columns)
    av_locs_per_cnty = np.array([len(locations_by_county[c]) for c in label_of_cntys])
    av_locs_per_smpl = av_locs_per_cnty[cnty_of_smpl]

    return (n_total,
            day_of_smpl, time_of_days, av_times_per_smpl,
            cnty_of_smpl, label_of_cntys, av_locs_per_smpl)

In [30]:
def sample_time_and_space(n_total, day_of_smpl, time_of_days, av_times_per_smpl,
                          cnty_of_smpl, label_of_cntys, av_locs_per_smpl,
                          rnd_time, rnd_loc,
                          times_by_day=None, locations_by_county=None):
    """
    Draw one random time and one random location for every sample.

    Parameters
    ----------
    n_total : int
        Number of samples.
    day_of_smpl, cnty_of_smpl : sequence of int
        Per sample, the position of its day in ``time_of_days`` and of its
        county in ``label_of_cntys``.
    time_of_days, label_of_cntys : sequence
        Keys into ``times_by_day`` and ``locations_by_county``.
    av_times_per_smpl, av_locs_per_smpl : array of int
        Per sample, the number of candidate times / locations to choose from.
    rnd_time, rnd_loc : np.random.Generator
        Each supplies ``n_total`` uniform numbers through ``.random()``.
    times_by_day, locations_by_county : mapping, optional
        Candidate times per day and candidate locations per county. When left
        as None the module-level variable of the same name is used, so
        existing nine-argument calls keep working.

    Returns
    -------
    (t_all, x_all)
        Arrays with the selected time and the selected location per sample.

    Note: every key in ``time_of_days`` / ``label_of_cntys`` must be present
    in the corresponding mapping, also those no sample refers to.
    """
    # fall back to the notebook-level variables for omitted arguments
    if times_by_day is None:
        times_by_day = globals()["times_by_day"]
    if locations_by_county is None:
        locations_by_county = globals()["locations_by_county"]

    ######## t_all ########

    # random candidate index per sample: floor(#candidates * U[0, 1))
    rnd_timeid_per_smpl = np.floor(av_times_per_smpl * rnd_time.random((n_total,))).astype("int32")

    # resolve the dictionary lookups once per day instead of once per sample
    times_per_day = [times_by_day[d] for d in time_of_days]
    t_all = np.array([times_per_day[day][time_id]
                      for day, time_id in zip(day_of_smpl, rnd_timeid_per_smpl)])

    ######## x_all ########

    # random candidate index per sample: floor(#candidates * U[0, 1))
    rnd_locid_per_smpl = np.floor(av_locs_per_smpl * rnd_loc.random((n_total,))).astype("int32")

    # resolve the dictionary lookups once per county instead of once per sample
    locs_per_cnty = [locations_by_county[c] for c in label_of_cntys]
    x_all = np.array([locs_per_cnty[cnty][loc_id]
                      for cnty, loc_id in zip(cnty_of_smpl, rnd_locid_per_smpl)])

    return t_all, x_all
    

In [31]:
# set seed to check results
# Parallel Random Number Generation - https://docs.scipy.org/doc/numpy/reference/random/parallel.html
# Multithreaded Generation - https://docs.scipy.org/doc/numpy/reference/random/multithreading.html
rnd_time = np.random.Generator(np.random.PCG64(12345))
rnd_loc  = np.random.Generator(np.random.PCG64(12345))

# Precalculations for sample_time_and_space() on a fixed dataframe.
n_total, day_of_smpl, time_of_days, av_times_per_smpl, cnty_of_smpl, label_of_cntys, av_locs_per_smpl = sample_time_and_space__prep()

# Calculate time and space samples
t_all, x_all = sample_time_and_space(n_total, day_of_smpl, time_of_days, av_times_per_smpl, cnty_of_smpl, label_of_cntys, av_locs_per_smpl, rnd_time, rnd_loc)

######## output ########

# comparison with the original algorithm: first intermediate values when computing t_all_old:
#day, n, len(times), rnds, idx:  2020-03-12 00:00:00 4 10 [0.42456225 0.50906649 0.17990126 0.72574739] [4 5 1 7]
#day, n, len(times), rnds, idx:  2020-03-14 00:00:00 2 10 [0.16849485 0.14487249] [1 1]
#day, n, len(times), rnds, idx:  2020-03-15 00:00:00 2 10 [0.53578134 0.88086702] [5 8]
#day, n, len(times), rnds, idx:  2020-03-16 00:00:00 4 10 [0.17153944 0.99667843 0.33747713 0.63443554] [1 9 3 6]
#day, n, len(times), rnds, idx:  2020-03-17 00:00:00 5 10 [0.76530828 0.93646131 0.87875071 0.06091913 0.80040787] [7 9 8 0 8]
display(t_all[:10])

# comparison with the original algorithm: first intermediate values when computing x_all_old:
#day, n, locs.shape[0], rnds, idx:  2020-03-12 00:00:00 4 5 [0.22733602 0.31675834 0.79736546 0.67625467] [1 1 3 3]
#day, n, locs.shape[0], rnds, idx:  2020-03-14 00:00:00 2 5 [0.39110955 0.33281393] [1 1]
#day, n, locs.shape[0], rnds, idx:  2020-03-15 00:00:00 2 5 [0.59830875 0.18673419] [2 0]
#day, n, locs.shape[0], rnds, idx:  2020-03-16 00:00:00 4 5 [0.67275604 0.94180287 0.24824571 0.94888115] [3 4 1 4]
#day, n, locs.shape[0], rnds, idx:  2020-03-17 00:00:00 5 5 [0.66723745 0.09589794 0.44183967 0.88647992 0.6974535 ] [3 0 2 4 3]
display(x_all[:10])

array([datetime.datetime(2020, 3, 12, 21, 7, 49, 600516),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 12, 15, 43, 30, 954482),
       datetime.datetime(2020, 3, 12, 17, 29, 43, 252998),
       datetime.datetime(2020, 3, 12, 2, 58, 4, 805697),
       datetime.datetime(2020, 3, 13, 8, 30, 29, 252590),
       datetime.datetime(2020, 3, 14, 0, 12, 46, 58058),
       datetime.datetime(2020, 3, 14, 23, 14, 47, 486243),
       datetime.datetime(2020, 3, 15, 20, 35, 56, 182337),
       datetime.datetime(2020, 3, 15, 6, 7, 16, 294807)], dtype=object)

array([[10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [10.22557886, 51.50146323],
       [10.22557886, 51.50146323],
       [10.13994319, 51.57935343],
       [10.13994319, 51.57935343],
       [ 9.94686559, 51.56576792],
       [ 9.90706472, 51.52194244],
       [10.22557886, 51.50146323],
       [10.433864  , 51.71695713]])

In [32]:
# compare if detailed explained algorithm gives the same result for 't_all'
np.array_equal(t_all_1, t_all)

True

In [33]:
# compare if detailed explained algorithm gives the same result for 'x_all'
np.array_equal(x_all_1, x_all)

True

In [34]:
# compare if old algorithm gives the same result for 't_all'
np.array_equal(t_all_old, t_all)

True

In [35]:
# compare if old algorithm gives the same result for 'x_all'
np.array_equal(x_all_old, x_all)

True

In [43]:
print(np.shape(x_all), type(x_all), type(x_all[0]), type(x_all[0][0]))
print(np.shape(x_all_1), type(x_all_1), type(x_all_1[0]), type(x_all_1[0][0]))
print(np.shape(x_all_old), type(x_all_old), type(x_all_old[0]), type(x_all_old[0][0]))
print(np.shape(x_all), type(x_all), type(x_all[0]), type(x_all[0][0]))

(243092, 2) <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.float64'>
(243092, 2) <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.float64'>
(243092, 2) <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.float64'>
(243092, 2) <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.float64'>
