In [1]:
%env PURE_PYTHON True
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime
from BTrees.OOBTree import OOBTree
import numpy as np
from collections import Counter
from scipy import stats
import pandas as pd
import pprint
import timeit

# Upper bound on how many keys insert_to_index_random generates per batch.
CHUNKS_SIZE = 10000
# Number of random characters drawn for every generated key (a prefix may be prepended on top).
KEY_LENGTH = 8
# Characters the random keys are drawn from: lower- and upper-case ASCII letters.
ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

env: PURE_PYTHON=True


In [None]:
# Module-level debug log: add_to_debug_global appends one record per random_sampling() call.
# Re-running this cell discards the records collected so far.
_debug_random_sampling = []

In [251]:
import os
os.environ["PURE_PYTHON"] = "True"

from BTrees.OOBTree import OOBTree as _OOBTree
from collections import Counter
import numpy as np





class OOBTreeExtLean(_OOBTree):
    """OOBTree subclass that samples entries via random root-to-bucket walks.

    The implementation reads tree internals (``_data``, ``_keys``,
    ``_bucket_type`` and ``size``) directly.
    """

    def __init__(self):
        super().__init__()
        # Cache: str(path walked so far) -> probability vector over the
        # children of the node reached by that path.
        self.walking_path_to_fanout_distribution = {}

    def random_sampling(self, k):
        """Sample up to ``k`` distinct entries using independent random walks.

        ``k`` is capped at ``len(self)``. A walk whose path was already
        accepted is counted under ``'revisited_paths'`` and retried. Before
        returning, a debug record is handed to ``add_to_debug_global``.

        Returns:
            list: the sampled entries (elements taken from a bucket's ``_keys``).
        """
        # The fan-out cache is rebuilt from scratch on every call.
        self.walking_path_to_fanout_distribution = {}

        accepted_samples = []
        revisit_counts = Counter()
        all_accept_reject_measures = {
            'accept': accepted_samples,
            'reject': [],
            'revisited_paths': revisit_counts,
        }

        k = min(len(self), k)
        seen_path_keys = set()
        all_walking_paths_stats = []
        sampled_values = []

        while len(sampled_values) < k:
            walk_result = self._get_value_and_path_by_random_walk_from_node(node=self)
            sampled_value, walking_path, walking_path_stats = walk_result

            if _this_value_was_sampled_already(walking_path, seen_path_keys):
                # Same path drawn again: record the collision and try another walk.
                revisit_counts[str(walking_path)] += 1
            else:
                accepted_samples.append({
                    'path': walking_path,
                    'value': sampled_value,
                })
                seen_path_keys.add(str(walking_path))
                all_walking_paths_stats.append(walking_path_stats)
                sampled_values.append(sampled_value)

        # Pass exactly the entries add_to_debug_global reads.
        add_to_debug_global({
            'k': k,
            'all_accept_reject_measures': all_accept_reject_measures,
            'all_walking_paths_stats': all_walking_paths_stats,
        })

        return sampled_values

    def _get_value_and_path_by_random_walk_from_node(self, node):
        """Walk randomly from ``node`` down to a bucket and pick one entry there.

        Every step is recorded in the path as the tuple
        ``(chosen index, node.size, step probability, cumulative probability)``.

        Returns:
            tuple: ``(entry, walking_path, walking_path_stats)`` where the
            last stats dict additionally carries ``'entire_walking_path'``.
        """
        walking_path = []
        walking_path_stats = []
        prob_along_path = 1
        current_node = node

        # Descend through internal nodes until a bucket is reached.
        while not isinstance(current_node, self._bucket_type):
            step_index, step_prob = self._random_next_move_respect_fanout_prob(
                current_node, walking_path)
            prob_along_path *= step_prob
            # The size recorded is that of the node we are leaving.
            walking_path.append((step_index, current_node.size, step_prob, prob_along_path))
            current_node = current_node._data[step_index].child
            walking_path_stats.append({
                'next_random_step': step_index,
                'chosen_random_step_prob': step_prob,
                'prob_along_path': prob_along_path,
            })

        # Inside the bucket every slot is equally likely.
        bucket = current_node
        slot_index = np.random.randint(low=0, high=bucket.size)
        slot_prob = 1/bucket.size
        prob_along_path *= slot_prob
        walking_path.append((slot_index, bucket.size, slot_prob, prob_along_path))
        walking_path_stats.append({
            'next_random_step': slot_index,
            'chosen_random_step_prob': slot_prob,
            'prob_along_path': prob_along_path,
            'entire_walking_path': walking_path,
        })

        return bucket._keys[slot_index], walking_path, walking_path_stats

    def _random_next_move_respect_fanout_prob(self, current_node, walking_path):
        """Pick a child of ``current_node`` with probability proportional to its ``size``.

        The probability vector is cached per ``str(walking_path)`` so it is
        computed only once for each node visited during a sampling call.

        Returns:
            tuple: ``(chosen child index, probability of that choice)``.
        """
        distribution_cache = self.walking_path_to_fanout_distribution
        cache_key = str(walking_path)

        node_distribution = distribution_cache.get(cache_key)
        if node_distribution is None:
            child_sizes = np.array([item.child.size for item in current_node._data])
            node_distribution = child_sizes / sum(child_sizes)
            distribution_cache[cache_key] = node_distribution

        next_random_step = np.random.choice(current_node.size, p=node_distribution)
        return next_random_step, node_distribution[next_random_step]

    def join(self, right_tree):
        """Placeholder: not implemented, does nothing and returns ``None``."""
        pass


def add_to_debug_global(all_vars):
    """Append a debug record built from ``all_vars`` to ``_debug_random_sampling``.

    Args:
        all_vars: mapping that must contain the keys ``'k'``,
            ``'all_accept_reject_measures'`` and ``'all_walking_paths_stats'``.
    """
    debug_log = _debug_random_sampling
    record = {
        'params': {'k': all_vars['k']},
        'all_accept_reject_measures': all_vars['all_accept_reject_measures'],
        'all_walking_paths_stats': all_vars['all_walking_paths_stats'],
    }
    debug_log.append(record)


def _this_value_was_sampled_already(walking_path, all_walking_paths_set):
    return str(walking_path) in all_walking_paths_set


In [3]:
def generate_btree_index_x_values_with_dist(num_of_values, disired_prefix_to_percent_dist, my_index=None):
    """Fill an index with random keys following a prefix distribution.

    Args:
        num_of_values: total number of keys the shares refer to.
        disired_prefix_to_percent_dist: mapping ``prefix -> share`` where each
            share is a fraction of ``num_of_values`` (e.g. ``0.25``).
        my_index: index to fill; a new ``OOBTreeExtLean`` is created when omitted.

    Returns:
        The index returned by the last ``insert_to_index_random`` call, or
        ``my_index`` itself when nothing had to be inserted.
    """
    if my_index is None:
        # The previous fallback, OOBTreeExt, is not defined in this notebook,
        # so omitting my_index raised NameError on a fresh kernel.
        my_index = OOBTreeExtLean()

    for prefix, amount_percent in disired_prefix_to_percent_dist.items():
        # round() rather than int(): float products such as 100 * 0.29
        # (28.999999999999996) would otherwise be truncated one short.
        amount = round(num_of_values * amount_percent)
        if amount <= 0:
            # Nothing to generate for this prefix.
            continue
        my_index = insert_to_index_random(my_index, amount, prefix)

    return my_index


def insert_to_index_random(my_index, amount, prefix=''):
    """Insert ``amount`` randomly generated keys into ``my_index``.

    Keys are ``prefix`` followed by ``KEY_LENGTH`` characters drawn from
    ``ALPHABET``; the stored value is the random part without the prefix.
    Keys are generated in batches of at most ``CHUNKS_SIZE``.

    Args:
        my_index: mapping-like index exposing ``update``; modified in place.
        amount: number of keys to generate; values <= 0 insert nothing.
        prefix: string prepended to every generated key.

    Returns:
        ``my_index`` (the same object that was passed in).
    """
    amount_in_iteration = min(CHUNKS_SIZE, amount)
    print('generating %s values, chunk of %s, with prefix=\'%s\'' %(amount, amount_in_iteration, prefix))

    if amount_in_iteration <= 0:
        # A zero step would make range() raise ValueError; nothing to insert anyway.
        return my_index

    # Loop-invariant: build the alphabet array once instead of once per batch.
    np_alphabet = np.array(list(ALPHABET))

    proceed = 0
    for chunk_start in range(0, amount, amount_in_iteration):
        # The last batch may be shorter so that exactly `amount` keys are generated.
        chunk_len = min(amount_in_iteration, amount - chunk_start)
        np_codes = np.random.choice(np_alphabet, [chunk_len, KEY_LENGTH])
        my_index.update({
            prefix + code: code
            for code in map(''.join, np_codes)
        })

        proceed += chunk_len
        if (proceed % 150000) == 0:
            print('done generating %s values' % (proceed))
    return my_index


In [343]:
# Target share of keys per prefix; the '' entry yields keys without any prefix.
prefix_to_percent = {
    'gggg': 0.25,
    'hhhh': 0.15,
    'mmmm': 0.10,
    'rrrr': 0.03,
    '': 0.47
}
num_of_values = 1_000_000
# Build the 1M-key index into a fresh OOBTreeExtLean.
my_index = generate_btree_index_x_values_with_dist(num_of_values, prefix_to_percent, OOBTreeExtLean())


generating 250000 values, chunk of 10000, with prefix='gggg'
done generating 150000 values
generating 150000 values, chunk of 10000, with prefix='hhhh'
done generating 150000 values
generating 100000 values, chunk of 10000, with prefix='mmmm'
generating 30000 values, chunk of 10000, with prefix='rrrr'
generating 470000 values, chunk of 10000, with prefix=''
done generating 150000 values
done generating 300000 values
done generating 450000 values


In [344]:
# Sanity check: number of keys stored in the index.
len(my_index)

# Disabled: snapshot of the items, presumably for rebuilding the index after editing the class.
#my_index_items = my_index.items()

1000000

Reminder: since n = 1M < 4M, the tree height is h = 3.
My theory: with h = 3, every bucket is reached with the same probability; what differs is the probability of choosing a key inside the bucket.

In [345]:
# Re-create the index after changing the class (disabled; requires my_index_items).
#my_index = OOBTreeExtLean()
#my_index.update(my_index_items)
print(len(my_index))

1000000


In [346]:
# Draw 10,000 samples from the 1M-key index and display how many were returned.
len(my_index.random_sampling(k=10_000))

10000

In [347]:
# Per-step stats of the first accepted walk from the most recent random_sampling() call.
_debug_random_sampling[-1]['all_walking_paths_stats'][0]


[{'next_random_step': 240,
  'chosen_random_step_prob': 0.003906660225578122,
  'prob_along_path': 0.003906660225578122},
 {'next_random_step': 140,
  'chosen_random_step_prob': 0.004890604890604891,
  'prob_along_path': 1.910593160514397e-05},
 {'next_random_step': 6,
  'chosen_random_step_prob': 0.05263157894736842,
  'prob_along_path': 1.0055753476391562e-06,
  'entire_walking_path': [(240,
    281,
    0.003906660225578122,
    0.003906660225578122),
   (140, 186, 0.004890604890604891, 1.910593160514397e-05),
   (6, 19, 0.05263157894736842, 1.0055753476391562e-06)]}]

In [372]:
# Group the accepted walks of the most recent sampling run by the child index chosen
# at the root (0..4). Each walk is kept as its complete path of
# (step, node size, step probability, cumulative probability) tuples.
(
    all_walking_path_stats_0,
    all_walking_path_stats_1,
    all_walking_path_stats_2,
    all_walking_path_stats_3,
    all_walking_path_stats_4,
) = (
    [
        walk_stats[-1]['entire_walking_path']
        for walk_stats in _debug_random_sampling[-1]['all_walking_paths_stats']
        if walk_stats[0]['next_random_step'] == root_step
    ]
    for root_step in range(5)
)

In [373]:
# For each group of walks (grouped by the first step taken at the root), count how often
# every whole-walk probability occurs: the product of the three per-step probabilities.
for walks_from_root_child in (
    all_walking_path_stats_0,
    all_walking_path_stats_1,
    all_walking_path_stats_2,
    all_walking_path_stats_3,
    all_walking_path_stats_4,
):
    print(Counter([walk[0][2] * walk[1][2] * walk[2][2] for walk in walks_from_root_child]))

Counter({1.0245633951162135e-06: 269, 1.0245633951162138e-06: 32})
Counter({9.73625264538072e-07: 276, 9.736252645380722e-07: 7})
Counter({9.661176382416017e-07: 182, 9.661176382416015e-07: 69})
Counter({1.0005272552952567e-06: 179, 1.0005272552952565e-06: 103})
Counter({1.003637822884553e-06: 300, 1.0036378228845531e-06: 211})


In [391]:
# Number of keys reported by keys() on the root's first and second child.
# NOTE(review): the recorded output (1000000, then 997335) suggests keys() on a child
# keeps iterating past that child's own subtree — confirm against the BTrees implementation.
print(len(my_index._data[0].child.keys()))
print(len(my_index._data[1].child.keys()))

1000000
997335


In [397]:
# Show two of the whole-walk probabilities observed above in fixed-point notation.
print("%.12f" % 1.0245633951162135e-06)
print("%.12f" % 9.73625264538072e-07)

0.000001024563
0.000000973625


In [252]:
# Same prefix shares as used for the 1M-key index earlier in the notebook.
prefix_to_percent = {
    'gggg': 0.25,
    'hhhh': 0.15,
    'mmmm': 0.10,
    'rrrr': 0.03,
    '': 0.47
}
# UTC timestamps are printed before and after the build to show how long it takes.
print(datetime.utcnow())
num_of_values = 4_000_000
my_index_4m = generate_btree_index_x_values_with_dist(num_of_values, prefix_to_percent, OOBTreeExtLean())
print(datetime.utcnow())

2020-12-19 16:22:12.517234
generating 1000000 values, chunk of 10000, with prefix='gggg'
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
done generating 750000 values
done generating 900000 values
generating 600000 values, chunk of 10000, with prefix='hhhh'
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
generating 400000 values, chunk of 10000, with prefix='mmmm'
done generating 150000 values
done generating 300000 values
generating 120000 values, chunk of 10000, with prefix='rrrr'
generating 1880000 values, chunk of 10000, with prefix=''
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
done generating 750000 values
done generating 900000 values
done generating 1050000 values
done generating 1200000 values
done generating 1350000 values
done generating 1500000 values
done g

In [278]:
# Time a 100,000-sample draw from the 4M-key index via UTC timestamps before and after.
print(datetime.utcnow())
# The length is computed but not displayed (not the cell's last expression).
len(my_index_4m.random_sampling(k=100_000))
print(datetime.utcnow()) # original note: "to 11 seconds"; NOTE(review): the timestamps recorded below are ~18.6 s apart

2020-12-19 16:52:04.870099
2020-12-19 16:52:23.508926


In [280]:
# Three most common (first step, second step) path prefixes among the accepted walks
# of the most recent sampling run.
Counter([(x[-1]['entire_walking_path'][0],x[-1]['entire_walking_path'][1]) for x in _debug_random_sampling[-1]['all_walking_paths_stats']]).most_common(3)

[(((0, 7, 0.19593998234774934, 0.19593998234774934),
   (46, 222, 0.007975877832895632, 0.0015627933617853754)),
  164),
 (((0, 7, 0.19593998234774934, 0.19593998234774934),
   (49, 222, 0.007748921959601854, 0.0015183236319784746)),
  159),
 (((5, 7, 0.14827890556045895, 0.14827890556045895),
   (24, 168, 0.009812421185372004, 0.001454975074265222)),
  155)]

In [338]:
# Complete paths of the accepted walks (most recent sampling run) that chose child 0 at the
# root and then child 46, 47 or 49 at the second level, one list per second-level choice.
all_walking_path_stats, all_walking_path_stats_0_47, all_walking_path_stats2 = (
    [
        walk_stats[-1]['entire_walking_path']
        for walk_stats in _debug_random_sampling[-1]['all_walking_paths_stats']
        if walk_stats[0]['next_random_step'] == 0
        and walk_stats[1]['next_random_step'] == second_step
    ]
    for second_step in (46, 47, 49)
)
# common walk prefix: [((1, 132), 23), ((4, 52), 22), ((5, 101), 21)]

In [296]:
# Count the products of the third- and fourth-step probabilities for walks starting (0, 49).
Counter([x[2][2]* x[3][2] for x in all_walking_path_stats2])

Counter({0.0002000800320128051: 72, 0.00020008003201280514: 87})

In [298]:
# Display every complete path of the walks starting (0, 49); the output is long.
all_walking_path_stats2

[[(0, 7, 0.19593998234774934, 0.19593998234774934),
  (49, 222, 0.007748921959601854, 0.0015183236319784746),
  (211, 239, 0.003401360544217687, 5.16436609516488e-06),
  (15, 17, 0.058823529411764705, 3.0378624089205176e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (49, 222, 0.007748921959601854, 0.0015183236319784746),
  (181, 239, 0.004001600640256103, 6.075724817841035e-06),
  (13, 20, 0.05, 3.0378624089205176e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (49, 222, 0.007748921959601854, 0.0015183236319784746),
  (145, 239, 0.005202080832332933, 7.898442263193346e-06),
  (21, 26, 0.038461538461538464, 3.0378624089205176e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (49, 222, 0.007748921959601854, 0.0015183236319784746),
  (210, 239, 0.005802320928371348, 8.8098009858695e-06),
  (28, 29, 0.034482758620689655, 3.037862408920517e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (49, 222, 0.007748921959601854, 0.0015183236319784746)

In [303]:
# Follow the first child three levels down from the root of the 4M-key index and display it.
my_index_4m._data[0].child._data[0].child._data[0].child

BTrees.OOBTree.OOBucket([('AAAGRcNB', 'AAAGRcNB'), ('AAAHALFQ', 'AAAHALFQ'), ('AAAIJoUD', 'AAAIJoUD'), ('AAAJGLzD', 'AAAJGLzD'), ('AAAQHwez', 'AAAQHwez'), ('AAASPxaO', 'AAASPxaO'), ('AAAWqLJo', 'AAAWqLJo'), ('AAAcAKui', 'AAAcAKui'), ('AAAijPtx', 'AAAijPtx'), ('AAAjNQvj', 'AAAjNQvj'), ('AAAkpIpl', 'AAAkpIpl'), ('AAAlvneF', 'AAAlvneF'), ('AAAomLWz', 'AAAomLWz'), ('AABCIvgM', 'AABCIvgM'), ('AABEFQAy', 'AABEFQAy'), ('AABElLfs', 'AABElLfs')])

In [304]:
# Complete paths of all accepted walks (most recent sampling run) that chose child 0 at the root.
all_walking_path_stats_step0 = [x[-1]['entire_walking_path'] for x in _debug_random_sampling[-1]['all_walking_paths_stats']
                         if x[0]['next_random_step']==0]

In [335]:
# Whole-walk probability (product of the four per-step probabilities) for walks starting (0, 46).
Counter([x[0][2]* x[1][2] * x[2][2]* x[3][2] for x in all_walking_path_stats])

Counter({2.985848990801252e-07: 111, 2.9858489908012524e-07: 53})

In [408]:
# Whole-walk probability (product of the four per-step probabilities) for walks starting (0, 47).
Counter([x[0][2]* x[1][2] * x[2][2]* x[3][2] for x in all_walking_path_stats_0_47])

Counter({2.992762483180852e-07: 71, 2.9927624831808513e-07: 10})

In [407]:
# Whole-walk probability (product of the four per-step probabilities) for every walk
# that chose child 0 at the root.
Counter([x[0][2]* x[1][2]* x[2][2]* x[3][2] for x in all_walking_path_stats_step0])

Counter({3.033465398812008e-07: 31,
         2.9903434568298594e-07: 74,
         3.0724196826621603e-07: 63,
         2.923076017105671e-07: 30,
         3.0162443583207426e-07: 60,
         2.9278573527658347e-07: 10,
         3.080154445499631e-07: 71,
         3.073587809238342e-07: 37,
         3.0544303957284433e-07: 60,
         2.999823893777165e-07: 27,
         3.124336988775241e-07: 74,
         3.120047130214396e-07: 61,
         2.998444066533703e-07: 73,
         2.9341696819038516e-07: 49,
         2.9563271403612676e-07: 55,
         3.090786142303625e-07: 48,
         2.9978159795243023e-07: 105,
         3.1366638735241e-07: 15,
         3.0501154299021104e-07: 75,
         3.091059131717359e-07: 47,
         3.040891290497165e-07: 47,
         2.992762483180852e-07: 71,
         3.0130113487763176e-07: 76,
         2.947529166211862e-07: 75,
         3.0172788139267297e-07: 31,
         3.014518465443949e-07: 15,
         3.039321016291561e-07: 53,
         2.9148696

In [333]:
# Walks through root child 0 whose probability after the first two steps equals this value.
# NOTE(review): relies on exact float equality with a value copied from an earlier output.
[x for x in all_walking_path_stats_step0 if x[0][2]* x[1][2]==0.0008830417775941756]


[[(0, 7, 0.19593998234774934, 0.19593998234774934),
  (93, 222, 0.004506695198262166, 0.0008830417775941756),
  (34, 139, 0.005839917554105119, 5.156891177980414e-06),
  (3, 17, 0.058823529411764705, 3.033465398812008e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (156, 222, 0.004506695198262166, 0.0008830417775941756),
  (69, 139, 0.008289124668435014, 7.319643381914586e-06),
  (16, 25, 0.04, 2.9278573527658347e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (39, 222, 0.004506695198262166, 0.0008830417775941756),
  (44, 139, 0.010093978419770276, 8.913404646791192e-06),
  (27, 29, 0.034482758620689655, 3.073587809238342e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (220, 222, 0.004506695198262166, 0.0008830417775941756),
  (121, 139, 0.008828522920203734, 7.795954572987628e-06),
  (20, 26, 0.038461538461538464, 2.998444066533703e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (106, 222, 0.004506695198262166, 0.0008830417775941756)

In [331]:
# Same filter for another two-step probability, restricted to walks whose second step is child 143.
# NOTE(review): relies on exact float equality with a value copied from an earlier output.
[x for x in all_walking_path_stats_step0 if x[0][2]* x[1][2]==0.0008576305034188037 and x[1][0]==143]

[[(0, 7, 0.19593998234774934, 0.19593998234774934),
  (143, 222, 0.004377006127808579, 0.0008576305034188037),
  (5, 135, 0.006624825662482566, 5.681652567976733e-06),
  (8, 19, 0.05263157894736842, 2.9903434568298594e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (143, 222, 0.004377006127808579, 0.0008576305034188037),
  (108, 135, 0.006276150627615063, 5.382618222293747e-06),
  (9, 18, 0.05555555555555555, 2.9903434568298594e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (143, 222, 0.004377006127808579, 0.0008576305034188037),
  (78, 135, 0.005578800557880056, 4.784549530927775e-06),
  (9, 16, 0.0625, 2.9903434568298594e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (143, 222, 0.004377006127808579, 0.0008576305034188037),
  (113, 135, 0.005927475592747559, 5.083583876610761e-06),
  (2, 17, 0.058823529411764705, 2.9903434568298594e-07)],
 [(0, 7, 0.19593998234774934, 0.19593998234774934),
  (143, 222, 0.004377006127808579, 0.000857630503418803

In [370]:
# Draw 100,000 samples from my_index; the variant for the 4M-key index is left disabled.
#sampled_data = my_index_4m.random_sampling(k=100_000)
sampled_data2 = my_index.random_sampling(k=100_000)

In [366]:
def (values):
    return {value: occurences/len(values) for value, occurences in Counter([key[:4] for key in values]).most_common(10)}


In [371]:
# Share of each 4-character key prefix among the sampled keys (ten most common prefixes).
_calculate_prefix_ditribution(sampled_data2)

{'gggg': 0.25037,
 'hhhh': 0.15136,
 'mmmm': 0.10119,
 'rrrr': 0.03006,
 'qpgT': 2e-05,
 'icLr': 2e-05,
 'rqJL': 2e-05,
 'IGDb': 2e-05,
 'XqZy': 2e-05,
 'oQfS': 2e-05}

In [368]:
# Target prefix shares, displayed for comparison with the sampled distribution.
prefix_to_percent

{'gggg': 0.25, 'hhhh': 0.15, 'mmmm': 0.1, 'rrrr': 0.03, '': 0.47}