In [2]:
%env PURE_PYTHON True
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime
from BTrees.OOBTree import OOBTree
import numpy as np
from collections import Counter
from scipy import stats
import pandas as pd
import pprint
import timeit

# Upper bound on how many keys insert_to_index_random generates per batch.
CHUNKS_SIZE = 10000
# Number of random characters drawn for each generated key (prefix excluded).
KEY_LENGTH = 8
# Characters the random part of each key is drawn from.
ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

env: PURE_PYTHON=True


In [3]:
# Module-level log: add_to_debug_global appends one record per random_sampling call.
_debug_random_sampling = []

In [5]:
import os

# Child index OOBTreeExtLean follows during its structure walk when no step is forced.
DEFAULT_EXPLORING_STEP = 0
# NOTE(review): presumably selects the pure-Python BTrees implementation so that
# node internals (_data, _keys) are reachable; BTrees is already imported in an
# earlier cell, so confirm this assignment still has an effect here.
os.environ["PURE_PYTHON"] = "True"

from BTrees.OOBTree import OOBTree as _OOBTree
from collections import Counter
import numpy as np


class OOBTreeExtLean(_OOBTree):
    """OOBTree subclass that samples keys by weighted random walks from the root.

    The walks read node internals (``_data``, ``_keys``, ``size``,
    ``_bucket_type``, ``max_leaf_size``). NOTE(review): presumably these are
    only exposed by the pure-Python BTrees implementation -- confirm.
    """

    def __init__(self):
        super().__init__()
        # Cache keyed by str(walking_path): the child-selection distribution of
        # the node reached by that path. Reset at the start of every sampling run.
        self.walking_path_to_fanout_distribution = {}
        # Child index followed by the structure walk when no step is forced.
        self.default_exploring_step = DEFAULT_EXPLORING_STEP

    def random_sampling(self, k):
        """Return up to ``k`` keys, each reached by a distinct random walk.

        A walk whose full path was already produced is discarded and counted in
        ``revisited_paths``, so no key position is returned twice. ``k`` is
        capped at ``len(self)``; an empty tree or ``k <= 0`` yields ``[]``.
        A summary of the run is appended to the module-level debug log via
        ``add_to_debug_global``.
        """
        # Distributions cached by an earlier run may not match the current tree.
        self.walking_path_to_fanout_distribution = {}
        all_accept_reject_measures = {
            'accept': [],
            'reject': [],
            'revisited_paths': Counter()
        }

        k = min(len(self), k)
        sampled_values = []
        all_walking_paths_set = set()
        all_walking_paths_stats = []

        # The structure walk cannot run on an empty tree (there is no root child
        # to descend into), so only do it when something will be sampled.
        if k > 0:
            self._first_walk_to_determine_structure()

        while len(sampled_values) < k:
            sampled_value, walking_path, walking_path_stats = \
                self._get_value_and_path_by_random_walk_from_node(node=self)

            if _this_value_was_sampled_already(walking_path, all_walking_paths_set):
                all_accept_reject_measures['revisited_paths'][str(walking_path)] += 1
                continue

            all_accept_reject_measures['accept'].append({
                'path': walking_path,
                'value': sampled_value,
            })

            all_walking_paths_set.add(str(walking_path))
            all_walking_paths_stats.append(walking_path_stats)
            sampled_values.append(sampled_value)

        # Pass exactly the entries the debug logger reads instead of locals().
        add_to_debug_global({
            'k': k,
            'self': self,
            'all_accept_reject_measures': all_accept_reject_measures,
            'all_walking_paths_stats': all_walking_paths_stats,
        })

        return sampled_values

    def _walk_to_determine_prob(self, node, forcing_step):
        """Choose the next step of the structure walk at ``node``.

        ``forcing_step`` (when not ``None``) is the child index to take;
        otherwise ``self.default_exploring_step`` is used.

        Returns ``(next_step, step_probability, fraction)`` where ``fraction``
        is a human-readable ``"numerator/denominator"`` string. When the
        children of ``node`` are buckets the probability is ``1 / fanout``;
        otherwise it is the chosen child's ``size`` over the sum of all
        children's ``size``.
        """
        children = node._data
        # Honour the forced step in both branches so the recorded step matches
        # the child that is actually visited.
        next_step = forcing_step if forcing_step is not None else self.default_exploring_step

        if isinstance(children[0].child, self._bucket_type):
            fanout = len(children)
            chosen_random_step_prob = 1 / fanout
            fraction = f"{1}/{fanout}"
        else:
            all_sizes = np.array([item.child.size for item in children])
            total_size = all_sizes.sum()
            node_distribution = all_sizes / total_size
            fraction = f"{all_sizes[next_step]}/{total_size}"
            chosen_random_step_prob = node_distribution[next_step]

        return next_step, chosen_random_step_prob, fraction

    def _first_walk_to_determine_structure(self):
        """Compute ``self.root_probs_coefs``, one coefficient per root child.

        For every root child ``i`` a deterministic walk is made: step ``i`` at
        the root, then ``self.default_exploring_step`` until a bucket is
        reached, multiplying the step probabilities into ``b[i]``. The
        coefficients ``x`` solve the linear system

            b[0] * x[0] - b[i] * x[i] = 0      for i = 1 .. n-1
            sum(d[i] * x[i])          = 1

        where ``d`` is the size-proportional distribution over root children.
        """
        root = self
        n_branches = len(root._data)
        if n_branches == 0:
            # Empty tree: nothing to walk and no coefficients to solve for.
            self.root_probs_coefs = np.ones(0)
            return

        tree_probs = {}
        for i in range(n_branches):
            current_node = root
            prob_to_bucket = 1
            walking_path_stats = []
            while not isinstance(current_node, self._bucket_type):
                # Only the step taken at the root itself is forced.
                forcing_step = i if current_node is root else None
                next_step, step_prob, fraction = self._walk_to_determine_prob(
                    current_node, forcing_step=forcing_step)
                prob_to_bucket *= step_prob

                current_node = current_node._data[next_step].child

                walking_path_stats.append({
                    'next_random_step': next_step,
                    'chosen_random_step_prob': step_prob,
                    'prob_along_path': prob_to_bucket,
                    'fraction': fraction
                })
            tree_probs[i] = walking_path_stats

        all_sizes = np.array([item.child.size for item in root._data])
        node_distribution = all_sizes / all_sizes.sum()

        branch_coefs = np.array([stats[-1]['prob_along_path'] for stats in tree_probs.values()])

        # Rows 0 .. n-2 equalise branch i+1 against branch 0; the last row is
        # the normalisation constraint.
        system = np.zeros((n_branches, n_branches))
        for i in range(1, n_branches):
            system[i - 1][0] = branch_coefs[0]
            system[i - 1][i] = -1 * branch_coefs[i]
        system[-1] = node_distribution

        rhs = np.zeros(n_branches)
        rhs[-1] = 1

        self.root_probs_coefs = np.linalg.solve(system, rhs)

    def _get_value_and_path_by_random_walk_from_node(self, node):
        """Walk randomly from ``node`` down to a bucket and pick one of its keys.

        Returns ``(key, walking_path, walking_path_stats)``. ``walking_path`` is
        a list of ``(step, node.size, step_probability, probability_so_far)``
        tuples, one per visited node including the bucket; ``walking_path_stats``
        holds the same information as dicts.
        """
        walking_path = []
        walking_path_stats = []
        current_node = node
        prob_along_path = 1
        while not isinstance(current_node, self._bucket_type):
            # The first step of the walk uses the coefficient-scaled distribution.
            if not walking_path_stats:
                pick_step = self._random_next_move_respect_fanout_prob_from_root
            else:
                pick_step = self._random_next_move_respect_fanout_prob
            next_random_step, chosen_random_step_prob = pick_step(current_node, walking_path)

            prob_along_path *= chosen_random_step_prob
            walking_path.append((next_random_step, current_node.size, chosen_random_step_prob, prob_along_path))
            current_node = current_node._data[next_random_step].child
            walking_path_stats.append({
                'next_random_step': next_random_step,
                'chosen_random_step_prob': chosen_random_step_prob,
                'prob_along_path': prob_along_path})

        # Inside the bucket the key index is drawn uniformly over its actual size.
        next_random_step = np.random.randint(low=0, high=current_node.size)
        # The recorded probability uses max_leaf_size, not the actual size
        # (original author's note: "todo: size"); it only feeds the stats.
        chosen_random_step_prob = 1 / current_node.max_leaf_size
        prob_along_path *= chosen_random_step_prob
        walking_path.append((next_random_step, current_node.size, chosen_random_step_prob, prob_along_path))
        walking_path_stats.append({
            'next_random_step': next_random_step,
            'chosen_random_step_prob': chosen_random_step_prob,
            'prob_along_path': prob_along_path,
            'entire_walking_path': walking_path})

        leaf = current_node._keys
        return leaf[next_random_step], walking_path, walking_path_stats

    def _pick_child(self, current_node, walking_path, scale_by_root_coefs):
        """Draw a child index of ``current_node``; shared by both move methods.

        The distribution is proportional to each child's ``size`` and, when
        ``scale_by_root_coefs`` is true, multiplied element-wise by
        ``self.root_probs_coefs``. It is cached under ``str(walking_path)``.
        Returns ``(child_index, probability_of_that_index)``.
        """
        walking_path_str = str(walking_path)
        cache = self.walking_path_to_fanout_distribution
        if walking_path_str in cache:
            node_distribution = cache[walking_path_str]
        else:
            all_sizes = np.array([item.child.size for item in current_node._data])
            node_distribution = all_sizes / all_sizes.sum()
            if scale_by_root_coefs:
                node_distribution = node_distribution * self.root_probs_coefs
            cache[walking_path_str] = node_distribution

        next_random_step = np.random.choice(current_node.size, p=node_distribution)
        return next_random_step, node_distribution[next_random_step]

    def _random_next_move_respect_fanout_prob(self, current_node, walking_path):
        """Draw the next step below the root, proportional to the children's ``size``."""
        return self._pick_child(current_node, walking_path, scale_by_root_coefs=False)

    def _random_next_move_respect_fanout_prob_from_root(self, current_node, walking_path):
        """Draw the first step, with the size distribution scaled by ``root_probs_coefs``."""
        return self._pick_child(current_node, walking_path, scale_by_root_coefs=True)

    def join(self, right_tree):
        """Placeholder; not implemented."""
        pass


def add_to_debug_global(all_vars):
    """Append a summary of one sampling run to ``_debug_random_sampling``.

    ``all_vars`` is a mapping that must contain ``'k'``, ``'self'`` (anything
    supporting ``len``), ``'all_accept_reject_measures'`` and
    ``'all_walking_paths_stats'``.
    """
    record_run = _debug_random_sampling.append
    run_summary = {
        'params': {'k': all_vars['k']},
        'tree_size': len(all_vars['self']),
        'all_accept_reject_measures': all_vars['all_accept_reject_measures'],
        'all_walking_paths_stats': all_vars['all_walking_paths_stats'],
    }
    record_run(run_summary)


def _this_value_was_sampled_already(walking_path, all_walking_paths_set):
    return str(walking_path) in all_walking_paths_set


In [6]:
def generate_btree_index_x_values_with_dist(num_of_values, disired_prefix_to_percent_dist, my_index=None):
    """Fill an index with random keys split across prefixes by the given shares.

    Args:
        num_of_values: total number of keys the shares are applied to.
        disired_prefix_to_percent_dist: mapping ``prefix -> fraction`` of
            ``num_of_values`` to generate with that prefix.
        my_index: index to fill; a new ``OOBTreeExtLean`` is created when omitted.

    Returns:
        The index returned by the last ``insert_to_index_random`` call (or
        ``my_index`` itself when the mapping is empty).
    """
    if my_index is None:
        # The previous default, OOBTreeExt, is not defined anywhere in this notebook.
        my_index = OOBTreeExtLean()

    for prefix, amount_percent in disired_prefix_to_percent_dist.items():
        # round() before int(): a float product such as 100 * 0.29 is
        # 28.999999999999996 and would otherwise be truncated to 28.
        amount = int(round(num_of_values * amount_percent))
        my_index = insert_to_index_random(my_index, amount, prefix)

    return my_index


def insert_to_index_random(my_index, amount, prefix=''):
    """Insert ``amount`` randomly generated entries into ``my_index``.

    Each entry maps ``prefix + code`` to ``code``, where ``code`` is
    ``KEY_LENGTH`` characters drawn from ``ALPHABET``. Entries are generated in
    batches of at most ``CHUNKS_SIZE``; the last batch is shortened so that
    exactly ``amount`` entries are generated. Randomly colliding keys overwrite
    each other, so the index may grow by fewer than ``amount`` entries.

    Args:
        my_index: any object with a dict-like ``update`` method.
        amount: number of entries to generate; ``<= 0`` inserts nothing.
        prefix: string prepended to every generated key.

    Returns:
        ``my_index``, updated in place.
    """
    amount_in_iteration = min(CHUNKS_SIZE, amount)
    print('generating %s values, chunk of %s, with prefix=\'%s\'' %(amount, amount_in_iteration, prefix))

    # Nothing to generate; also avoids a zero/negative batch size below.
    if amount_in_iteration <= 0:
        return my_index

    # Loop-invariant: build the alphabet array once.
    np_alphabet = np.array(list(ALPHABET))

    proceed = 0
    while proceed < amount:
        # Shrink the final batch so the total never exceeds `amount`.
        batch_size = min(amount_in_iteration, amount - proceed)
        np_codes = np.random.choice(np_alphabet, [batch_size, KEY_LENGTH])
        codes = [''.join(row) for row in np_codes]
        my_index.update({prefix + code: code for code in codes})

        proceed += batch_size
        if (proceed % 150000) == 0:
            print('done generating %s values' % (proceed))
    return my_index


In [7]:
# Target share of generated keys per key prefix; '' generates keys with no prefix.
prefix_to_percent = {
    'gggg': 0.25,
    'hhhh': 0.15,
    'mmmm': 0.10,
    'rrrr': 0.03,
    '': 0.47
}
# Timestamps printed before and after the build give its wall-clock duration.
print(datetime.utcnow())
num_of_values = 4_000_000
my_index_4m = generate_btree_index_x_values_with_dist(num_of_values, prefix_to_percent, OOBTreeExtLean())
print(datetime.utcnow())

2021-01-23 16:17:41.240747
generating 1000000 values, chunk of 10000, with prefix='gggg'
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
done generating 750000 values
done generating 900000 values
generating 600000 values, chunk of 10000, with prefix='hhhh'
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
generating 400000 values, chunk of 10000, with prefix='mmmm'
done generating 150000 values
done generating 300000 values
generating 120000 values, chunk of 10000, with prefix='rrrr'
generating 1880000 values, chunk of 10000, with prefix=''
done generating 150000 values
done generating 300000 values
done generating 450000 values
done generating 600000 values
done generating 750000 values
done generating 900000 values
done generating 1050000 values
done generating 1200000 values
done generating 1350000 values
done generating 1500000 values
done g

In [8]:
# Timestamps printed before and after sampling give its wall-clock duration.
print(datetime.utcnow())
sampled = my_index_4m.random_sampling(k=10_000)
print(datetime.utcnow())  # original note: "to 11 seconds"; the recorded run below took about 4 s

2021-01-23 16:21:42.133014
1
2021-01-23 16:21:45.931630


In [9]:
def _calculate_prefix_ditribution(values):
    return {value: occurences/len(values) for value, occurences in Counter([key[:4] for key in values]).most_common(10)}


In [10]:
# Compare the prefix shares in the sample with the target shares in prefix_to_percent.
_calculate_prefix_ditribution(sampled)

{'gggg': 0.2498,
 'hhhh': 0.151,
 'mmmm': 0.1015,
 'rrrr': 0.0309,
 'Xggt': 0.0002,
 'SPyo': 0.0001,
 'aLWR': 0.0001,
 'CsCr': 0.0001,
 'ZMRk': 0.0001,
 'evjT': 0.0001}

In [None]:
my_index_4m.