In [1]:
import numpy as np
import pandas as pd

from psql_functions import execQuery
from miss_data import add_missing_dates, add_missing_counts
from sample_range_query import load_range_queries_n_split

import re
import os
owd = os.getcwd()

In [2]:
# Go back to the directory the notebook was started from (`owd` is captured
# right after the imports), so later relative paths resolve from there.
os.chdir(owd)

In [3]:
"""
def load_range_queries_n_split(file, n_structures):
    all_queries = pd.read_csv(file, sep='\n',header=None).to_numpy().flatten()
    split_queries = np.array_split(all_queries, n_structures)
    return split_queries
"""

In [4]:
# Connection settings for the local PostgreSQL instance. The password can be
# supplied through the PGPASSWORD environment variable so the real secret does
# not have to be stored in the notebook; the literal is only the local default.
param_dic = {
    "host"      : "localhost",
    "database"  : "bachelorBesoeg2014",
    "user"      : "postgres",
    "password"  : os.environ.get("PGPASSWORD", "password"),
    "port"      : "5432"
}

# Read both columns with ONE ordered query. Two separate SELECTs without an
# ORDER BY give no guarantee that row i of the first result belongs to row i
# of the second, nor that the dates arrive in ascending order.
query = """select time_, count_ from _775147 order by time_;"""
result = execQuery(param_dic, query)
# assumes execQuery returns an iterable of row tuples (the original code
# indexed every row with [0]) - TODO confirm
rows = list(result)
dates = [row[0] for row in rows]
counts = [row[1] for row in rows]

# Fill calendar gaps so that every day of the stream has an entry.
all_dates = add_missing_dates(dates)
all_counts =  add_missing_counts(counts, dates, all_dates)

# Number of independent data structures built per experiment setting.
n_data_structures = 50


Executed query and closed connection.
Executed query and closed connection.


In [5]:
import numpy as np
import pandas as pd

from datetime import datetime

class HH_OLH_degree:
    def __init__(self, epsilon, degree, dates, counts):
        """Setup of the datastructere
        Parameters:
        T (int): The lenght of the stream
        epsilon (float): The height of the full binary tree. 
        dates (Array): The dates of the stream
        counts (Array): The count for each of the dates
        Returns:
        A epsilon differintial datastructe
        """
        self.epsilon = epsilon
        self.all_dates = dates
        self.all_counts = counts
        #Check if we are we have missing dates.
        if len(dates) < (dates[-1]-dates[0]).days:
            #print('here')
            self.all_dates = self.__add_missing_dates(dates)
            self.all_counts = self.__add_missing_counts(counts,dates)
            
        #Make dict for date indexing
        values = np.arange(0,len(self.all_dates))
        zip_iterator = zip(self.all_dates, values)
        self.idx_dict =  dict(zip_iterator)
     
        self.degree = degree
        self.h = int(np.ceil(np.log(len(self.all_dates)) / np.log(degree)))
        self.level_prob = np.full(self.h+1,1/(self.h+1))
    
        self.max_val = np.max(np.fromiter(self.idx_dict.values(), dtype = int))
        self.max_idxs = (self.get_index(self.max_val,self.h))
        self.max_idxs.reverse()
        
        self.tree_levels = self.__process(self.all_dates, self.all_counts)
        
        
        
    def __add_missing_dates(self, old_dates):
        """Add missing dates in a list
        Parameters:
        old_dates (list of datetime.date): List of dates that is not countious
        Returns:
        List of countious starting with the first value of 
        """
        start_date = old_dates[0]
        end_date = old_dates[-1]
        all_dates = pd.date_range(start = start_date, end = end_date).to_pydatetime().tolist()
        return [(date.date()) for date in all_dates]
    
    def __add_missing_counts(self, old_counts, old_dates):
        """Adds 0 to the list of counts where there was missing dates
        Parameters:
        old_counts (list of int): List counts for each day with 
        old_dates (list of datetime.date): List of dates that is not countious
        Returns:
        List of countious starting with the first value of 
        """
        zip_iterator = zip(old_dates, old_counts)
        missing_dict =  dict(zip_iterator)
        all_counts = np.zeros(len(self.dates))
        for i, date in enumerate(self.dates):
            val = missing_dict.get(date, 0)
            all_counts[i] = val
            
        return all_counts
    
    def __process(self, dates, counts):
        tree_levels = []
        for i in np.arange(0,self.h+1):
            level = np.zeros(int(self.degree**np.ceil(i)))
            tree_levels.append(level)
        
        for index, (date, day_count) in enumerate(zip(dates, counts)):
            idxs = self.get_index(index,self.h)
            idxs.reverse()
            #print(self.max_idxs)
            for person in range(int(day_count)):
                level = np.random.choice(np.arange(0, self.h+1), p = self.level_prob ) 
        
                if level != 0:
                    """
                    print(f' We are at level {level}')
                    print(f' max index at level is {self.max_idxs[level]}')
                    print(f'Now we hash down to {(self.degree**level)}')
                    """
                    hash_value = self.max_idxs[level]
                    response = self.OLH_func(idxs[level], hash_value)
                    
                else:
                    response = 0
                tree_levels[level][response] = tree_levels[level][response] + 1

        return tree_levels

    def get_index(self, date_idx, n_layers):
        """Calculates the path of index in full binary string

        Parameters:
        date_idx (int): The node in the bouttom layer we want to calculate a path to. 
        The bottom layer has index from 0 to 2**h-1
        n_layers (int): The height of the full binary tree. 

        Returns:
        list: of index in the path from the starting from the bottom and going up

        """
        idx = []
        for i in np.arange(0,n_layers):
            if i == 0:
                idx.append(int(date_idx))
            else:
                idx.append(int(idx[i-1]//self.degree))
        idx.append(0)
        return idx
    
    def get_group(self, idx, level):
        """Calculates the path of index in full binary string

        Parameters:
        date_idx (int): The node in the bouttom layer we want to calculate a path to. 
        The bottom layer has index from 0 to 2**h-1
        n_layers (int): The height of the full binary tree. 0 index

        Returns:
        list: of index in the path from the starting from the bottom and going up

        """
        if level == 0:
            return id
        elif idx == 0:
            return np.arange(0,self.degree)
        else:
            group_index = idx //self.degree
            level_indicis = np.arange(0,self.degree**level)

            split_ratio = (len(level_indicis) // self.degree)
            level_indicis_split = np.array_split(level_indicis, split_ratio)
            
            return level_indicis_split[group_index]
    
    def OLH_func(self, x, g):
        if np.random.uniform(0,1) < np.exp(self.epsilon)/(np.exp(self.epsilon)+g-1):
            return x
        else:
            return np.random.randint(low = 0, high = g)
    
    def OLH_aggre(self, count, N, g):
        p = np.exp(self.epsilon)/(np.exp(self.epsilon)+g-1)
        #print(p - 1/g)
        #print(f'p = {p}')
        return (count - (1-p)*N/g) / (p)
    
    def turns_right(self, path):
        #0 is left 1 is right
        direction_lst = []
        for i in range(len(path)-1):
            current = path[i]
            nxt = path[i+1]
            if nxt == 0:
                #We went left
                direction_lst.append(0)
            elif current == 0 and current < nxt:
                #We went right
                direction_lst.append(1)
            elif self.degree * current < nxt:
                #We went right
                direction_lst.append(1)
            else:
                #print('else')            
                direction_lst.append(0)

        return direction_lst

    def turns_left(self, path):
        #0 is left 1 is right
        direction_lst = []
        for i in range(len(path)-1):
            current = path[i]
            nxt = path[i+1]
            #Checks
            if nxt == 0:
                #We went left
                direction_lst.append(0)
            #Checks
            elif nxt == current*self.degree + self.degree - 1:
                #We went right
                direction_lst.append(1)
            elif current == 0 and current < nxt:
                #We went left
                direction_lst.append(0)
            else:
                direction_lst.append(0)

        return direction_lst
    
    def answer(self, dates):
        """Calculates the path of index in full binary string

        Parameters:
        dates (tuple of string): Two dates in the format string 2000-12-19. 

        Returns:
        float: The private range count
        """
            
        date_obj_0 = datetime.strptime(dates[0],'%Y-%m-%d').date()
        date_obj_1 = datetime.strptime(dates[1],'%Y-%m-%d').date()


        idx_0 = self.idx_dict[date_obj_0]
        idx_1 = self.idx_dict[date_obj_1]
        
        #print(idx_0)
        #print(idx_1)
        idx_left = idx_0-1
        idx_right = idx_1+1
        
        path_to_left = np.flip(np.array(self.get_index(idx_left,self.h)))
        path_to_right = np.flip(np.array(self.get_index(idx_right,self.h)))
        
        left_or_right_list_leftside = self.turns_left(path_to_left)
        left_or_right_list_rightside = self.turns_right(path_to_right)
        
        range_count = 0.0
        
        #Starting in 0
        if idx_0 == 0:
            level_offset = 1

            for i in range(len(left_or_right_list_rightside)):
                if left_or_right_list_rightside[i] == 1:
                    group = self.get_group(path_to_right[i+level_offset], i+level_offset)
                    idx_sss = np.where(group == path_to_right[i+level_offset])[0][0]

                    count_nodes = self.tree_levels[i+level_offset][group[:idx_sss]]
                    hash_value = self.max_idxs[i+level_offset]
                    
                    for node in count_nodes:
                        range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)
    
        elif idx_1 == np.max(np.fromiter(self.idx_dict.values(), dtype = int)):
            
            level_offset = 1

            for i in range(len(left_or_right_list_leftside)):
                if left_or_right_list_leftside[i] == 0:

                    group = self.get_group(path_to_left[i+level_offset], i+level_offset)
                    idx_sss = np.where(group == path_to_left[i+level_offset])[0][0]

                    count_nodes = self.tree_levels[i+level_offset][group[idx_sss+1:]]
                    
                    hash_value = self.max_idxs[i+level_offset]
                    for node in count_nodes:
                        #print(node)
                        
                        range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)
                    
                    
        else:
            level_offset = 1
            left_count = []
            left_count_group = []

            for i in range(len(left_or_right_list_rightside)):
                if left_or_right_list_rightside[i] == 1:
                    group = self.get_group(path_to_right[i+level_offset], i+level_offset)
                    idx_sss = np.where(group == path_to_right[i+level_offset])[0][0]

                    left_count_group.append(group[:idx_sss]) 

                    count_nodes = self.tree_levels[i+level_offset][group[:idx_sss]]
                    left_count.append(count_nodes)

                else:
                    left_count_group.append(np.array([]))
                    left_count.append(np.array([]))

            #The search right side
            right_count = []
            right_count_group = []

            for i in range(len(left_or_right_list_leftside)):
                if left_or_right_list_leftside[i] == 0:

                    group = self.get_group(path_to_left[i+level_offset], i+level_offset)
                    idx_sss = np.where(group == path_to_left[i+level_offset])[0][0]

                    right_count_group.append(group[idx_sss+1:]) 

                    count_nodes = self.tree_levels[i+level_offset][group[idx_sss+1:]]
                    right_count.append(count_nodes)

                else:
                    right_count_group.append(np.array([]))
                    right_count.append(np.array([]))

            for i in range(len(left_count_group)):
                
                hash_value = self.max_idxs[i + level_offset]
                if left_count_group[i].size != 0 and right_count_group[i].size != 0:
                    #Both not zero
                    group_left = self.get_group(left_count_group[i][0], i+ level_offset)
                    group_right = self.get_group(right_count_group[i][0], i+ level_offset)

                    if not (np.array_equal(group_left,group_right)):
                        for node in left_count_group[i]:
                            range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)
                        for node in right_count_group[i]:
                            range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)
                    else:
                        count_nodes = np.intersect1d(left_count_group[i], right_count_group[i])
                        for node in count_nodes:
                            range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)

                if left_count_group[i].size != 0 and right_count_group[i].size == 0:
                    #Left not zero
                    for node in left_count_group[i]:
                        range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)

                if right_count_group[i].size != 0 and left_count_group[i].size == 0:
                    #Right not zero
                    for node in right_count_group[i]:

                        range_count = range_count + self.OLH_aggre(node, np.sum(self.tree_levels[i+level_offset]), hash_value)
        #Correct for sampling differnt levels in the tree               
        return range_count * (self.h+1)
    
    def real_answer(self, dates):
        if len(dates) < 2:
            date_obj_0 = datetime.strptime(dates[0],'%Y-%m-%d').date()
            return self.all_counts[self.idx_dict[date_obj_0]]
        else:
            date_obj_0 = datetime.strptime(dates[0],'%Y-%m-%d').date()
            date_obj_1 = datetime.strptime(dates[1],'%Y-%m-%d').date()
            sum_ = np.sum(self.all_counts[self.idx_dict[date_obj_0]: self.idx_dict[date_obj_1]+1])  
            return sum_ 


In [6]:
# Anchor the path on the notebook's start directory (`owd`) instead of the
# current directory, so that re-running this cell does not try to descend
# into range_queries/flat a second time.
os.chdir(os.path.join(owd, 'range_queries', 'flat'))

In [7]:
# os.listdir returns names in arbitrary order; sort them so that the order in
# which the query files are loaded (and therefore which batch each data
# structure receives) is the same on every run and machine.
files = sorted(f for f in os.listdir('.') if re.match(r'.*\.csv', f))
files

['flat_N=1024_r=44.csv',
 'flat_N=128_r=9.csv',
 'flat_N=2048_r=64.csv',
 'flat_N=256_r=24.csv',
 'flat_N=32_r=6.csv',
 'flat_N=512_r=35.csv',
 'range_queries_flat_N=1024_r=44.csv',
 'range_queries_flat_N=128_r=9.csv',
 'range_queries_flat_N=2048_r=64.csv',
 'range_queries_flat_N=256_r=24.csv',
 'range_queries_flat_N=32_r=6.csv',
 'range_queries_flat_N=512_r=35.csv']

In [8]:
# Collect the query batches of the "flat" query files, grouped by the stream
# length N that appears in the file name. Every file contributes
# n_data_structures batches; batches of several files are concatenated.
flat_sizes = (32, 128, 256, 512, 1024, 2048)
flat_queries = {size: [] for size in flat_sizes}

for f in files:
    for size in flat_sizes:
        if f'N={size}' in f:
            flat_queries[size].extend(load_range_queries_n_split(f, n_data_structures))

n_32_flat = flat_queries[32]
n_128_flat = flat_queries[128]
n_256_flat = flat_queries[256]
n_512_flat = flat_queries[512]
n_1024_flat = flat_queries[1024]
n_2048_flat = flat_queries[2048]
    

In [9]:
# Switch to the hierarchical query files. The path is anchored on the
# notebook's start directory (`owd`), so the cell can be re-run at any time.
os.chdir(os.path.join(owd, 'range_queries', 'local_hh'))
# os.listdir returns names in arbitrary order; sort for a reproducible load order.
files = sorted(f for f in os.listdir('.') if re.match(r'.*\.csv', f))
files

['hh_N=1024_B=2_r=402.csv',
 'hh_N=1024_B=3_r=296.csv',
 'hh_N=1024_B=4_r=202.csv',
 'hh_N=2048_B=2_r=485.csv',
 'hh_N=2048_B=3_r=294.csv',
 'hh_N=2048_B=4_r=288.csv',
 'hh_N=256_B=2_r=44.csv',
 'hh_N=256_B=3_r=217.csv',
 'hh_N=256_B=4_r=130.csv',
 'hh_N=512_B=2_r=325.csv',
 'hh_N=512_B=3_r=217.csv',
 'hh_N=512_B=4_r=200.csv']

In [10]:
# Collect the query batches of the hierarchical query files, grouped by the
# stream length N that appears in the file name. Every file contributes
# n_data_structures batches; batches of several files are concatenated.
hh_sizes = (256, 512, 1024, 2048)
hh_queries = {size: [] for size in hh_sizes}

for f in files:
    for size in hh_sizes:
        if f'N={size}' in f:
            hh_queries[size].extend(load_range_queries_n_split(f, n_data_structures))

n_256_hh = hh_queries[256]
n_512_hh = hh_queries[512]
n_1024_hh = hh_queries[1024]
n_2048_hh = hh_queries[2048]

In [11]:
len(n_256_hh)

150

In [12]:
def model_answers(model, query):
    """Return (private estimate, exact answer) of `model` for one range query.

    `model.answer` is called before `model.real_answer`.
    """
    return model.answer(query), model.real_answer(query)

In [16]:
epsilons = np.array([2,1.4,1.2,1,0.8,0.6,0.5,0.3])
n = np.array([32,128,256,512,1024,2048])
degrees = np.array([2,3,4])

os.chdir(owd)

results_dir = 'results/sample_querys/local_hh/'
# np.savetxt does not create missing directories.
os.makedirs(results_dir, exist_ok=True)

# Query batches per stream length N; batch i is evaluated on data structure i.
# NOTE(review): every list holds the batches of several query files one after
# another (one file per B for the hh queries), but only the first
# n_data_structures batches are read below, independent of `degree` - confirm
# that this is intended.
flat_queries_by_n = {
    32: n_32_flat,
    128: n_128_flat,
    256: n_256_flat,
    512: n_512_flat,
    1024: n_1024_flat,
    2048: n_2048_flat,
}
# There are no hh query files for N = 32 and N = 128.
hh_queries_by_n = {
    256: n_256_hh,
    512: n_512_hh,
    1024: n_1024_hh,
    2048: n_2048_hh,
}


def evaluate_queries(model, queries):
    """Run every 'start, end' query string on `model`.

    Returns two lists of equal length: the private estimates and the exact answers.
    """
    estimates, truths = [], []
    for query in queries:
        est, cor = model_answers(model, tuple(query.split(', ')))
        estimates.append(est)
        truths.append(cor)
    return estimates, truths


for e in epsilons:
    for N in n:
        for degree in degrees:
            csv_name_est_flat = results_dir + f'est_flat_queries_e={e}_N={N}_B={degree}.csv'
            csv_name_cor_flat = results_dir + f'cor_flat_queries_e={e}_N={N}_B={degree}.csv'

            csv_name_est_hh = results_dir + f'est_hh_queries_e={e}_N={N}_B={degree}.csv'
            csv_name_cor_hh = results_dir + f'cor_hh_queries_e={e}_N={N}_B={degree}.csv'

            answers_flat = []
            correct_answers_flat = []

            answers_hh = []
            correct_answers_hh = []

            # A fresh, independently randomised structure per query batch.
            for i in range(0,n_data_structures):
                # The historical variable name is kept; the object is the
                # hierarchical structure.
                FLAT_OLH = HH_OLH_degree(e,degree, all_dates[:N], all_counts[:N])

                # Each N uses its own flat query set (N = 256 used to read the
                # N = 32 queries).
                est, cor = evaluate_queries(FLAT_OLH, flat_queries_by_n[int(N)][i])
                answers_flat.extend(est)
                correct_answers_flat.extend(cor)

                # hh queries for every N that has them (N = 256 used to be skipped).
                hh_batches = hh_queries_by_n.get(int(N))
                if hh_batches is not None:
                    est, cor = evaluate_queries(FLAT_OLH, hh_batches[i])
                    answers_hh.extend(est)
                    correct_answers_hh.extend(cor)

            np.savetxt(csv_name_est_flat, answers_flat, delimiter=',')
            np.savetxt(csv_name_cor_flat, correct_answers_flat, delimiter=',')

            np.savetxt(csv_name_est_hh, answers_hh, delimiter=',')
            np.savetxt(csv_name_cor_hh, correct_answers_hh, delimiter=',')

"""
n_32_flat = []
n_128_flat = []
n_256_flat = []
n_512_flat = []
n_1024_flat = []
n_2048_flat = []

n_256_hh = []
n_512_hh = []
n_1024_hh = []
n_2048_hh = []

"""
            
                

'\nn_32_flat = []\nn_128_flat = []\nn_256_flat = []\nn_512_flat = []\nn_1024_flat = []\nn_2048_flat = []\n\nn_256_hh = []\nn_512_hh = []\nn_1024_hh = []\nn_2048_hh = []\n\n'

In [None]:
"""
FLAT_OLH = OLH_flat(epsilon, all_dates[:32], all_counts[:32])
print(FLAT_OLH.answer(query_dates))
print(FLAT_OLH.real_answer(query_dates))
print(FLAT_OLH.noise_counts)
print(FLAT_OLH.var)
"""
