In [1]:
import numpy as np
import pandas as pd

from psql_functions import execQuery
from miss_data import add_missing_dates, add_missing_counts
#from sample_range_query import load_range_queries

import re
import os
# Remember the directory the notebook started in; later cells call os.chdir(owd)
# to return here after changing into the query-file folders.
owd = os.getcwd()

In [2]:
# Reset the working directory to the saved start directory (owd) before any
# relative paths are used below.
os.chdir(owd)

In [3]:
def load_range_queries_n_split(file, n_structures):
    """Read one query per line from ``file`` and split them into ``n_structures`` chunks.

    The file is read line by line instead of through ``pd.read_csv(sep='\\n')``,
    because current pandas versions reject the line terminator as a separator.
    Each non-blank line is kept as a single string (embedded commas are NOT
    treated as column separators); a line wrapped in double quotes is unquoted,
    as a CSV reader would do. Values are always returned as strings.

    Parameters:
    file (str or path-like): Path of the query file.
    n_structures (int): Number of (nearly) equal chunks to produce.

    Returns:
    list of numpy.ndarray: ``n_structures`` object arrays of query strings, in
    file order, as produced by ``np.array_split`` (the first chunks are one
    element longer when the lines do not divide evenly).
    """
    import csv  # local import: only this loader needs it

    # The unit-separator character never occurs in the query text, so every
    # line is parsed as exactly one field while quoting is still honoured.
    with open(file, newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter='\x1f')
        lines = [row[0] for row in rows if row and row[0].strip()]

    all_queries = np.array(lines, dtype=object)
    split_queries = np.array_split(all_queries, n_structures)
    return split_queries

In [4]:
# Connection settings for the PostgreSQL instance that holds the visit counts.
# The password is taken from the PGPASSWORD environment variable when it is
# set, so real credentials do not have to live in the notebook; the previous
# literal is kept as the fallback so existing local setups keep working.
param_dic = {
    "host"      : "localhost",
    "database"  : "bachelorBesoeg2014",
    "user"      : "postgres",
    "password"  : os.environ.get("PGPASSWORD", "password"),
    "port"      : "5432"
}

# Fetch both columns in ONE ordered query. Two separate unordered SELECTs give
# no guarantee that row i of the dates matches row i of the counts, and the
# code below relies on the first/last date being the earliest/latest.
# NOTE(review): assumes execQuery returns one sequence per row (the original
# code already indexed rows with [0]) - confirm for multi-column selects.
query = """select time_, count_ from _775147 order by time_;"""
result = execQuery(param_dic, query)
rows = list(result)

dates = [row[0] for row in rows]
counts = [row[1] for row in rows]

# Fill calendar gaps so the stream has one entry per day.
all_dates = add_missing_dates(dates)
all_counts =  add_missing_counts(counts, dates, all_dates)

# Number of independent randomised structures built per (epsilon, N) setting;
# also the number of chunks each query file is split into.
n_data_structures = 50


Executed query and closed connection.
Executed query and closed connection.


In [5]:
import numpy as np
import pandas as pd
from scipy.stats import laplace
from datetime import datetime
from datetime import timedelta 

class OLH_flat:
    """Flat locally-randomised histogram over a daily count stream.

    Every unit counted on a date is treated as one user whose value is the
    index of that date. Each user reports through ``OLH_func``; the reports are
    tallied per date in ``noise_counts`` and range counts are estimated from
    those tallies with ``OLH_aggre``.
    """

    def __init__(self, epsilon, dates, counts):
        """Build the randomised structure.

        Parameters:
        epsilon (float): Privacy parameter used in the keep probability
            e^epsilon / (e^epsilon + D - 1), with D the number of dates.
        dates (sequence of datetime.date): Dates of the stream in ascending
            order. Gaps are filled with zero-count days.
        counts (sequence of numbers): The count for each entry of ``dates``.

        Attributes set:
        all_dates, all_counts: the gap-free stream.
        idx_dict: maps each date to its position in ``all_dates``.
        noise_counts: randomised per-date tallies.
        p, var: keep probability and the value of ``OLH_var`` for it.
        """
        self.epsilon = epsilon
        self.all_dates = dates
        self.all_counts = counts

        # A gap-free stream from dates[0] to dates[-1] has (span in days) + 1
        # entries; anything shorter is missing at least one day.
        expected_len = (dates[-1] - dates[0]).days + 1
        if len(dates) < expected_len:
            self.all_dates = self.__add_missing_dates(dates)
            self.all_counts = self.__add_missing_counts(counts, dates)

        # Make dict for date indexing
        positions = np.arange(0, len(self.all_dates))
        self.idx_dict = dict(zip(self.all_dates, positions))

        self.noise_counts = self.__process(self.all_dates, self.all_counts)

        self.p = np.exp(self.epsilon)/(np.exp(self.epsilon)+len(self.all_dates)-1)
        self.var = self.OLH_var(self.p, len(self.all_dates))

    def __add_missing_dates(self, old_dates):
        """Return every calendar day between the first and last given date.

        Parameters:
        old_dates (list of datetime.date): Ascending dates, possibly with gaps.
        Returns:
        list of datetime.date: Continuous daily dates from ``old_dates[0]`` to
        ``old_dates[-1]`` inclusive.
        """
        start_date = old_dates[0]
        end_date = old_dates[-1]
        all_dates = pd.date_range(start = start_date, end = end_date).to_pydatetime().tolist()
        return [(date.date()) for date in all_dates]

    def OLH_var(self, p, N):
        """Return 4*p*(1-p) / (N*(2*p-1)**2).

        NOTE(review): the constructor passes the number of dates as ``N`` -
        confirm that this, and not the number of reports, is intended.
        """
        return 4*p*(1-p)/(N*(2*p-1)**2)

    def __add_missing_counts(self, old_counts, old_dates):
        """Align the counts with ``self.all_dates``, using 0 for inserted days.

        Parameters:
        old_counts (list of int): Counts matching ``old_dates``.
        old_dates (list of datetime.date): Dates that are not continuous.
        Returns:
        numpy.ndarray: One float count per entry of ``self.all_dates``.
        """
        known = dict(zip(old_dates, old_counts))
        return np.array([known.get(day, 0) for day in self.all_dates], dtype=float)

    def OLH_func(self, x, g):
        """Randomise a single value ``x`` from the domain ``0..g-1``.

        With probability e^epsilon / (e^epsilon + g - 1) the true value is
        returned, otherwise a uniformly random value from the whole domain.

        NOTE(review): the random branch may also return ``x``, so the
        likelihood ratio between two inputs is 1 + g*e^epsilon/(g-1), slightly
        above e^epsilon - confirm this is the intended mechanism.
        """
        if np.random.uniform(0,1) < np.exp(self.epsilon)/(np.exp(self.epsilon)+g-1):
            return x
        else:
            return np.random.randint(low = 0, high = g)

    def OLH_aggre(self, count, N, g):
        """Unbiased estimate of a true count from its randomised tally.

        Under ``OLH_func`` a tally has expectation p*true + (1-p)*N/g, where N
        is the total number of reports and g the domain size; this inverts that
        relation. Works element-wise when ``count`` is an array.
        """
        p = np.exp(self.epsilon)/(np.exp(self.epsilon)+g-1)
        return (count - (1-p)*N/g) / (p)

    def OLH_answer(self, count, N, g):
        """Return (count - N/g) / p.

        Kept for backward compatibility. For the mechanism in ``OLH_func`` this
        estimate is offset by -N/g from the true count, so ``answer`` uses
        ``OLH_aggre`` instead.
        """
        p = np.exp(self.epsilon)/(np.exp(self.epsilon)+g-1)
        return (count- N/g) / (p)

    def __process(self, dates, counts):
        """Randomise every user's date index and tally the reports per date.

        Draws the same distribution as calling ``OLH_func(idx, D)`` once per
        user, but in vectorised form per date instead of one Python call per
        user.
        """
        OLH_count = np.zeros(len(counts))
        D = len(dates)
        p_keep = np.exp(self.epsilon)/(np.exp(self.epsilon)+D-1)

        for idx, count in enumerate(counts):
            n_users = int(count)
            if n_users <= 0:
                continue
            # Users that report their true date index.
            n_kept = int(np.count_nonzero(np.random.uniform(0, 1, size=n_users) < p_keep))
            OLH_count[idx] += n_kept
            # The rest report a uniformly random index (np.add.at handles
            # repeated indices correctly).
            n_random = n_users - n_kept
            if n_random:
                random_reports = np.random.randint(low=0, high=D, size=n_random)
                np.add.at(OLH_count, random_reports, 1)

        return OLH_count

    def _query_bounds(self, dates):
        """Translate a query of one or two 'YYYY-MM-DD' strings into index bounds."""
        first = datetime.strptime(dates[0],'%Y-%m-%d').date()
        if len(dates) < 2:
            # A single date is the range [date, date].
            last = first
        else:
            last = datetime.strptime(dates[1],'%Y-%m-%d').date()
        return self.idx_dict[first], self.idx_dict[last]

    def answer(self, dates):
        """Estimate the count for a single date or an inclusive date range.

        Parameters:
        dates (tuple of string): One or two dates in the format 2000-12-19.

        Returns:
        float: The private (estimated) count; a one-date query gives the same
        result as the range query with that date as both ends.

        Raises:
        KeyError: If a date lies outside the stream.
        """
        N = np.sum(self.noise_counts)
        D = len(self.noise_counts)
        idx_0, idx_1 = self._query_bounds(dates)
        # Sum of the per-date unbiased estimates over the inclusive range.
        return np.sum(self.OLH_aggre(self.noise_counts[idx_0:idx_1+1], N, D))

    def real_answer(self, dates):
        """Return the true (non-private) count for the same query format as ``answer``."""
        idx_0, idx_1 = self._query_bounds(dates)
        if len(dates) < 2:
            return self.all_counts[idx_0]
        return np.sum(self.all_counts[idx_0:idx_1+1])
"""
query_dates = ('2014-01-02','2014-01-9')
epsilon = 0.7
FLAT_OLH = OLH_flat(epsilon, all_dates[:32], all_counts[:32])
print(FLAT_OLH.answer(query_dates))
print(FLAT_OLH.real_answer(query_dates))
print(FLAT_OLH.noise_counts)
print(FLAT_OLH.var)
"""

"\nquery_dates = ('2014-01-02','2014-01-9')\nepsilon = 0.7\nFLAT_OLH = OLH_flat(epsilon, all_dates[:32], all_counts[:32])\nprint(FLAT_OLH.answer(query_dates))\nprint(FLAT_OLH.real_answer(query_dates))\nprint(FLAT_OLH.noise_counts)\nprint(FLAT_OLH.var)\n"

In [6]:
# Switch to the folder holding the flat-structure range-query files.
# The path is built from the saved start directory (owd) instead of the current
# directory, so re-running this cell does not append the sub-path a second time.
os.chdir(os.path.join(owd, 'range_queries', 'flat'))

In [7]:
# CSV files in the current directory. Sorted because os.listdir returns names
# in arbitrary order and later cells only use the first n_data_structures
# splits per N, so file order decides which query file is evaluated.
# endswith() also avoids matching names such as 'x.csv.bak'.
files = sorted(f for f in os.listdir('.') if f.endswith('.csv'))
files

['flat_N=1024_r=44.csv',
 'flat_N=128_r=9.csv',
 'flat_N=2048_r=64.csv',
 'flat_N=256_r=24.csv',
 'flat_N=32_r=6.csv',
 'flat_N=512_r=35.csv',
 'range_queries_flat_N=1024_r=44.csv',
 'range_queries_flat_N=128_r=9.csv',
 'range_queries_flat_N=2048_r=64.csv',
 'range_queries_flat_N=256_r=24.csv',
 'range_queries_flat_N=32_r=6.csv',
 'range_queries_flat_N=512_r=35.csv']

In [8]:
# Collect the query splits per stream length N. Every file whose name contains
# the tag contributes its n_data_structures chunks, appended in file order, so
# each resulting list is already flat (one array of queries per entry).
flat_splits_by_tag = {
    'N=32': [],
    'N=128': [],
    'N=256': [],
    'N=512': [],
    'N=1024': [],
    'N=2048': [],
}

for f in files:
    for size_tag, collected in flat_splits_by_tag.items():
        if size_tag in f:
            collected.extend(load_range_queries_n_split(f, n_data_structures))

n_32_flat = flat_splits_by_tag['N=32']
n_128_flat = flat_splits_by_tag['N=128']
n_256_flat = flat_splits_by_tag['N=256']
n_512_flat = flat_splits_by_tag['N=512']
n_1024_flat = flat_splits_by_tag['N=1024']
n_2048_flat = flat_splits_by_tag['N=2048']
    

In [9]:
# Switch to the folder holding the hierarchical-histogram (local_hh) query
# files, addressed from the saved start directory so the cell can be re-run.
os.chdir(os.path.join(owd, 'range_queries', 'local_hh'))

# Sorted for a deterministic file order (os.listdir order is arbitrary and
# later cells only use the first n_data_structures splits per N);
# endswith() avoids matching names such as 'x.csv.bak'.
files = sorted(f for f in os.listdir('.') if f.endswith('.csv'))
files

['hh_N=1024_B=2_r=402.csv',
 'hh_N=1024_B=3_r=296.csv',
 'hh_N=1024_B=4_r=202.csv',
 'hh_N=2048_B=2_r=485.csv',
 'hh_N=2048_B=3_r=294.csv',
 'hh_N=2048_B=4_r=288.csv',
 'hh_N=256_B=2_r=44.csv',
 'hh_N=256_B=3_r=217.csv',
 'hh_N=256_B=4_r=130.csv',
 'hh_N=512_B=2_r=325.csv',
 'hh_N=512_B=3_r=217.csv',
 'hh_N=512_B=4_r=200.csv']

In [10]:
# Collect the local_hh query splits per stream length N. Every matching file
# contributes its n_data_structures chunks, appended in file order, which
# yields the same flat lists as appending per file and flattening afterwards.
hh_splits_by_tag = {
    'N=256': [],
    'N=512': [],
    'N=1024': [],
    'N=2048': [],
}

for f in files:
    for size_tag, collected in hh_splits_by_tag.items():
        if size_tag in f:
            collected.extend(load_range_queries_n_split(f, n_data_structures))

n_256_hh = hh_splits_by_tag['N=256']
n_512_hh = hh_splits_by_tag['N=512']
n_1024_hh = hh_splits_by_tag['N=1024']
n_2048_hh = hh_splits_by_tag['N=2048']

In [11]:
# Sanity check: with the three N=256 files listed above and n_data_structures = 50
# splits per file, this list holds 3 * 50 = 150 query chunks.
len(n_256_hh)

150

In [12]:
def model_answers(model, query):
    """Ask ``model`` the same query twice: privately and exactly.

    Parameters:
    model: Object exposing ``answer(query)`` and ``real_answer(query)``.
    query: The query passed unchanged to both methods.

    Returns:
    tuple: ``(estimate, truth)`` - the result of ``model.answer`` followed by
    the result of ``model.real_answer``.
    """
    # Tuple elements are evaluated left to right: estimate first, then truth.
    return model.answer(query), model.real_answer(query)

In [13]:
epsilons = np.array([2,1.4,1.2,1,0.8,0.6,0.5,0.3])
n = np.array([32,128,256,512,1024,2048])
os.chdir(owd)

# Make sure the output folder exists before np.savetxt writes into it.
os.makedirs('results/sample_querys/flat/', exist_ok=True)

# Query splits per stream length. A lookup table replaces one copy-pasted
# branch per N; the old N == 256 branch iterated n_32_flat by mistake.
flat_query_sets = {
    32: n_32_flat,
    128: n_128_flat,
    256: n_256_flat,
    512: n_512_flat,
    1024: n_1024_flat,
    2048: n_2048_flat,
}
# local_hh query sets only exist for N >= 256. N = 256 was loaded earlier but
# never evaluated, unlike the other three sizes; it is included here.
hh_query_sets = {
    256: n_256_hh,
    512: n_512_hh,
    1024: n_1024_hh,
    2048: n_2048_hh,
}


def collect_answers(model, queries, estimates, truths):
    """Evaluate every 'date' or 'date, date' query string on ``model``.

    The private estimate is appended to ``estimates`` and the exact answer to
    ``truths`` (both lists are modified in place).
    """
    for query in queries:
        est, cor = model_answers(model, tuple(query.split(', ')))
        estimates.append(est)
        truths.append(cor)


# We have 50 datastructures (n_data_structures) per (epsilon, N) setting.
for e in epsilons:
    for N in n:
        csv_name_est_flat = 'results/sample_querys/flat/' + f'est_flat_queries_e={e}_N={N}.csv'
        csv_name_cor_flat = 'results/sample_querys/flat/' + f'cor_flat_queries_e={e}_N={N}.csv'

        # NOTE(review): the hh result files are written into the flat/ results
        # folder, as before - confirm that this is the intended location.
        csv_name_est_hh = 'results/sample_querys/flat/' + f'est_hh_queries_e={e}_N={N}.csv'
        csv_name_cor_hh = 'results/sample_querys/flat/' + f'cor_hh_queries_e={e}_N={N}.csv'

        answers_flat = []
        correct_answers_flat = []

        answers_hh = []
        correct_answers_hh = []

        flat_splits = flat_query_sets[int(N)]
        hh_splits = hh_query_sets.get(int(N))  # None when no hh queries exist for this N

        for i in range(0,n_data_structures):
            # A fresh randomised structure per split, so every chunk of
            # queries is answered by an independent run of the mechanism.
            FLAT_OLH = OLH_flat(e, all_dates[:N], all_counts[:N])

            collect_answers(FLAT_OLH, flat_splits[i], answers_flat, correct_answers_flat)

            if hh_splits is not None:
                collect_answers(FLAT_OLH, hh_splits[i], answers_hh, correct_answers_hh)

        np.savetxt(csv_name_est_flat, answers_flat, delimiter=',')
        np.savetxt(csv_name_cor_flat, correct_answers_flat, delimiter=',')

        # For N without hh queries these files are written empty, as before.
        np.savetxt(csv_name_est_hh, answers_hh, delimiter=',')
        np.savetxt(csv_name_cor_hh, correct_answers_hh, delimiter=',')

"""
n_32_flat = []
n_128_flat = []
n_256_flat = []
n_512_flat = []
n_1024_flat = []
n_2048_flat = []

n_256_hh = []
n_512_hh = []
n_1024_hh = []
n_2048_hh = []

"""
            
                

'\nn_32_flat = []\nn_128_flat = []\nn_256_flat = []\nn_512_flat = []\nn_1024_flat = []\nn_2048_flat = []\n\nn_256_hh = []\nn_512_hh = []\nn_1024_hh = []\nn_2048_hh = []\n\n'

In [14]:
"""
FLAT_OLH = OLH_flat(epsilon, all_dates[:32], all_counts[:32])
print(FLAT_OLH.answer(query_dates))
print(FLAT_OLH.real_answer(query_dates))
print(FLAT_OLH.noise_counts)
print(FLAT_OLH.var)
"""


'\nFLAT_OLH = OLH_flat(epsilon, all_dates[:32], all_counts[:32])\nprint(FLAT_OLH.answer(query_dates))\nprint(FLAT_OLH.real_answer(query_dates))\nprint(FLAT_OLH.noise_counts)\nprint(FLAT_OLH.var)\n'