In [1]:
import sys
# NOTE(review): hard-coded, machine-specific absolute path. Presumably this is what
# makes the `src.*` project imports resolvable — confirm, and prefer a relative or
# configurable project root so the notebook runs on other machines.
sys.path.append("/home/jarlehti/projects/gradu")

In [2]:
import os
import itertools
import pandas as pd
import pickle
from src.utils.preprocess_dataset import get_adult_train_small, get_adult_train_large, clean_adult_with_discretization, get_adult_train_no_discretization, get_adult_train_raw, ADULT_COLUMNS_SMALL, ADULT_COLUMNS_LARGE, get_adult_train_high_discretization, get_adult_train_independence_pruning, get_adult_train_low_discretization
from src.napsu_mq.dataframe_data import DataFrameData
from src.napsu_mq.mst import MST_selection, Domain, Dataset
from src.napsu_mq.marginal_query import FullMarginalQuerySet
from src.napsu_mq.marginal_query_torch import FullMarginalQuerySet as FullMarginalQuerySetTorch
from src.utils.query_utils import calculate_query_number

In [3]:
# Project root = parent of the kernel's working directory.
# `__name__` only serves as a relative placeholder path: abspath() anchors it to the
# working directory, the first `os.pardir` strips the placeholder again and the
# second one climbs one level up.
CURRENT_FOLDER = os.path.normpath(
    os.path.join(os.path.abspath(__name__), os.pardir, os.pardir)
)
# Both data directories sit under <root>/data.
DATASETS_FOLDER, ORIG_RESULTS_FOLDER = (
    os.path.join(CURRENT_FOLDER, "data", leaf) for leaf in ("datasets", "orig_results")
)

In [4]:
# Load one frame per preprocessing variant of the Adult training set; every loader
# receives the same DATASETS_FOLDER. What each variant contains is defined in
# src.utils.preprocess_dataset, not in this notebook.
adult_small = get_adult_train_small(DATASETS_FOLDER)
adult_large = get_adult_train_large(DATASETS_FOLDER)
adult_no_discretization = get_adult_train_no_discretization(DATASETS_FOLDER)
adult_high_discretization = get_adult_train_high_discretization(DATASETS_FOLDER)
adult_raw = get_adult_train_raw(DATASETS_FOLDER)
adult_independence_pruning = get_adult_train_independence_pruning(DATASETS_FOLDER)
adult_low_discretization = get_adult_train_low_discretization(DATASETS_FOLDER)

In [5]:
# Sanity check: the recorded output shows all six columns as `category`.
adult_small.dtypes

age               category
education-num     category
marital-status    category
sex               category
hours-per-week    category
compensation      category
dtype: object

In [6]:
# dtype="category" asks pandas to parse every column of the CSV as categorical.
adult_reduced = pd.read_csv(os.path.join(DATASETS_FOLDER, "adult-reduced-discretised-copy.csv"), dtype="category")

In [7]:
# Display the high-discretization frame (the recorded repr is truncated in the middle).
adult_high_discretization

Unnamed: 0,age,workclass,education-num,marital-status,sex,hours-per-week,had-capital-gains,had-capital-losses,compensation
0,"(16.927, 53.5]",State-gov,13,Never-married,Male,"(0.902, 50.0]",1,0,0
1,"(16.927, 53.5]",Self-emp-not-inc,13,Married,Male,"(0.902, 50.0]",0,0,0
2,"(16.927, 53.5]",Private,9,Divorced,Male,"(0.902, 50.0]",0,0,0
3,"(16.927, 53.5]",Private,7,Married,Male,"(0.902, 50.0]",0,0,0
4,"(16.927, 53.5]",Private,13,Married,Female,"(0.902, 50.0]",0,0,0
...,...,...,...,...,...,...,...,...,...
30157,"(16.927, 53.5]",Private,12,Married,Female,"(0.902, 50.0]",0,0,0
30158,"(16.927, 53.5]",Private,9,Married,Male,"(0.902, 50.0]",0,0,1
30159,"(53.5, 90.0]",Private,9,Widowed,Female,"(0.902, 50.0]",0,0,0
30160,"(16.927, 53.5]",Private,9,Never-married,Male,"(0.902, 50.0]",0,0,0


In [8]:
def calculate_canonical_queries(data, epsilon, column_feature_set=None):
    """Run MST marginal selection on ``data`` and report the canonical query count.

    Args:
        data: Frame wrapped in ``DataFrameData``; its ``values_by_col`` mapping
            is printed and used to build the ``Domain``.
        epsilon: Forwarded unchanged to ``MST_selection``.
        column_feature_set: Optional list of column-name tuples forwarded to
            ``MST_selection`` as ``cliques_to_include``. ``None`` (the default)
            is treated as an empty list.

    Returns:
        The value of ``calculate_query_number`` applied to the canonical query
        set's ``queries`` attribute.
    """
    # Build a fresh list per call: a literal ``[]`` default is created once and
    # shared by every call, so any mutation downstream would leak between calls.
    if column_feature_set is None:
        column_feature_set = []

    dataframe = DataFrameData(data)
    n, _ = dataframe.int_array.shape
    print(dataframe.values_by_col)

    # delta is tied to the number of rows: delta = n^-2.
    delta = n ** (-2)

    domain_key_list = list(dataframe.values_by_col.keys())
    domain_value_count_list = [len(dataframe.values_by_col[key]) for key in domain_key_list]
    domain = Domain(domain_key_list, domain_value_count_list)

    query_sets = MST_selection(
        Dataset(dataframe.int_df, domain),
        epsilon,
        delta,
        cliques_to_include=column_feature_set,
    )

    full_queries = FullMarginalQuerySet(query_sets, dataframe.values_by_col)

    # The flattened list is not used here; the call is kept in case flatten()
    # has side effects that get_canonical_queries() relies on — TODO confirm and drop.
    full_queries.flatten()

    canonical_queries = full_queries.get_canonical_queries()
    return calculate_query_number(canonical_queries.queries)

In [9]:
# Baseline: small variant, epsilon=1, no cliques forced into the selection.
print(calculate_canonical_queries(adult_small, 1))

{'age': [0, 1, 2, 3, 4], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                                                                   | 0/12 [00:00<?, ?it/s]
5it [00:00, 5095.12it/s]

2it [00:00, 1746.54it/s]

32it [00:00, 7649.48it/s]

5it [00:00, 6717.34it/s]

10it [00:00, 4407.63it/s]

16it [00:00, 5367.42it/s]

25it [00:00, 8868.20it/s]

10it [00:00, 4895.88it/s]

10it [00:00, 6812.25it/s]

5it [00:00, 9023.89it/s]

2it [00:00, 4785.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 127.51it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 48545.19it/s]


Calculating new queries, not_original_clique_queries length: 29


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 10433.59it/s]

72





In [10]:
# Large variant with the (education-num, compensation) pair passed as a forced clique.
query = [('education-num', 'compensation')]
print(calculate_canonical_queries(adult_large, 1, query))

{'age': [0, 1, 2, 3, 4], 'workclass': [0, 1, 2, 3, 4, 5, 6], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'had-capital-gains': [0, 1], 'had-capital-losses': [0, 1], 'compensation': [0, 1]}
Calculating canonical queries, clique_set length: 18


  0%|                                                                                                                                                   | 0/18 [00:00<?, ?it/s]
35it [00:00, 21260.05it/s]

2it [00:00, 6523.02it/s]

16it [00:00, 12517.98it/s]

10it [00:00, 12318.07it/s]

5it [00:00, 16094.80it/s]

4it [00:00, 6751.39it/s]

25it [00:00, 20850.59it/s]

2it [00:00, 15827.56it/s]

2it [00:00, 15857.48it/s]

7it [00:00, 29360.13it/s]

10it [00:00, 23871.96it/s]

2it [00:00, 8719.97it/s]

4it [00:00, 9093.34it/s]

5it [00:00, 12810.95it/s]

32it [00:00, 30059.96it/s]

5it [00:00, 22429.43it/s]

10it [00:00, 9633.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 341.38it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 88534.12it/s]


Calculating new queries, not_original_clique_queries length: 62


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 22567.63it/s]

106





In [11]:
# Pass every unordered pair of columns of the small variant as forced cliques.
adult_small_full_marginal_set = list(itertools.combinations(adult_small.columns, 2))
print(calculate_canonical_queries(adult_small, 1, adult_small_full_marginal_set))

{'age': [0, 1, 2, 3, 4], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 22


  0%|                                                                                                                                                   | 0/22 [00:00<?, ?it/s]
5it [00:00, 8378.55it/s]

10it [00:00, 10934.06it/s]

2it [00:00, 3647.22it/s]

25it [00:00, 15692.55it/s]

10it [00:00, 10837.99it/s]

10it [00:00, 8276.05it/s]

32it [00:00, 20286.84it/s]

5it [00:00, 22525.80it/s]

16it [00:00, 33042.28it/s]

10it [00:00, 9031.66it/s]

80it [00:00, 22079.64it/s]

25it [00:00, 16384.00it/s]

80it [00:00, 21447.38it/s]

25it [00:00, 15170.37it/s]

32it [00:00, 14466.23it/s]

5it [00:00, 23172.95it/s]

2it [00:00, 15391.94it/s]

4it [00:00, 14768.68it/s]

10it [00:00, 17331.83it/s]

80it [00:00, 19250.97it/s]

10it [00:00, 11980.30it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 272.86it/s]
100%|██████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 29


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 19082.96it/s]

312





In [12]:
# The recorded output shows four int64 columns alongside the categorical ones.
adult_no_discretization.dtypes

age                      int64
workclass             category
education-num         category
marital-status        category
sex                   category
hours-per-week           int64
had-capital-gains        int64
had-capital-losses       int64
compensation          category
dtype: object

In [14]:
# Hand-picked pairs passed as forced cliques for the non-discretized variant.
queries = [('age', 'marital-status'), ('age', 'hours-per-week'), ('age', 'workclass'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'compensation'), ('had-capital-gains', 'compensation'), ('had-capital-gains', 'had-capital-losses')]
# Cast every column to categorical before counting.
# NOTE(review): this rebinds the loaded frame under its own name, so the original
# mixed-dtype frame is no longer reachable after this cell has run.
adult_no_discretization = adult_no_discretization.astype("category")
print(calculate_canonical_queries(adult_no_discretization, 1, queries))

{'age': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'workclass': [0, 1, 2, 3, 4, 5, 6], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4, 5, 6], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93], 'had-capital-gains': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 3

  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 18


  0%|                                                                                                                                                   | 0/18 [00:00<?, ?it/s]
504it [00:00, 42518.39it/s]

118it [00:00, 42721.44it/s]

16it [00:00, 21916.68it/s]

14it [00:00, 21107.21it/s]

94it [00:00, 53035.32it/s]

504it [00:00, 35752.35it/s]

2it [00:00, 12865.96it/s]

2it [00:00, 6452.78it/s]

7it [00:00, 42924.16it/s]

14it [00:00, 16513.01it/s]

90it [00:00, 49734.83it/s]

0it [00:00, ?it/s][A
4232it [00:00, 42316.31it/s][A
10620it [00:00, 42953.08it/s][A
 67%|████████████████████████████████████████████████████████████████████████████████████████████                                              | 12/18 [00:00<00:00, 39.43it/s]
0it [00:00, ?it/s][A
6768it [00:00, 45771.32it/s][A

236it [00:00, 31807.70it/s]

72it [00:00, 50508.43it/s]

32it [00:00, 39383.14it/s]
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 537


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 537/537 [00:00<00:00, 8064.47it/s]

18411





In [15]:
#adult_reduced_full_marginal_set = list(itertools.combinations(adult_reduced.columns, 2))
# Hand-picked pairs for the reduced CSV, whose column names differ from the other
# variants ('education', 'race', 'capital-gain', 'capital-loss').
# NOTE(review): rebinds `queries`, which an earlier cell already used for another list.
queries = [
    ('age', 'compensation'), 
    ('age', 'marital-status'), 
    ('age', 'workclass'), 
    ('education', 'compensation'), 
    ('race', 'compensation'), 
    ('race', 'sex'), 
    ('sex', 'compensation'), 
    ('capital-gain', 'compensation'), 
    ('capital-loss', 'compensation'), 
    ('hours-per-week', 'compensation')
]
# epsilon is 0.1 here, not the 1 used for the small/large variants.
print(calculate_canonical_queries(adult_reduced, 0.1, queries))

{'age': [0, 1, 2, 3, 4], 'workclass': [0, 1, 2, 3, 4, 5, 6, 7], 'education': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4, 5, 6], 'race': [0, 1, 2, 3, 4], 'sex': [0, 1], 'capital-gain': [0, 1], 'capital-loss': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 21


  0%|                                                                                                                                                   | 0/21 [00:00<?, ?it/s]
16it [00:00, 24663.31it/s]

2it [00:00, 11848.32it/s]

5it [00:00, 7189.41it/s]

32it [00:00, 12771.69it/s]

4it [00:00, 3498.17it/s]

10it [00:00, 14368.98it/s]

5it [00:00, 7910.80it/s]

8it [00:00, 12571.91it/s]

10it [00:00, 11545.02it/s]

2it [00:00, 2532.03it/s]

4it [00:00, 6413.31it/s]

10it [00:00, 7943.76it/s]

40it [00:00, 14415.89it/s]

5it [00:00, 6293.97it/s]

7it [00:00, 6993.84it/s]

2it [00:00, 2146.52it/s]

2it [00:00, 5932.54it/s]

4it [00:00, 4570.20it/s]

35it [00:00, 15903.01it/s]

10it [00:00, 10768.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 243.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 66


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:00<00:00, 24939.10it/s]

130





In [16]:
# NOTE(review): despite the name, nothing is random here — `[0]` always selects the
# first column pair produced by itertools.combinations.
adult_small_random_query = list(itertools.combinations(adult_small.columns, 2))[0]
print(calculate_canonical_queries(adult_small, 1, [adult_small_random_query]))

{'age': [0, 1, 2, 3, 4], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                                                                   | 0/12 [00:00<?, ?it/s]
80it [00:00, 23507.38it/s]

5it [00:00, 12889.69it/s]

2it [00:00, 6000.43it/s]

5it [00:00, 11202.74it/s]

10it [00:00, 20440.08it/s]

16it [00:00, 33387.49it/s]

25it [00:00, 19385.76it/s]

10it [00:00, 22758.02it/s]

10it [00:00, 23121.85it/s]

5it [00:00, 7573.68it/s]

2it [00:00, 8895.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 311.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 86302.55it/s]


Calculating new queries, not_original_clique_queries length: 29


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 15806.99it/s]

117





In [17]:
# NOTE(review): `adult_raw` was already loaded by the data-loading cell near the top;
# this reloads it from the same folder.
adult_raw = get_adult_train_raw(DATASETS_FOLDER)
# Discretize the raw frame with 5 buckets, restricted to the ADULT_COLUMNS_SMALL columns.
adult_discretized = clean_adult_with_discretization(adult_raw, n_buckets=5, columns=ADULT_COLUMNS_SMALL)

In [18]:
# Display the 5-bucket discretized frame (the recorded repr is truncated in the middle).
adult_discretized

Unnamed: 0,age,education-num,marital-status,sex,hours-per-week,compensation
0,"(31.6, 46.2]",13,Never-married,Male,"(20.6, 40.2]",0
1,"(46.2, 60.8]",13,Married,Male,"(0.902, 20.6]",0
2,"(31.6, 46.2]",9,Divorced,Male,"(20.6, 40.2]",0
3,"(46.2, 60.8]",7,Married,Male,"(20.6, 40.2]",0
4,"(16.927, 31.6]",13,Married,Female,"(20.6, 40.2]",0
...,...,...,...,...,...,...
30157,"(16.927, 31.6]",12,Married,Female,"(20.6, 40.2]",0
30158,"(31.6, 46.2]",9,Married,Male,"(20.6, 40.2]",1
30159,"(46.2, 60.8]",9,Widowed,Female,"(20.6, 40.2]",0
30160,"(16.927, 31.6]",9,Never-married,Male,"(0.902, 20.6]",0


In [19]:
# NOTE(review): `adult_reduced_full_marginal_set` and `queries` are assigned here but
# not used by the call at the end of this cell.
adult_reduced_full_marginal_set = list(itertools.combinations(adult_reduced.columns, 2))
queries = [
    ('age', 'compensation'), 
    ('age', 'marital-status'), 
    ('education-num', 'compensation'), 
    ('sex', 'compensation'), 
    ('hours-per-week', 'compensation')
]

# NOTE(review): rebinds `adult_small_random_query` from a single pair (tuple) to the
# full list of column pairs of `adult_small`; the name no longer matches the content.
adult_small_random_query = list(itertools.combinations(adult_small.columns, 2))

# Pairs come from `adult_small` columns but are applied to `adult_discretized`.
print(calculate_canonical_queries(adult_discretized, 1, adult_small_random_query))

{'age': [0, 1, 2, 3, 4], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 22


  0%|                                                                                                                                                   | 0/22 [00:00<?, ?it/s]
5it [00:00, 18741.30it/s]

10it [00:00, 21890.94it/s]

2it [00:00, 3594.09it/s]

25it [00:00, 19908.41it/s]

10it [00:00, 5096.36it/s]

10it [00:00, 9135.93it/s]

32it [00:00, 19231.66it/s]

5it [00:00, 25668.94it/s]

16it [00:00, 36235.89it/s]

10it [00:00, 8366.85it/s]

80it [00:00, 17230.37it/s]

25it [00:00, 19065.02it/s]

80it [00:00, 22949.48it/s]

25it [00:00, 11587.76it/s]

32it [00:00, 18184.22it/s]

5it [00:00, 6417.23it/s]

2it [00:00, 3711.77it/s]

4it [00:00, 14401.04it/s]

10it [00:00, 17697.49it/s]

80it [00:00, 22014.45it/s]

10it [00:00, 15313.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 231.40it/s]
100%|████████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 29


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 14577.52it/s]

312





In [20]:
# NOTE(review): no earlier cell assigns this name, so on a fresh kernel this cell
# raises NameError (see the recorded output).
adult_extra_high_discretization

NameError: name 'adult_extra_high_discretization' is not defined

In [23]:
# Build the 2-bucket frame inside this cell. The assignment used to be commented out,
# which left the name undefined and made the cell fail under Restart & Run All.
adult_extra_high_discretization = clean_adult_with_discretization(adult_raw, n_buckets=2, columns=ADULT_COLUMNS_SMALL)

# NOTE(review): the pair list below is built but not passed to the call, so the
# selection runs without forced cliques — confirm whether that is intended.
adult_extra_high_discretization_full_marginal_set = list(itertools.combinations(adult_extra_high_discretization.columns, 2))
print(calculate_canonical_queries(adult_extra_high_discretization, 1))

[('age', 'education-num'), ('age', 'marital-status'), ('age', 'sex'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'marital-status'), ('education-num', 'sex'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'hours-per-week'), ('marital-status', 'compensation'), ('sex', 'hours-per-week'), ('sex', 'compensation'), ('hours-per-week', 'compensation')]
(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                      | 0/12 [00:00<?, ?it/s]
5it [00:00, 14905.13it/s]

4it [00:00, 11514.90it/s]

2it [00:00, 8160.12it/s]

32it [00:00, 16490.69it/s]

2it [00:00, 10727.12it/s]

10it [00:00, 15147.36it/s]

16it [00:00, 14896.53it/s]

10it [00:00, 9485.08it/s]

10it [00:00, 18001.30it/s]

2it [00:00, 15679.64it/s]

2it [00:00, 22919.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 384.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 111550.64it/s]


Calculating new queries, not_original_clique_queries length: 23


100%|██████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 34440.91it/s]

51





In [25]:
# Run 1: all column pairs of the independence-pruning variant except (age, sex).
adult_independence_pruning_full_marginal_set = list(itertools.combinations(adult_independence_pruning.columns, 2))

# list.remove raises ValueError if the pair is absent, e.g. if the column order changes.
adult_independence_pruning_full_marginal_set.remove(('age', 'sex'))

print(adult_independence_pruning_full_marginal_set)
print(calculate_canonical_queries(adult_independence_pruning, 0.1, adult_independence_pruning_full_marginal_set))


# Run 2: drop the 'marital-status' column and use every remaining column pair
# (here (age, sex) is kept).
adult_independence_pruning_reduced = adult_independence_pruning.drop(columns=['marital-status'])
adult_independence_pruning_reduced_marginal_set = list(itertools.combinations(adult_independence_pruning_reduced.columns, 2))
print(adult_independence_pruning_reduced_marginal_set)
print(calculate_canonical_queries(adult_independence_pruning_reduced, 0.1, adult_independence_pruning_reduced_marginal_set))

[('age', 'education-num'), ('age', 'marital-status'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'marital-status'), ('education-num', 'sex'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'hours-per-week'), ('marital-status', 'compensation'), ('sex', 'hours-per-week'), ('sex', 'compensation'), ('hours-per-week', 'compensation')]
{'age': [0, 1], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1], 'compensation': [0, 1]}


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 21


  0%|                                                                                                                                                   | 0/21 [00:00<?, ?it/s]
5it [00:00, 12663.96it/s]

4it [00:00, 23172.95it/s]

2it [00:00, 19065.02it/s]

10it [00:00, 16953.53it/s]

4it [00:00, 14588.88it/s]

10it [00:00, 13206.25it/s]

32it [00:00, 31220.69it/s]

2it [00:00, 11634.69it/s]

16it [00:00, 38903.69it/s]

4it [00:00, 10305.42it/s]

32it [00:00, 28765.05it/s]

10it [00:00, 31655.12it/s]

80it [00:00, 34803.89it/s]

4it [00:00, 11328.30it/s]

32it [00:00, 31865.56it/s]

2it [00:00, 17549.39it/s]

2it [00:00, 4576.44it/s]

4it [00:00, 11932.59it/s]

32it [00:00, 24179.02it/s]

10it [00:00, 28630.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 444.59it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 23


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 43611.66it/s]


164
[('age', 'education-num'), ('age', 'sex'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'sex'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('sex', 'hours-per-week'), ('sex', 'compensation'), ('hours-per-week', 'compensation')]
{'age': [0, 1], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'sex': [0, 1], 'hours-per-week': [0, 1], 'compensation': [0, 1]}


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 16


  0%|                                                                                                                                                   | 0/16 [00:00<?, ?it/s]
32it [00:00, 19143.88it/s]

32it [00:00, 15019.89it/s]

4it [00:00, 10407.70it/s]

2it [00:00, 8701.88it/s]

4it [00:00, 8612.53it/s]

32it [00:00, 9694.31it/s]

4it [00:00, 7588.07it/s]

4it [00:00, 4169.29it/s]

2it [00:00, 7619.08it/s]

32it [00:00, 13008.11it/s]

4it [00:00, 4467.97it/s]

16it [00:00, 8887.41it/s]

4it [00:00, 11586.48it/s]

2it [00:00, 9543.35it/s]

2it [00:00, 3362.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 227.45it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 79287.41it/s]


Calculating new queries, not_original_clique_queries length: 19


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 20742.26it/s]

85





In [14]:
# NOTE(review): the pair list is built from `adult_low_discretization` but is not used;
# the call below runs on `adult_high_discretization` with an empty clique list —
# confirm which frame and clique set were intended.
adult_low_discretization_full_marginal_set = list(itertools.combinations(adult_low_discretization.columns, 2))

print(calculate_canonical_queries(adult_high_discretization, 0.1, []))

Domain size: 35840
Calculating canonical queries, clique_set length: 18


  0%|                                                                                                      | 0/18 [00:00<?, ?it/s]
2it [00:00, 3562.04it/s]

16it [00:00, 11039.46it/s]

112it [00:00, 18977.22it/s]

10it [00:00, 6951.12it/s]

32it [00:00, 9579.45it/s]

2it [00:00, 8981.38it/s]

10it [00:00, 9029.72it/s]

2it [00:00, 5797.24it/s]

2it [00:00, 901.61it/s]

7it [00:00, 14665.40it/s]

2it [00:00, 8533.68it/s]

10it [00:00, 10991.36it/s]

4it [00:00, 9300.01it/s]

2it [00:00, 7307.15it/s]

32it [00:00, 7254.62it/s]
 83%|████████████████████████████████████████████████████████████████████████████▋               | 15/18 [00:00<00:00, 147.76it/s]
5it [00:00, 9362.29it/s]

4it [00:00, 4436.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 155.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 69470.87it/s]


Calculating new queries, not_original_clique_queries length: 52


100%|██████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 11471.90it/s]

165



