In [1]:
import os
import sys

# Directory added to sys.path so the project's ``src`` package can be imported
# (presumably it lives directly under this folder — verify for your checkout).
# Defaults to the original location; set GRADU_ROOT to run the notebook elsewhere.
PROJECT_ROOT = os.environ.get("GRADU_ROOT", "/home/jarlehti/projects/gradu")

# Guard so that re-running this cell does not stack duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [27]:
import os
import itertools
import pandas as pd
import pickle
from src.utils.preprocess_dataset import get_adult_train_small, get_adult_train_large, get_adult_train_no_discretization
from src.napsu_mq.dataframe_data import DataFrameData
from src.napsu_mq.mst import MST_selection, Domain, Dataset
from src.napsu_mq.marginal_query import FullMarginalQuerySet
from src.utils.query_utils import calculate_query_number

In [28]:
# Notebooks have no ``__file__``, so paths are anchored on the kernel's working
# directory: CURRENT_FOLDER is its parent (assumes the kernel is started in the
# notebook's own folder, one level below the project root — TODO confirm).
CURRENT_FOLDER = os.path.dirname(os.getcwd())
DATASETS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "datasets")
ORIG_RESULTS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "orig_results")

In [8]:
# Load the three Adult training variants from DATASETS_FOLDER in one step;
# the loaders are still called in the order small, large, no-discretization.
adult_small, adult_large, adult_no_discretization = (
    get_adult_train_small(DATASETS_FOLDER),
    get_adult_train_large(DATASETS_FOLDER),
    get_adult_train_no_discretization(DATASETS_FOLDER),
)

In [14]:
# Read the reduced, discretised Adult CSV with every column typed as a pandas categorical.
adult_reduced_csv_path = os.path.join(DATASETS_FOLDER, "adult-reduced-discretised-copy.csv")
adult_reduced = pd.read_csv(adult_reduced_csv_path, dtype="category")

In [31]:
# Preview the reduced dataset; the bare expression renders as a truncated table.
adult_reduced

Unnamed: 0,age,workclass,education,marital-status,race,sex,capital-gain,capital-loss,hours-per-week,compensation
0,"(31.6, 46.2]",State-gov,Bachelors,Never-married,White,Male,True,False,"(20.6, 40.2]",False
1,"(46.2, 60.8]",Self-emp-not-inc,Bachelors,Married-civ-spouse,White,Male,False,False,"(0.902, 20.6]",False
2,"(31.6, 46.2]",Private,HS-grad,Divorced,White,Male,False,False,"(20.6, 40.2]",False
3,"(46.2, 60.8]",Private,11th,Married-civ-spouse,Black,Male,False,False,"(20.6, 40.2]",False
4,"(16.927, 31.6]",Private,Bachelors,Married-civ-spouse,Black,Female,False,False,"(20.6, 40.2]",False
...,...,...,...,...,...,...,...,...,...,...
46038,"(31.6, 46.2]",Private,Bachelors,Never-married,White,Male,False,False,"(20.6, 40.2]",False
46039,"(31.6, 46.2]",Private,Bachelors,Divorced,White,Female,False,False,"(20.6, 40.2]",False
46040,"(31.6, 46.2]",Private,Bachelors,Married-civ-spouse,White,Male,False,False,"(40.2, 59.8]",False
46041,"(31.6, 46.2]",Private,Bachelors,Divorced,Asian-Pac-Islander,Male,True,False,"(20.6, 40.2]",False


In [6]:
def calculate_canonical_queries(data, epsilon, column_feature_set=None):
    """Return the canonical-query count that MST selection produces for ``data``.

    The data is wrapped in ``DataFrameData``, and a ``Domain`` is built from
    the number of values per column. ``MST_selection`` is then run with
    ``delta = n ** -2``, where ``n`` is the first dimension of ``int_array``.
    The selected sets are turned into a ``FullMarginalQuerySet``, whose
    canonical queries are passed to ``calculate_query_number``.

    Args:
        data: Tabular data accepted by ``DataFrameData``.
        epsilon: Forwarded unchanged to ``MST_selection``.
        column_feature_set: Optional cliques forwarded to ``MST_selection`` as
            ``cliques_to_include``. ``None`` (the default) means no extra
            cliques; a fresh empty list is created for each call.

    Returns:
        The value returned by ``calculate_query_number`` for the canonical
        queries.

    Raises:
        ZeroDivisionError: If ``int_array`` has zero rows (``0 ** -2``),
            assuming the shape entries are plain ints.
    """
    # A literal ``[]`` default is one object shared by every call, so any
    # in-place change made downstream would leak into later calls. An
    # explicitly supplied list is still passed through untouched.
    cliques = [] if column_feature_set is None else column_feature_set

    dataframe = DataFrameData(data)
    # Unpacking also checks that int_array is two-dimensional; only the row
    # count is needed.
    n_rows, _ = dataframe.int_array.shape
    delta = n_rows ** (-2)

    column_names = list(dataframe.values_by_col.keys())
    column_value_counts = [len(dataframe.values_by_col[name]) for name in column_names]
    domain = Domain(column_names, column_value_counts)

    query_sets = MST_selection(Dataset(dataframe.int_df, domain), epsilon, delta,
                               cliques_to_include=cliques)

    full_query_set = FullMarginalQuerySet(query_sets, dataframe.values_by_col)
    # NOTE(review): the result of flatten() is never used; the call is kept in
    # case it has side effects — confirm and remove if it is pure.
    full_query_set.flatten()
    canonical_query_set = full_query_set.get_canonical_queries()
    return calculate_query_number(canonical_query_set.queries)

In [24]:
# Query count for adult_small with epsilon=1 and no extra cliques.
print(calculate_canonical_queries(adult_small, 1))

(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                                                                   | 0/12 [00:00<?, ?it/s]
70it [00:00, 21048.20it/s]

7it [00:00, 18929.81it/s]

2it [00:00, 13706.88it/s]

32it [00:00, 16555.78it/s]

10it [00:00, 19953.87it/s]

14it [00:00, 19638.88it/s]

16it [00:00, 23077.33it/s]

70it [00:00, 35305.59it/s]

14it [00:00, 35437.69it/s]

10it [00:00, 48827.75it/s]

2it [00:00, 26214.40it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 375.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 139810.13it/s]


Calculating new queries, not_original_clique_queries length: 41


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 12532.17it/s]

176





In [19]:
# Query count for adult_large with epsilon=1 and no extra cliques.
print(calculate_canonical_queries(adult_large, 1))

(30162, 9)
Dataframe data n: 30162
Dataframe data d: 9
Calculating canonical queries, clique_set length: 18


  0%|                                                           | 0/18 [00:00<?, ?it/s]
70it [00:00, 38110.24it/s]

2it [00:00, 6442.86it/s]

16it [00:00, 24555.02it/s]

14it [00:00, 19411.65it/s]

10it [00:00, 12725.44it/s]

4it [00:00, 14463.12it/s]

70it [00:00, 30134.59it/s]

2it [00:00, 19784.45it/s]

2it [00:00, 13595.80it/s]

7it [00:00, 19077.41it/s]

14it [00:00, 18887.18it/s]

2it [00:00, 18600.02it/s]

4it [00:00, 10280.16it/s]

10it [00:00, 44667.77it/s]

32it [00:00, 32498.24it/s]

7it [00:00, 18736.52it/s]

70it [00:00, 32239.08it/s]
100%|█████████████████████████████████████████████████| 18/18 [00:00<00:00, 393.02it/s]
100%|█████████████████████████████████████████████████| 8/8 [00:00<00:00, 70640.91it/s]


Calculating new queries, not_original_clique_queries length: 72


100%|███████████████████████████████████████████████| 72/72 [00:00<00:00, 39798.35it/s]

240





In [17]:
# Pass every 2-column combination of adult_small as cliques_to_include.
adult_small_full_marginal_set = list(itertools.combinations(adult_small.columns, 2))
print(calculate_canonical_queries(adult_small, 1, adult_small_full_marginal_set))

(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 22


  0%|                                                                                                                                                   | 0/22 [00:00<?, ?it/s]
7it [00:00, 17311.40it/s]

20it [00:00, 35925.52it/s]

2it [00:00, 16131.94it/s]

70it [00:00, 36866.06it/s]

20it [00:00, 33091.16it/s]

14it [00:00, 23100.02it/s]

32it [00:00, 27120.17it/s]

10it [00:00, 29310.30it/s]

16it [00:00, 42500.86it/s]

20it [00:00, 23431.87it/s]

160it [00:00, 48604.96it/s]

70it [00:00, 26793.33it/s]

112it [00:00, 24312.29it/s]

100it [00:00, 38109.25it/s]

32it [00:00, 32768.00it/s]

10it [00:00, 26664.36it/s]

2it [00:00, 8962.19it/s]

4it [00:00, 23431.87it/s]

20it [00:00, 23636.54it/s]

160it [00:00, 28615.41it/s]

14it [00:00, 16653.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 285.32it/s]
100%|█████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 41


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 18675.77it/s]

669





In [7]:
# NOTE(review): the recorded run of this cell failed with
# "IndexError: index 72 is out of bounds for axis 0 with size 72".
# The cause is not visible in this notebook; fix it or drop the cell so that
# Restart & Run All completes.
print(calculate_canonical_queries(adult_no_discretization, 1))

(30162, 9)
Dataframe data n: 30162
Dataframe data d: 9
Calculating canonical queries, clique_set length: 18


  0%|                                                                                                                                                   | 0/18 [00:00<?, ?it/s]
385it [00:00, 35706.86it/s]
  0%|                                                                                                                                                   | 0/18 [00:00<?, ?it/s]


IndexError: index 72 is out of bounds for axis 0 with size 72

In [34]:
# Hand-picked 2-column cliques for adult_reduced, used instead of every column
# pair (i.e. instead of itertools.combinations(adult_reduced.columns, 2)).
queries = [
    ('age', 'compensation'), 
    ('age', 'marital-status'), 
    ('age', 'workclass'), 
    ('education', 'compensation'), 
    ('race', 'compensation'), 
    ('race', 'sex'), 
    ('sex', 'compensation'), 
    ('capital-gain', 'compensation'), 
    ('capital-loss', 'compensation'), 
    ('hours-per-week', 'compensation')
]
print(calculate_canonical_queries(adult_reduced, 1, queries))

(46043, 10)
Dataframe data n: 46043
Dataframe data d: 10


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 21


  0%|                                                                                                                                                   | 0/21 [00:00<?, ?it/s]
16it [00:00, 42500.86it/s]

2it [00:00, 19599.55it/s]

5it [00:00, 8375.21it/s]

32it [00:00, 35544.95it/s]

4it [00:00, 10852.02it/s]

10it [00:00, 33743.40it/s]

5it [00:00, 39794.16it/s]

8it [00:00, 13634.47it/s]

10it [00:00, 13929.94it/s]

2it [00:00, 4731.31it/s]

4it [00:00, 12915.49it/s]

10it [00:00, 15984.39it/s]

40it [00:00, 31583.61it/s]

5it [00:00, 28728.11it/s]

7it [00:00, 24818.37it/s]

2it [00:00, 18766.46it/s]

2it [00:00, 3442.19it/s]

4it [00:00, 8834.76it/s]

35it [00:00, 20989.51it/s]

10it [00:00, 15065.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 427.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████

Calculating new queries, not_original_clique_queries length: 66


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:00<00:00, 33248.15it/s]

130





In [40]:
# NOTE: despite the name, this pair is not random — it is always the first
# 2-column combination that itertools.combinations yields for adult_small.columns.
adult_small_random_query = list(itertools.combinations(adult_small.columns, 2))[0]
print(calculate_canonical_queries(adult_small, 1, [adult_small_random_query]))

(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                                                                   | 0/12 [00:00<?, ?it/s]
160it [00:00, 32814.47it/s]

70it [00:00, 20687.80it/s]

7it [00:00, 9332.53it/s]

2it [00:00, 13066.37it/s]

10it [00:00, 11149.13it/s]

14it [00:00, 13107.20it/s]

16it [00:00, 24412.10it/s]

70it [00:00, 21566.13it/s]

14it [00:00, 19071.21it/s]

10it [00:00, 9406.38it/s]

2it [00:00, 10538.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 312.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 104857.60it/s]


Calculating new queries, not_original_clique_queries length: 41


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 17730.33it/s]

296



