In [1]:
import sys
sys.path.append("/home/jarlehti/projects/gradu")

In [8]:
import os
import itertools
import pandas as pd
import pickle
from src.utils.preprocess_dataset import get_adult_train_small, get_adult_train_large, clean_adult_with_discretization, get_adult_train_no_discretization, get_adult_train_raw, ADULT_COLUMNS_SMALL, ADULT_COLUMNS_LARGE, get_adult_train_high_discretization, get_adult_train_independence_pruning, get_adult_train_low_discretization
from src.napsu_mq.dataframe_data import DataFrameData
from src.napsu_mq.mst import MST_selection, Domain, Dataset
from src.napsu_mq.marginal_query import FullMarginalQuerySet
from src.napsu_mq.marginal_query_torch import FullMarginalQuerySet as FullMarginalQuerySetTorch
from src.utils.query_utils import calculate_query_number

In [9]:
CURRENT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
DATASETS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "datasets")
ORIG_RESULTS_FOLDER = os.path.join(CURRENT_FOLDER, "data", "orig_results")

In [10]:
adult_small = get_adult_train_small(DATASETS_FOLDER)
adult_large = get_adult_train_large(DATASETS_FOLDER)
adult_no_discretization = get_adult_train_no_discretization(DATASETS_FOLDER)
adult_high_discretization = get_adult_train_high_discretization(DATASETS_FOLDER)
adult_raw = get_adult_train_raw(DATASETS_FOLDER)
adult_independence_pruning = get_adult_train_independence_pruning(DATASETS_FOLDER)
adult_low_discretization = get_adult_train_low_discretization(DATASETS_FOLDER)

In [18]:
adult_small.dtypes

age               category
education-num     category
marital-status    category
sex               category
hours-per-week    category
compensation      category
dtype: object

In [12]:
adult_reduced = pd.read_csv(os.path.join(DATASETS_FOLDER, "adult-reduced-discretised-copy.csv"), dtype="category")

In [13]:
adult_high_discretization

Unnamed: 0,age,workclass,education-num,marital-status,sex,hours-per-week,had-capital-gains,had-capital-losses,compensation
0,"(16.927, 53.5]",State-gov,13,Never-married,Male,"(0.902, 50.0]",1,0,0
1,"(16.927, 53.5]",Self-emp-not-inc,13,Married,Male,"(0.902, 50.0]",0,0,0
2,"(16.927, 53.5]",Private,9,Divorced,Male,"(0.902, 50.0]",0,0,0
3,"(16.927, 53.5]",Private,7,Married,Male,"(0.902, 50.0]",0,0,0
4,"(16.927, 53.5]",Private,13,Married,Female,"(0.902, 50.0]",0,0,0
...,...,...,...,...,...,...,...,...,...
30157,"(16.927, 53.5]",Private,12,Married,Female,"(0.902, 50.0]",0,0,0
30158,"(16.927, 53.5]",Private,9,Married,Male,"(0.902, 50.0]",0,0,1
30159,"(53.5, 90.0]",Private,9,Widowed,Female,"(0.902, 50.0]",0,0,0
30160,"(16.927, 53.5]",Private,9,Never-married,Male,"(0.902, 50.0]",0,0,0


In [16]:
def calculate_canonical_queries(data, epsilon, column_feature_set=[]):
    dataframe = DataFrameData(data)
    n, d = dataframe.int_array.shape
    print(dataframe.values_by_col)
    delta = (n ** (-2))
    domain_key_list = list(dataframe.values_by_col.keys())
    domain_value_count_list = [len(dataframe.values_by_col[key]) for key in domain_key_list]
    
    domain = Domain(domain_key_list, domain_value_count_list)

    query_sets = MST_selection(Dataset(dataframe.int_df, domain), epsilon, delta,
                                   cliques_to_include=column_feature_set)

    queries = FullMarginalQuerySet(query_sets, dataframe.values_by_col)
    
    query_list = queries.flatten()
    queries = queries.get_canonical_queries()
    query_number = calculate_query_number(queries.queries)
    return query_number

In [17]:
print(calculate_canonical_queries(adult_small, 1))

age               int8
education-num     int8
marital-status    int8
sex               int8
hours-per-week    int8
compensation      int8
dtype: object
(30162, 6)
{'age': [0, 1, 2, 3, 4], 'education-num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'marital-status': [0, 1, 2, 3, 4], 'sex': [0, 1], 'hours-per-week': [0, 1, 2, 3, 4], 'compensation': [0, 1]}
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                                                                   | 0/12 [00:00<?, ?it/s]
5it [00:00, 14354.22it/s]

2it [00:00, 6732.43it/s]

32it [00:00, 16710.37it/s]

5it [00:00, 8224.13it/s]

10it [00:00, 7516.67it/s]

16it [00:00, 17449.00it/s]

25it [00:00, 10055.39it/s]

10it [00:00, 11966.63it/s]

10it [00:00, 6823.33it/s]

5it [00:00, 6840.03it/s]

2it [00:00, 2201.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 222.71it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 19840.61it/s]


Calculating new queries, not_original_clique_queries length: 29


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 27296.86it/s]

72





In [10]:
query = [('education-num', 'compensation')]
print(calculate_canonical_queries(adult_large, 1, query))

(30162, 9)
Dataframe data n: 30162
Dataframe data d: 9
Calculating canonical queries, clique_set length: 18


  0%|                                                                                                      | 0/18 [00:00<?, ?it/s]
35it [00:00, 15600.49it/s]

2it [00:00, 6326.25it/s]

16it [00:00, 12534.34it/s]

10it [00:00, 18078.90it/s]

5it [00:00, 14675.66it/s]

4it [00:00, 10845.00it/s]

25it [00:00, 16557.33it/s]

2it [00:00, 2713.88it/s]

2it [00:00, 5928.34it/s]

7it [00:00, 10691.96it/s]

10it [00:00, 9747.39it/s]

2it [00:00, 3849.75it/s]

4it [00:00, 5157.46it/s]

5it [00:00, 13521.29it/s]

32it [00:00, 17117.42it/s]

5it [00:00, 17863.30it/s]

10it [00:00, 18741.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 272.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 62484.98it/s]


Calculating new queries, not_original_clique_queries length: 62


100%|██████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 35065.65it/s]

106





In [11]:
adult_small_full_marginal_set = list(itertools.combinations(adult_small.columns, 2))
print(calculate_canonical_queries(adult_small, 1, adult_small_full_marginal_set))

(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 22


  0%|                                                                                                      | 0/22 [00:00<?, ?it/s]
5it [00:00, 15615.43it/s]

10it [00:00, 11775.14it/s]

2it [00:00, 2892.62it/s]

25it [00:00, 11138.47it/s]

10it [00:00, 11090.17it/s]

10it [00:00, 7879.59it/s]

32it [00:00, 12146.40it/s]

5it [00:00, 9515.21it/s]

16it [00:00, 20008.61it/s]

10it [00:00, 11583.28it/s]

80it [00:00, 21300.34it/s]

25it [00:00, 17921.31it/s]

80it [00:00, 23104.34it/s]

25it [00:00, 15839.52it/s]

32it [00:00, 17836.24it/s]

5it [00:00, 20010.99it/s]

2it [00:00, 2441.39it/s]

4it [00:00, 5475.59it/s]

10it [00:00, 10425.81it/s]

80it [00:00, 21258.51it/s]

10it [00:00, 8696.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 242.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 293993.27it/s]


Calculating new queries, not_original_clique_queries length: 29


100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 16919.57it/s]

312





In [25]:
adult_no_discretization.dtypes

age                      int64
workclass             category
education-num         category
marital-status        category
sex                   category
hours-per-week           int64
had-capital-gains        int64
had-capital-losses       int64
compensation          category
dtype: object

In [7]:
queries = [('age', 'marital-status'), ('age', 'hours-per-week'), ('age', 'workclass'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'compensation'), ('had-capital-gains', 'compensation'), ('had-capital-gains', 'had-capital-losses')]
#adult_no_discretization = adult_no_discretization.astype("category")
print(calculate_canonical_queries(adult_no_discretization, 1, queries))

ValueError: DataFrame contains unsupported column type: int64

In [18]:
#adult_reduced_full_marginal_set = list(itertools.combinations(adult_reduced.columns, 2))
queries = [
    ('age', 'compensation'), 
    ('age', 'marital-status'), 
    ('age', 'workclass'), 
    ('education', 'compensation'), 
    ('race', 'compensation'), 
    ('race', 'sex'), 
    ('sex', 'compensation'), 
    ('capital-gain', 'compensation'), 
    ('capital-loss', 'compensation'), 
    ('hours-per-week', 'compensation')
]
print(calculate_canonical_queries(adult_reduced, 0.1, queries))

Domain size: 1792000


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 21


  0%|                                                                                                      | 0/21 [00:00<?, ?it/s]
16it [00:00, 27103.74it/s]

2it [00:00, 12409.18it/s]

5it [00:00, 10412.87it/s]

32it [00:00, 14903.15it/s]

4it [00:00, 3037.15it/s]

10it [00:00, 10885.81it/s]

5it [00:00, 20184.33it/s]

8it [00:00, 25285.93it/s]

10it [00:00, 20281.93it/s]

2it [00:00, 13231.24it/s]

4it [00:00, 6168.09it/s]

10it [00:00, 7381.74it/s]

40it [00:00, 21862.41it/s]

5it [00:00, 6391.81it/s]

7it [00:00, 9209.58it/s]

2it [00:00, 12300.01it/s]

2it [00:00, 1991.60it/s]

4it [00:00, 4611.66it/s]

35it [00:00, 10223.60it/s]

10it [00:00, 7999.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 287.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 199728.76it/s]


Calculating new queries, not_original_clique_queries length: 66


100%|██████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:00<00:00, 41721.79it/s]

130





In [None]:
adult_small_random_query = list(itertools.combinations(adult_small.columns, 2))[0]
print(calculate_canonical_queries(adult_small, 1, [adult_small_random_query]))

In [27]:
adult_raw = get_adult_train_raw(DATASETS_FOLDER)
adult_discretized = clean_adult_with_discretization(adult_raw, n_buckets=5, columns=ADULT_COLUMNS_SMALL)

In [32]:
adult_discretized

Unnamed: 0,age,education-num,marital-status,sex,hours-per-week,compensation
0,"(31.6, 46.2]",13,Never-married,Male,"(20.6, 40.2]",0
1,"(46.2, 60.8]",13,Married-civ-spouse,Male,"(0.902, 20.6]",0
2,"(31.6, 46.2]",9,Divorced,Male,"(20.6, 40.2]",0
3,"(46.2, 60.8]",7,Married-civ-spouse,Male,"(20.6, 40.2]",0
4,"(16.927, 31.6]",13,Married-civ-spouse,Female,"(20.6, 40.2]",0
...,...,...,...,...,...,...
30157,"(16.927, 31.6]",12,Married-civ-spouse,Female,"(20.6, 40.2]",0
30158,"(31.6, 46.2]",9,Married-civ-spouse,Male,"(20.6, 40.2]",1
30159,"(46.2, 60.8]",9,Widowed,Female,"(20.6, 40.2]",0
30160,"(16.927, 31.6]",9,Never-married,Male,"(0.902, 20.6]",0


In [38]:
adult_reduced_full_marginal_set = list(itertools.combinations(adult_reduced.columns, 2))
queries = [
    ('age', 'compensation'), 
    ('age', 'marital-status'), 
    ('education-num', 'compensation'), 
    ('sex', 'compensation'), 
    ('hours-per-week', 'compensation')
]

adult_small_random_query = list(itertools.combinations(adult_small.columns, 2))

print(calculate_canonical_queries(adult_discretized, 1, adult_small_random_query))

(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 22


  0%|                                                                                                      | 0/22 [00:00<?, ?it/s]
7it [00:00, 10660.90it/s]

10it [00:00, 12282.00it/s]

2it [00:00, 13508.23it/s]

35it [00:00, 20612.28it/s]

10it [00:00, 12572.85it/s]

14it [00:00, 11031.42it/s]

32it [00:00, 10802.23it/s]

5it [00:00, 8863.70it/s]

16it [00:00, 12758.34it/s]

10it [00:00, 10562.34it/s]

80it [00:00, 22454.95it/s]

35it [00:00, 17650.67it/s]

112it [00:00, 14505.54it/s]

25it [00:00, 12188.49it/s]

32it [00:00, 12644.16it/s]

5it [00:00, 8341.89it/s]

2it [00:00, 14217.98it/s]

4it [00:00, 16810.84it/s]

10it [00:00, 12505.38it/s]

80it [00:00, 10841.50it/s]

14it [00:00, 9829.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 239.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 95469.74it/s]


Calculating new queries, not_original_clique_queries length: 31


100%|██████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 15523.33it/s]

364





In [20]:
adult_extra_high_discretization

Unnamed: 0,age,education-num,marital-status,sex,hours-per-week,compensation
0,"(16.927, 53.5]",13,Never-married,Male,"(0.902, 50.0]",0
1,"(16.927, 53.5]",13,Married,Male,"(0.902, 50.0]",0
2,"(16.927, 53.5]",9,Divorced,Male,"(0.902, 50.0]",0
3,"(16.927, 53.5]",7,Married,Male,"(0.902, 50.0]",0
4,"(16.927, 53.5]",13,Married,Female,"(0.902, 50.0]",0
...,...,...,...,...,...,...
30157,"(16.927, 53.5]",12,Married,Female,"(0.902, 50.0]",0
30158,"(16.927, 53.5]",9,Married,Male,"(0.902, 50.0]",1
30159,"(53.5, 90.0]",9,Widowed,Female,"(0.902, 50.0]",0
30160,"(16.927, 53.5]",9,Never-married,Male,"(0.902, 50.0]",0


In [23]:
#adult_extra_high_discretization = clean_adult_with_discretization(adult_raw, n_buckets=2, columns=ADULT_COLUMNS_SMALL)

adult_extra_high_discretization_full_marginal_set = list(itertools.combinations(adult_extra_high_discretization.columns, 2))
print(calculate_canonical_queries(adult_extra_high_discretization, 1))

[('age', 'education-num'), ('age', 'marital-status'), ('age', 'sex'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'marital-status'), ('education-num', 'sex'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'hours-per-week'), ('marital-status', 'compensation'), ('sex', 'hours-per-week'), ('sex', 'compensation'), ('hours-per-week', 'compensation')]
(30162, 6)
Dataframe data n: 30162
Dataframe data d: 6
Calculating canonical queries, clique_set length: 12


  0%|                                                                                                      | 0/12 [00:00<?, ?it/s]
5it [00:00, 14905.13it/s]

4it [00:00, 11514.90it/s]

2it [00:00, 8160.12it/s]

32it [00:00, 16490.69it/s]

2it [00:00, 10727.12it/s]

10it [00:00, 15147.36it/s]

16it [00:00, 14896.53it/s]

10it [00:00, 9485.08it/s]

10it [00:00, 18001.30it/s]

2it [00:00, 15679.64it/s]

2it [00:00, 22919.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 384.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 111550.64it/s]


Calculating new queries, not_original_clique_queries length: 23


100%|██████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 34440.91it/s]

51





In [11]:
adult_independence_pruning_full_marginal_set = list(itertools.combinations(adult_independence_pruning.columns, 2))

adult_independence_pruning_full_marginal_set.remove(('age', 'sex'))

print(adult_independence_pruning_full_marginal_set)
print(calculate_canonical_queries(adult_independence_pruning, 0.1, adult_independence_pruning_full_marginal_set))


adult_independence_pruning_reduced = adult_independence_pruning.drop(columns=['sex'])
adult_independence_pruning_reduced_marginal_set = list(itertools.combinations(adult_independence_pruning_reduced.columns, 2))
print(adult_independence_pruning_reduced_marginal_set)
print(calculate_canonical_queries(adult_independence_pruning_reduced, 0.1, adult_independence_pruning_reduced_marginal_set))

[('age', 'education-num'), ('age', 'marital-status'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'marital-status'), ('education-num', 'sex'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('marital-status', 'sex'), ('marital-status', 'hours-per-week'), ('marital-status', 'compensation'), ('sex', 'hours-per-week'), ('sex', 'compensation'), ('hours-per-week', 'compensation')]
Domain size: 1280


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 21


  0%|                                                                                                      | 0/21 [00:00<?, ?it/s]
5it [00:00, 12921.45it/s]

4it [00:00, 21024.08it/s]

2it [00:00, 16384.00it/s]

10it [00:00, 20784.46it/s]

4it [00:00, 5529.74it/s]

10it [00:00, 27130.04it/s]

32it [00:00, 18716.74it/s]

2it [00:00, 5105.67it/s]

16it [00:00, 42286.62it/s]

4it [00:00, 26420.81it/s]

32it [00:00, 32561.31it/s]

10it [00:00, 15330.06it/s]

80it [00:00, 18943.39it/s]

4it [00:00, 20636.18it/s]

32it [00:00, 31410.65it/s]

2it [00:00, 15592.21it/s]

2it [00:00, 8710.91it/s]

4it [00:00, 7112.00it/s]

32it [00:00, 24407.66it/s]

10it [00:00, 27112.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 405.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 367001.60it/s]


Calculating new queries, not_original_clique_queries length: 23


100%|██████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 27075.22it/s]


164
[('age', 'education-num'), ('age', 'marital-status'), ('age', 'hours-per-week'), ('age', 'compensation'), ('education-num', 'marital-status'), ('education-num', 'hours-per-week'), ('education-num', 'compensation'), ('marital-status', 'hours-per-week'), ('marital-status', 'compensation'), ('hours-per-week', 'compensation')]
Domain size: 640


  epsilon = np.sqrt(8 * rho / (r - 1))


Calculating canonical queries, clique_set length: 16


  0%|                                                                                                      | 0/16 [00:00<?, ?it/s]
32it [00:00, 9323.91it/s]

32it [00:00, 14272.41it/s]

10it [00:00, 8878.71it/s]

5it [00:00, 5675.65it/s]

4it [00:00, 4647.43it/s]

80it [00:00, 19329.70it/s]

4it [00:00, 3984.14it/s]

4it [00:00, 2609.21it/s]

2it [00:00, 2162.57it/s]

32it [00:00, 8245.35it/s]

10it [00:00, 7839.82it/s]

16it [00:00, 9660.12it/s]

10it [00:00, 8420.61it/s]

2it [00:00, 7503.23it/s]

2it [00:00, 7175.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 196.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 43736.23it/s]


Calculating new queries, not_original_clique_queries length: 22


100%|██████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 24718.64it/s]

142





In [14]:
adult_low_discretization_full_marginal_set = list(itertools.combinations(adult_low_discretization.columns, 2))

print(calculate_canonical_queries(adult_high_discretization, 0.1, []))

Domain size: 35840
Calculating canonical queries, clique_set length: 18


  0%|                                                                                                      | 0/18 [00:00<?, ?it/s]
2it [00:00, 3562.04it/s]

16it [00:00, 11039.46it/s]

112it [00:00, 18977.22it/s]

10it [00:00, 6951.12it/s]

32it [00:00, 9579.45it/s]

2it [00:00, 8981.38it/s]

10it [00:00, 9029.72it/s]

2it [00:00, 5797.24it/s]

2it [00:00, 901.61it/s]

7it [00:00, 14665.40it/s]

2it [00:00, 8533.68it/s]

10it [00:00, 10991.36it/s]

4it [00:00, 9300.01it/s]

2it [00:00, 7307.15it/s]

32it [00:00, 7254.62it/s]
 83%|████████████████████████████████████████████████████████████████████████████▋               | 15/18 [00:00<00:00, 147.76it/s]
5it [00:00, 9362.29it/s]

4it [00:00, 4436.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 155.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 69470.87it/s]


Calculating new queries, not_original_clique_queries length: 52


100%|██████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 11471.90it/s]

165



