Using HoloClean to clean the adult data from the fairness_and_repair_exploratory_analysis notebook.

Based on https://github.com/HoloClean/holoclean/blob/master/examples/holoclean_repair_example.py.

In [2]:
import sys
# Put the `holoclean/` directory on the import path before importing from it.
# The path is relative, so it is resolved against the notebook's working
# directory (presumably the folder holding a local HoloClean checkout -- confirm).
sys.path.append('holoclean/')
import holoclean

In [5]:
# End-to-end HoloClean run on `adult.csv`: configure a session, load the data
# and denial constraints, detect erroneous cells, then repair them.
from detect import NullDetector, ViolationDetector
# Import only the featurizers used below rather than `import *`, so the origin
# of every name is explicit and nothing else leaks into the notebook namespace.
from repair.featurize import (
    ConstraintFeaturizer,
    FreqFeaturizer,
    InitAttrFeaturizer,
    OccurAttrFeaturizer,
)

# 1. Setup a HoloClean session.
# `.session` is taken immediately, so `hc` is the session object that every
# later step is called on (not the HoloClean instance itself).
hc = holoclean.HoloClean(
    db_name='holo',
    domain_thresh_1=0,
    domain_thresh_2=0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    epochs=10,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    feature_norm=False,
    weight_norm=False,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
# Both paths are relative to the notebook's working directory.
hc.load_data('adult', 'adult.csv')
hc.load_dcs('holoclean/testdata/adult_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

# Kept as the last expression so the notebook displays its return value
# (the featurizer-weight summary, since `print_fw=True`).
hc.repair_errors(featurizers)

15:22:26 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
15:22:27 - [ INFO] - Loaded 32392 rows with 421096 cells
15:22:32 - [DEBUG] - Time to create index: 0.00 secs
15:22:32 - [DEBUG] - Time to create index: 0.00 secs
15:22:32 - [DEBUG] - Time to create index: 0.00 secs
15:22:33 - [DEBUG] - Time to create index: 0.00 secs
15:22:33 - [DEBUG] - Time to create index: 0.00 secs
15:22:33 - [DEBUG] - Time to create index: 0.00 secs
15:22:33 -

67826it [00:20, 3335.14it/s]7 [00:20<00:27, 3339.99it/s][A
68165it [00:20, 3343.74it/s]7 [00:20<00:27, 3348.93it/s][A
68501it [00:20, 3340.58it/s]7 [00:20<00:27, 3342.18it/s][A
68843it [00:20, 3356.35it/s]7 [00:20<00:26, 3353.40it/s][A
69179it [00:21, 2864.76it/s]7 [00:20<00:31, 2868.37it/s][A
69516it [00:21, 2997.07it/s]7 [00:21<00:29, 2996.21it/s][A
69847it [00:21, 3080.25it/s]7 [00:21<00:28, 3083.80it/s][A
70175it [00:21, 3125.84it/s]7 [00:21<00:28, 3123.43it/s][A
70497it [00:21, 3153.41it/s]7 [00:21<00:28, 3148.11it/s][A
70822it [00:21, 3168.36it/s]7 [00:21<00:27, 3177.59it/s][A
71148it [00:21, 3193.61it/s]7 [00:21<00:27, 3193.11it/s][A
71491it [00:21, 3249.34it/s]7 [00:21<00:26, 3245.19it/s][A
71824it [00:21, 3273.12it/s]7 [00:21<00:26, 3260.15it/s][A
72164it [00:21, 3295.53it/s]7 [00:21<00:26, 3295.75it/s][A
72495it [00:22, 3289.29it/s]7 [00:21<00:26, 3297.25it/s][A
72829it [00:22, 3294.76it/s]7 [00:22<00:26, 3296.30it/s][A
73176it [00:22, 3338.17it/s]7 [00:22<00:

156987it [00:48, 2877.38it/s]7 [00:47<00:00, 3049.53it/s][A
157278it [00:48, 2884.40it/s]7 [00:48<00:00, 2874.08it/s][A
157611it [00:48, 2998.89it/s]7 [00:48<00:00, 2886.16it/s][A
157914it [00:48, 3002.74it/s]7 [00:48<00:00, 2997.93it/s][A
158216it [00:48, 2959.53it/s]7 [00:48<00:00, 2986.87it/s][A
158514it [00:48, 2884.95it/s]7 [00:48<00:00, 2963.14it/s][A
158809it [00:48, 2893.18it/s]7 [00:48<00:00, 2887.33it/s][A
100%|█████████▉| 158809/158857 [00:48<00:00, 2896.17it/s][A
158857it [00:48, 3244.95it/s]7 [00:48<00:00, 3263.25it/s][A
15:24:09 - [DEBUG] - DONE assembling cell domain table in 52.63s
15:24:09 - [ INFO] - number of (additional) weak labels assigned from posterior model: 391
15:24:09 - [DEBUG] - DONE generating domain and weak labels
15:24:37 - [DEBUG] - Time to create index: 0.00 secs
15:24:37 - [DEBUG] - Time to create index: 0.00 secs
15:24:37 - [DEBUG] - Time to create index: 0.00 secs
15:24:42 - [DEBUG] - Time to create table: 0.00 secs
15:24:55 - [DEBUG] - Ti

15:41:03 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nAge 1.0\nWorkclass 1.0\nEducation 1.0\nMaritalstatus 1.0\nOccupation 1.0\nRelationship 1.0\nRace 1.0\nSex 1.0\nHoursPerWeek 1.0\nCountry 1.0\nIncome 1.0\nIncomeBinary 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 0.4481,min -0.0054,avg 0.0938,abs_avg 0.0939,weights:\nAge X Age 0.0\nAge X Workclass 0.0\nAge X Education -0.0\nAge X Maritalstatus 0.0\nAge X Occupation -0.0\nAge X Relationship -0.0\nAge X Race -0.0\nAge X Sex -0.0\nAge X HoursPerWeek -0.0\nAge X Country -0.0\nAge X Income -0.0\nAge X IncomeBinary -0.0\nWorkclass X Age 0.213\nWorkclass X Workclass 0.0\nWorkclass X Education 0.219\nWorkclass X Maritalstatus 0.214\nWorkclass X Occupation 0.189\nWorkclass X Relationship 0.215\nWorkclass X Race 0.213\nWorkclass X Sex 0.213\nWorkclass X HoursPerWeek 0.223\nWorkclass X Country 0.209\nWorkclass X Income 0.213\nWorkclass X IncomeBinary 0.213\nEducation X Age -0.0\nEducation X Workcla

The same run on the original data (`adult_data.csv`):

In [5]:
# End-to-end HoloClean run on `adult_data.csv`: configure a session, load the
# data and denial constraints, detect erroneous cells, then repair them.
from detect import NullDetector, ViolationDetector
# Import only the featurizers used below rather than `import *`, so the origin
# of every name is explicit and nothing else leaks into the notebook namespace.
from repair.featurize import (
    ConstraintFeaturizer,
    FreqFeaturizer,
    InitAttrFeaturizer,
    OccurAttrFeaturizer,
)

# 1. Setup a HoloClean session.
# `.session` is taken immediately, so `hc` is the session object that every
# later step is called on (not the HoloClean instance itself).
hc = holoclean.HoloClean(
    db_name='holo',
    domain_thresh_1=0,
    domain_thresh_2=0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    epochs=10,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    feature_norm=False,
    weight_norm=False,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
# Both paths are relative to the notebook's working directory.
hc.load_data('adult_data', 'adult_data.csv')
hc.load_dcs('adult_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

# Kept as the last expression so the notebook displays its return value
# (the featurizer-weight summary, since `print_fw=True`).
hc.repair_errors(featurizers)

01:27:07 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
01:27:07 - [ INFO] - Loaded 32392 rows with 421096 cells
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 - [DEBUG] - Time to create index: 0.00 secs
01:27:13 -

59463it [00:20, 2974.52it/s]3 [00:20<00:33, 2969.54it/s][A
59764it [00:20, 2984.86it/s]3 [00:20<00:33, 2968.49it/s][A
60063it [00:20, 2967.62it/s]3 [00:20<00:32, 2985.71it/s][A
60360it [00:20, 2967.05it/s]3 [00:20<00:33, 2954.54it/s][A
60662it [00:21, 2979.87it/s]3 [00:20<00:32, 2982.64it/s][A
60963it [00:21, 2976.99it/s]3 [00:20<00:32, 2973.55it/s][A
61269it [00:21, 2989.52it/s]3 [00:21<00:32, 2999.49it/s][A
61571it [00:21, 2989.58it/s]3 [00:21<00:32, 2989.22it/s][A
61870it [00:21, 2962.04it/s]3 [00:21<00:32, 2984.39it/s][A
62179it [00:21, 2988.11it/s]3 [00:21<00:32, 2975.89it/s][A
62482it [00:21, 2993.20it/s]3 [00:21<00:31, 3000.61it/s][A
62794it [00:21, 3029.96it/s]3 [00:21<00:31, 3020.74it/s][A
63098it [00:21, 3017.90it/s]3 [00:21<00:31, 3022.54it/s][A
63400it [00:21, 3016.98it/s]3 [00:21<00:31, 3017.95it/s][A
63702it [00:22, 3008.42it/s]3 [00:21<00:31, 3017.99it/s][A
64003it [00:22, 2995.29it/s]3 [00:21<00:31, 2992.91it/s][A
64307it [00:22, 3006.71it/s]3 [00:22<00:

138731it [00:48, 2907.34it/s]3 [00:47<00:06, 2903.76it/s][A
139024it [00:48, 2909.27it/s]3 [00:48<00:06, 2917.58it/s][A
139315it [00:48, 2901.26it/s]3 [00:48<00:06, 2911.07it/s][A
139611it [00:48, 2912.89it/s]3 [00:48<00:06, 2910.86it/s][A
139903it [00:48, 2908.39it/s]3 [00:48<00:06, 2916.50it/s][A
140194it [00:48, 2894.12it/s]3 [00:48<00:06, 2892.70it/s][A
140484it [00:48, 2893.20it/s]3 [00:48<00:06, 2890.51it/s][A
140774it [00:48, 2880.80it/s]3 [00:48<00:05, 2901.91it/s][A
141064it [00:48, 2886.18it/s]3 [00:48<00:05, 2891.85it/s][A
141353it [00:49, 2839.84it/s]3 [00:48<00:05, 2831.17it/s][A
141643it [00:49, 2852.23it/s]3 [00:49<00:05, 2847.43it/s][A
141939it [00:49, 2877.40it/s]3 [00:49<00:05, 2874.22it/s][A
142228it [00:49, 2874.01it/s]3 [00:49<00:05, 2874.47it/s][A
142516it [00:49, 2865.68it/s]3 [00:49<00:05, 2870.43it/s][A
142809it [00:49, 2881.08it/s]3 [00:49<00:05, 2877.84it/s][A
143099it [00:49, 2879.53it/s]3 [00:49<00:05, 2876.54it/s][A
143389it [00:49, 2881.86

 80%|████████  | 8/10 [09:26<02:21, 70.64s/it]01:44:40 - [DEBUG] - Epoch 9, cost = 0.280715, acc = 98.25%
 90%|█████████ | 9/10 [10:36<01:10, 70.41s/it]01:45:56 - [DEBUG] - Epoch 10, cost = 0.280715, acc = 98.25%
100%|██████████| 10/10 [11:51<00:00, 71.98s/it]
01:45:56 - [ INFO] - DONE training repair model.
01:45:56 - [DEBUG] - Time to fit repair model: 713.18 secs
01:45:56 - [ INFO] - inferring on 583 examples (cells)
01:46:04 - [DEBUG] - Time to execute query: 5.31 secs
01:46:07 - [DEBUG] - Time to create index: 0.00 secs
01:46:07 - [DEBUG] - Time to create index: 0.00 secs
01:46:07 - [ INFO] - DONE inferring repairs.
01:46:07 - [DEBUG] - Time to infer correct cell values: 8.47 secs
01:46:08 - [DEBUG] - Time to create table: 0.00 secs
01:46:08 - [DEBUG] - Time to create index: 0.00 secs
01:46:08 - [DEBUG] - Time to create index: 0.00 secs
01:46:08 - [ INFO] - DONE collecting the inferred values.
01:46:08 - [DEBUG] - Time to collect inferred values: 0.08 secs
01:46:16 - [ INFO] - DON

'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nAge 1.0\nWorkclass 1.0\nEducation 1.0\nMaritalstatus 1.0\nOccupation 1.0\nRelationship 1.0\nRace 1.0\nSex 1.0\nHoursPerWeek 1.0\nCountry 1.0\nIncome 1.0\nIncomeBinary 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 0.9271,min -0.0115,avg 0.1021,abs_avg 0.1023,weights:\nAge X Age 0.0\nAge X Workclass 0.0\nAge X Education -0.0\nAge X Maritalstatus 0.0\nAge X Occupation -0.0\nAge X Relationship -0.0\nAge X Race -0.0\nAge X Sex -0.0\nAge X HoursPerWeek -0.0\nAge X Country -0.0\nAge X Income -0.0\nAge X IncomeBinary -0.0\nWorkclass X Age 0.253\nWorkclass X Workclass 0.0\nWorkclass X Education 0.263\nWorkclass X Maritalstatus 0.234\nWorkclass X Occupation 0.385\nWorkclass X Relationship 0.227\nWorkclass X Race 0.21\nWorkclass X Sex 0.211\nWorkclass X HoursPerWeek 0.251\nWorkclass X Country 0.209\nWorkclass X Income 0.209\nWorkclass X IncomeBinary 0.209\nEducation X Age -0.0\nEducation X Workclas

HoloClean run for SmallHolocleanExperimentWithAdult.ipynb

In [3]:
# End-to-end HoloClean run: configure a session, load the data and denial
# constraints, detect erroneous cells, then repair them.
from detect import NullDetector, ViolationDetector
# Import only the featurizers used below rather than `import *`, so the origin
# of every name is explicit and nothing else leaks into the notebook namespace.
from repair.featurize import (
    ConstraintFeaturizer,
    FreqFeaturizer,
    InitAttrFeaturizer,
    OccurAttrFeaturizer,
)

# 1. Setup a HoloClean session.
# `.session` is taken immediately, so `hc` is the session object that every
# later step is called on (not the HoloClean instance itself).
hc = holoclean.HoloClean(
    db_name='holo',
    domain_thresh_1=0,
    domain_thresh_2=0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    epochs=10,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    feature_norm=False,
    weight_norm=False,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
# NOTE(review): this cell loads the hospital sample, but the notebook heading
# names an Adult experiment and the recorded output below it reports 48842 rows
# with Adult attribute names (Age, Workclass, ...). The saved output looks like
# it came from a different input than the one coded here -- confirm which
# dataset/constraints were intended before re-running.
hc.load_data('hospital', '../testdata/hospital.csv')
hc.load_dcs('../testdata/hospital_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

# Kept as the last expression so the notebook displays its return value
# (the featurizer-weight summary, since `print_fw=True`).
hc.repair_errors(featurizers)

14:34:50 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
14:34:50 - [ INFO] - Loaded 48842 rows with 586104 cells
14:35:05 - [DEBUG] - Time to create index: 0.00 secs
14:35:05 - [DEBUG] - Time to create index: 0.00 secs
14:35:06 - [DEBUG] - Time to create index: 0.00 secs
14:35:06 - [DEBUG] - Time to create index: 0.00 secs
14:35:06 - [DEBUG] - Time to create index: 0.00 secs
14:35:06 - [DEBUG] - Time to create index: 0.00 secs
14:35:06 -

61674it [00:21, 3108.59it/s]2 [00:21<00:56, 3114.00it/s][A
61992it [00:21, 3128.99it/s]2 [00:21<00:56, 3118.44it/s][A
62317it [00:21, 3163.92it/s]2 [00:21<00:56, 3132.33it/s][A
62634it [00:21, 3110.65it/s]2 [00:21<00:55, 3160.48it/s][A
62946it [00:21, 3107.44it/s]2 [00:21<00:56, 3101.57it/s][A
63257it [00:21, 3098.36it/s]2 [00:21<00:56, 3101.43it/s][A
63582it [00:22, 3131.25it/s]2 [00:21<00:56, 3111.60it/s][A
63896it [00:22, 2660.12it/s]2 [00:21<00:55, 3136.62it/s][A
64184it [00:22, 2721.13it/s]2 [00:21<01:05, 2656.33it/s][A
64495it [00:22, 2827.18it/s]2 [00:22<01:03, 2728.22it/s][A
64815it [00:22, 2926.73it/s]2 [00:22<01:01, 2822.07it/s][A
65126it [00:22, 2973.80it/s]2 [00:22<00:59, 2937.47it/s][A
65430it [00:22, 2992.76it/s]2 [00:22<00:58, 2970.73it/s][A
65739it [00:22, 3013.47it/s]2 [00:22<00:57, 3002.23it/s][A
66043it [00:22, 2975.84it/s]2 [00:22<00:57, 3007.99it/s][A
66345it [00:23, 2988.59it/s]2 [00:22<00:58, 2971.94it/s][A
66657it [00:23, 3016.11it/s]2 [00:22<00:

146042it [00:48, 3179.17it/s]2 [00:48<00:29, 3184.30it/s][A
146361it [00:49, 3171.85it/s]2 [00:48<00:29, 3164.26it/s][A
146690it [00:49, 3195.39it/s]2 [00:48<00:28, 3190.46it/s][A
147010it [00:49, 3169.72it/s]2 [00:48<00:28, 3166.43it/s][A
147330it [00:49, 3172.42it/s]2 [00:49<00:28, 3172.79it/s][A
147656it [00:49, 3190.56it/s]2 [00:49<00:28, 3195.50it/s][A
147982it [00:49, 3210.58it/s]2 [00:49<00:28, 3205.68it/s][A
148304it [00:49, 3164.26it/s]2 [00:49<00:28, 3159.72it/s][A
148630it [00:49, 3184.14it/s]2 [00:49<00:28, 3183.53it/s][A
148949it [00:49, 3180.25it/s]2 [00:49<00:28, 3185.31it/s][A
149268it [00:49, 3165.48it/s]2 [00:49<00:28, 3168.52it/s][A
149585it [00:50, 3052.16it/s]2 [00:49<00:29, 3056.30it/s][A
149906it [00:50, 3097.28it/s]2 [00:49<00:28, 3098.08it/s][A
150222it [00:50, 3113.72it/s]2 [00:50<00:28, 3115.67it/s][A
150541it [00:50, 3130.14it/s]2 [00:50<00:28, 3124.96it/s][A
150868it [00:50, 3154.62it/s]2 [00:50<00:27, 3164.15it/s][A
151194it [00:50, 3172.95

231400it [01:16, 3154.89it/s]2 [01:15<00:02, 3148.60it/s][A
231720it [01:16, 3157.37it/s]2 [01:16<00:02, 3150.58it/s][A
232037it [01:16, 3149.34it/s]2 [01:16<00:02, 3152.53it/s][A
232358it [01:16, 3166.46it/s]2 [01:16<00:01, 3166.54it/s][A
232680it [01:16, 3175.49it/s]2 [01:16<00:01, 3173.70it/s][A
232998it [01:16, 3175.00it/s]2 [01:16<00:01, 3178.13it/s][A
233316it [01:16, 3172.52it/s]2 [01:16<00:01, 3173.17it/s][A
233639it [01:16, 3178.62it/s]2 [01:16<00:01, 3169.65it/s][A
233960it [01:16, 3178.78it/s]2 [01:16<00:01, 3177.74it/s][A
234278it [01:17, 3150.35it/s]2 [01:16<00:01, 3153.32it/s][A
234595it [01:17, 3148.58it/s]2 [01:16<00:01, 3153.32it/s][A
234910it [01:17, 3114.87it/s]2 [01:17<00:01, 3129.16it/s][A
235224it [01:17, 3113.21it/s]2 [01:17<00:01, 3113.22it/s][A
235541it [01:17, 3125.53it/s]2 [01:17<00:00, 3114.73it/s][A
235857it [01:17, 3125.77it/s]2 [01:17<00:00, 3129.73it/s][A
236170it [01:17, 3120.72it/s]2 [01:17<00:00, 3124.89it/s][A
236488it [01:17, 3136.74

15:06:09 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 11,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nAge 1.0\nWorkclass 1.0\nEducation 1.0\nMaritalstatus 1.0\nOccupation 1.0\nRelationship 1.0\nRace 1.0\nSex 1.0\nHoursPerWeek 1.0\nCountry 1.0\nIncome 1.0\nfeaturizer OccurAttrFeaturizer,size 121,max 1.2124,min -0.0000,avg 0.1211,abs_avg 0.1211,weights:\nAge X Age 0.0\nAge X Workclass 0.0\nAge X Education -0.0\nAge X Maritalstatus 0.0\nAge X Occupation -0.0\nAge X Relationship -0.0\nAge X Race -0.0\nAge X Sex -0.0\nAge X HoursPerWeek -0.0\nAge X Country -0.0\nAge X Income -0.0\nWorkclass X Age 0.249\nWorkclass X Workclass 0.0\nWorkclass X Education 0.272\nWorkclass X Maritalstatus 0.23\nWorkclass X Occupation 0.496\nWorkclass X Relationship 0.225\nWorkclass X Race 0.208\nWorkclass X Sex 0.211\nWorkclass X HoursPerWeek 0.271\nWorkclass X Country 0.2\nWorkclass X Income 0.209\nEducation X Age 0.0\nEducation X Workclass 0.0\nEducation X Education -0.0\nEducation X Maritalstatus -0.0\nEducation X