Using HoloClean to clean the synthetic ACS Census person data without constraints for data_repair_and_fairness.

Based on the HoloClean example script at https://github.com/HoloClean/holoclean/blob/master/examples/holoclean_repair_example.py

In [2]:
# Add the 'holoclean/' directory to the module search path before importing
# the package. The path is relative, so it is resolved against the current
# working directory.
# NOTE(review): the later `from detect import ...` / `from repair.featurize
# import *` imports presumably resolve through this same path entry — confirm.
import sys
sys.path.append('holoclean/')
import holoclean

In [7]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# Step 1: open a HoloClean session.
# Every keyword argument is collected in one mapping so the whole
# configuration can be read (and compared between runs) at a glance.
session_options = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    # Evaluates to 180000; the unit is presumably milliseconds — confirm
    # against the HoloClean documentation.
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_options).session

# Step 2: load the dataset, then read the denial constraints file and
# register the loaded constraints on the session's dataset.
hc.load_data('synthetic_data_version_1', 'synthetic_data_version_1.csv')
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# Step 3: run error detection with one instance of each detector class,
# instantiated in the listed order.
detectors = [
    detector_cls()
    for detector_cls in (NullDetector, ViolationDetector)
]
hc.detect_errors(detectors)

# Step 4: set up the domain, then repair using one instance of each
# featurizer class, instantiated in the listed order.
hc.setup_domain()
featurizers = [
    featurizer_cls()
    for featurizer_cls in (
        InitAttrFeaturizer,
        OccurAttrFeaturizer,
        FreqFeaturizer,
        ConstraintFeaturizer,
    )
]

hc.repair_errors(featurizers)

10:39:15 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
10:39:16 - [ INFO] - Loaded 32755 rows with 425815 cells
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 -

11:24:23 - [DEBUG] - Time to execute query with id 1: 0.13 secs
11:24:23 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'CIT'   AND  t1."NATIVITY"<>t2."NATIVITY"   AND  t3.rv_val = t2."CIT" GROUP BY _vid_, val_id with id 2
11:28:18 - [DEBUG] - Failed to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'CIT'   AND  t1."NATIVITY"<>t2."NATIVITY"   AND  t3.rv_val = t2."CIT" GROUP BY _vid_, val_id with id 2. Timeout reached.
11:28:18 - [DEBUG] - Starting to execute backup query SELECT _vid_, val_id, 1 violations FROM   "synthetic_data_version_1" as t1, pos_values as t3 WHERE  t1._tid_ = t3._tid_   AND  t3.attribute = 'C

12:02:14 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.4676,min -0.0000,avg 0.1294,abs_avg 0.1294,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P 0.097\nREGION X SEX 0.095\nREGION X REGION -0.0\nREGION X PINCP 1.355\nREGION X ST 0.102\nREGION X COW 0.063\nREGION X CIT 0.095\nREGION X NATIVITY 0.095\nREGION X MSP 0.083\nREGION X SCHL 0.093\nREGION X DIS 0.095\nREGION X MIL 0.081\nPINCP X RAC1

Experiment in which I inject errors into my database and run HoloClean on it, to show that data cleaning systems do not maintain representations by default.

In [4]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# Step 1: open a HoloClean session.
# Every keyword argument is collected in one mapping so the whole
# configuration can be read (and compared between runs) at a glance.
session_options = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    # Evaluates to 180000; the unit is presumably milliseconds — confirm
    # against the HoloClean documentation.
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_options).session

# Step 2: load the experiment dataset, then read the denial constraints
# file and register the loaded constraints on the session's dataset.
hc.load_data('experiment_data', 'datasets/experiment_data.csv')
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# Step 3: run error detection with one instance of each detector class,
# instantiated in the listed order.
detectors = [
    detector_cls()
    for detector_cls in (NullDetector, ViolationDetector)
]
hc.detect_errors(detectors)

# Step 4: set up the domain, then repair using one instance of each
# featurizer class, instantiated in the listed order.
hc.setup_domain()
featurizers = [
    featurizer_cls()
    for featurizer_cls in (
        InitAttrFeaturizer,
        OccurAttrFeaturizer,
        FreqFeaturizer,
        ConstraintFeaturizer,
    )
]

hc.repair_errors(featurizers)

23:50:49 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
23:50:49 - [ INFO] - Loaded 32755 rows with 425815 cells
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 -

195823it [00:06, 23168.56it/s] [00:06<00:03, 21583.21it/s][A
198932it [00:07, 25083.09it/s] [00:06<00:03, 23166.49it/s][A
201960it [00:07, 26444.40it/s] [00:06<00:02, 25085.96it/s][A
204894it [00:07, 27251.15it/s] [00:06<00:02, 26447.61it/s][A
207831it [00:07, 27847.90it/s] [00:06<00:02, 27245.25it/s][A
210711it [00:07, 28063.37it/s] [00:07<00:02, 27831.25it/s][A
213584it [00:07, 27980.79it/s] [00:07<00:02, 28016.88it/s][A
216498it [00:07, 28314.19it/s] [00:07<00:01, 27957.62it/s][A
219483it [00:07, 28755.75it/s] [00:07<00:01, 28315.02it/s][A
222473it [00:07, 29088.11it/s] [00:07<00:01, 28780.19it/s][A
225401it [00:07, 28269.75it/s] [00:07<00:01, 29098.03it/s][A
228246it [00:08, 27414.02it/s] [00:07<00:01, 27928.45it/s][A
231006it [00:08, 26736.46it/s] [00:07<00:01, 27509.95it/s][A
233697it [00:08, 25978.25it/s] [00:07<00:01, 26726.08it/s][A
236312it [00:08, 25444.24it/s] [00:07<00:01, 26045.31it/s][A
238871it [00:08, 25119.71it/s] [00:08<00:01, 25457.92it/s][A
241394it

00:20:33 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.1097,min -0.2212,avg 0.1012,abs_avg 0.1420,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P -0.201\nREGION X SEX -0.221\nREGION X REGION -0.0\nREGION X PINCP -0.086\nREGION X ST 1.11\nREGION X COW -0.073\nREGION X CIT 0.349\nREGION X NATIVITY -0.22\nREGION X MSP -0.184\nREGION X SCHL -0.216\nREGION X DIS -0.22\nREGION X MIL -0.181\nPINCP 