Using HoloClean to clean the synthetic ACS Census person data for data_repair_and_fairness.

Based on the HoloClean repair example: https://github.com/HoloClean/holoclean/blob/master/examples/holoclean_repair_example.py

In [4]:
import sys
sys.path.append('holoclean/')
import holoclean

In [7]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# 1. Set up a HoloClean session.
# All session options are gathered in one mapping so the full configuration
# can be read (and compared against other runs) in a single place.
session_config = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    # Evaluates to 180000; presumably milliseconds -- confirm against the
    # HoloClean session options.
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_config).session

# 2. Load the input table and the denial constraints that apply to it.
hc.load_data('synthetic_data_version_1', 'synthetic_data_version_1.csv')
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells with the two detectors, instantiated in order.
detectors = [
    _component_cls()
    for _component_cls in (NullDetector, ViolationDetector)
]
hc.detect_errors(detectors)

# 4. Prepare the domain, then repair using the featurizers listed below
#    (instantiated in the same order as they are named).
hc.setup_domain()
featurizers = [
    _component_cls()
    for _component_cls in (
        InitAttrFeaturizer,
        OccurAttrFeaturizer,
        FreqFeaturizer,
        ConstraintFeaturizer,
    )
]

hc.repair_errors(featurizers)

10:39:15 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
10:39:16 - [ INFO] - Loaded 32755 rows with 425815 cells
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 - [DEBUG] - Time to create index: 0.00 secs
10:39:23 -

105275it [00:06, 17145.49it/s] [00:06<00:09, 17203.27it/s][A
107036it [00:06, 17271.73it/s] [00:06<00:09, 17137.84it/s][A
108807it [00:06, 17394.44it/s] [00:06<00:09, 17245.67it/s][A
110621it [00:06, 17603.45it/s] [00:06<00:09, 17389.91it/s][A
112383it [00:07, 17534.39it/s] [00:06<00:09, 17590.82it/s][A
114138it [00:07, 17405.33it/s] [00:06<00:09, 17520.94it/s][A
115880it [00:07, 17377.58it/s] [00:06<00:09, 17422.46it/s][A
117619it [00:07, 17274.39it/s] [00:07<00:09, 17392.11it/s][A
119412it [00:07, 17463.49it/s] [00:07<00:09, 17280.62it/s][A
121188it [00:07, 17549.53it/s] [00:07<00:08, 17478.37it/s][A
122944it [00:07, 17540.89it/s] [00:07<00:08, 17547.24it/s][A
124739it [00:07, 17661.38it/s] [00:07<00:08, 17559.25it/s][A
126518it [00:07, 17696.82it/s] [00:07<00:08, 17683.29it/s][A
128289it [00:07, 17680.63it/s] [00:07<00:08, 17667.34it/s][A
130058it [00:08, 17336.05it/s] [00:07<00:08, 17654.82it/s][A
131794it [00:08, 16947.25it/s] [00:07<00:08, 17356.66it/s][A
133492it

11:24:23 - [DEBUG] - Time to execute query with id 1: 0.13 secs
11:24:23 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'CIT'   AND  t1."NATIVITY"<>t2."NATIVITY"   AND  t3.rv_val = t2."CIT" GROUP BY _vid_, val_id with id 2
11:28:18 - [DEBUG] - Failed to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'CIT'   AND  t1."NATIVITY"<>t2."NATIVITY"   AND  t3.rv_val = t2."CIT" GROUP BY _vid_, val_id with id 2. Timeout reached.
11:28:18 - [DEBUG] - Starting to execute backup query SELECT _vid_, val_id, 1 violations FROM   "synthetic_data_version_1" as t1, pos_values as t3 WHERE  t1._tid_ = t3._tid_   AND  t3.attribute = 'C

12:02:14 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.4676,min -0.0000,avg 0.1294,abs_avg 0.1294,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P 0.097\nREGION X SEX 0.095\nREGION X REGION -0.0\nREGION X PINCP 1.355\nREGION X ST 0.102\nREGION X COW 0.063\nREGION X CIT 0.095\nREGION X NATIVITY 0.095\nREGION X MSP 0.083\nREGION X SCHL 0.093\nREGION X DIS 0.095\nREGION X MIL 0.081\nPINCP X RAC1

Experiment in which I inject errors into my database and run HoloClean on it, to show that data cleaning systems do not maintain representations by default.

In [6]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# 1. Set up a HoloClean session.
# All session options are gathered in one mapping so the full configuration
# can be read (and compared against other runs) in a single place.
session_config = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    # Evaluates to 180000; presumably milliseconds -- confirm against the
    # HoloClean session options.
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_config).session

# 2. Load the error-injected experiment table and the denial constraints.
hc.load_data('experiment_data', 'experiment_data.csv')
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells with the two detectors, instantiated in order.
detectors = [
    _component_cls()
    for _component_cls in (NullDetector, ViolationDetector)
]
hc.detect_errors(detectors)

# 4. Prepare the domain, then repair using the featurizers listed below
#    (instantiated in the same order as they are named).
hc.setup_domain()
featurizers = [
    _component_cls()
    for _component_cls in (
        InitAttrFeaturizer,
        OccurAttrFeaturizer,
        FreqFeaturizer,
        ConstraintFeaturizer,
    )
]

hc.repair_errors(featurizers)

09:09:34 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
09:09:35 - [ INFO] - Loaded 32755 rows with 425815 cells
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 - [DEBUG] - Time to create index: 0.00 secs
09:09:41 -

269198it [00:06, 38757.61it/s] [00:06<00:00, 41571.48it/s][A
09:11:55 - [DEBUG] - DONE assembling cell domain table in 12.62s
09:11:55 - [ INFO] - number of (additional) weak labels assigned from posterior model: 50
09:11:55 - [DEBUG] - DONE generating domain and weak labels
09:12:36 - [DEBUG] - Time to create index: 0.00 secs
09:12:36 - [DEBUG] - Time to create index: 0.00 secs
09:12:36 - [DEBUG] - Time to create index: 0.00 secs
09:12:40 - [DEBUG] - Time to create table: 0.00 secs
09:12:50 - [DEBUG] - Time to create index: 0.00 secs
09:12:50 - [ INFO] - DONE with domain preparation.
09:12:50 - [DEBUG] - Time to setup the domain: 166.20 secs
09:12:51 - [DEBUG] - Time to execute query: 0.00 secs
09:12:51 - [DEBUG] - Time to execute query: 0.00 secs
09:12:51 - [DEBUG] - Time to execute query: 0.00 secs
09:12:52 - [DEBUG] - Time to execute query: 0.00 secs
09:12:52 - [DEBUG] - Time to execute query: 0.00 secs
09:12:52 - [DEBUG] - featurizing training data...
09:12:52 - [DEBUG] - Time to

09:37:35 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 0.7357,min -0.0460,avg 0.0965,abs_avg 0.1042,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P -0.041\nREGION X SEX -0.041\nREGION X REGION -0.0\nREGION X PINCP 0.206\nREGION X ST 0.136\nREGION X COW -0.008\nREGION X CIT 0.037\nREGION X NATIVITY -0.036\nREGION X MSP -0.039\nREGION X SCHL -0.036\nREGION X DIS -0.035\nREGION X MIL -0.033\nPINC