Using HoloClean to clean the synthetic ACS Census person data without constraints for data_repair_and_fairness.

Based on https://github.com/HoloClean/holoclean/blob/master/examples/holoclean_repair_example.py .

In [6]:
import sys
sys.path.append('holoclean/')
import holoclean

In [7]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# Step 1 -- open a HoloClean session.
# The settings live in one mapping so the full configuration can be read
# (and compared against the logged session parameters) at a glance.
session_settings = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_settings).session

# Step 2 -- load the synthetic dataset, then read the denial constraints
# file and register the parsed constraints on the session's dataset.
hc.load_data(
    'synthetic_data_version_1',
    'datasets/synthetic_data_version_1.csv',
)
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# Step 3 -- flag erroneous cells with a null detector and a
# constraint-violation detector.
detectors = [
    NullDetector(),
    ViolationDetector(),
]
hc.detect_errors(detectors)

# Step 4 -- set up the repair domain, then repair using the featurizers
# listed below.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

# Kept as the final bare expression so the notebook shows its return value.
hc.repair_errors(featurizers)

08:02:16 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
08:02:16 - [ INFO] - Loaded 1000 rows with 13000 cells
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [DEBUG] - Time to create index: 0.00 secs
08:02:17 - [

08:02:32 - [DEBUG] - Time to setup repair model: 7.22 secs
08:02:32 - [ INFO] - training with 5725 training examples (cells)
  0%|          | 0/10 [00:00<?, ?it/s]08:02:35 - [DEBUG] - Epoch 1, cost = 0.363371, acc = 99.63%
 10%|█         | 1/10 [00:02<00:22,  2.45s/it]08:02:37 - [DEBUG] - Epoch 2, cost = 0.261075, acc = 99.63%
 20%|██        | 2/10 [00:05<00:19,  2.48s/it]08:02:40 - [DEBUG] - Epoch 3, cost = 0.253058, acc = 99.63%
 30%|███       | 3/10 [00:07<00:17,  2.48s/it]08:02:42 - [DEBUG] - Epoch 4, cost = 0.252563, acc = 99.63%
 40%|████      | 4/10 [00:09<00:14,  2.47s/it]08:02:45 - [DEBUG] - Epoch 5, cost = 0.252535, acc = 99.63%
 50%|█████     | 5/10 [00:12<00:12,  2.48s/it]08:02:47 - [DEBUG] - Epoch 6, cost = 0.252534, acc = 99.63%
 60%|██████    | 6/10 [00:14<00:09,  2.48s/it]08:02:50 - [DEBUG] - Epoch 7, cost = 0.252534, acc = 99.63%
 70%|███████   | 7/10 [00:17<00:07,  2.48s/it]08:02:52 - [DEBUG] - Epoch 8, cost = 0.252534, acc = 99.63%
 80%|████████  | 8/10 [00:19<00:04,

'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.4194,min -0.0000,avg 0.1533,abs_avg 0.1533,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P 0.15\nREGION X SEX 0.137\nREGION X REGION -0.0\nREGION X PINCP 1.288\nREGION X ST 0.234\nREGION X COW 0.075\nREGION X CIT 0.141\nREGION X NATIVITY 0.136\nREGION X MSP 0.114\nREGION X SCHL 0.179\nREGION X DIS 0.139\nREGION X MIL 0.118\nPINCP X RAC1P

Experiment in which I inject errors into my database and use HoloClean to show that data cleaning systems do not maintain representations by default.

In [4]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# Step 1 -- open a HoloClean session.
# The settings live in one mapping so the full configuration can be read
# (and compared against the logged session parameters) at a glance.
session_settings = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**session_settings).session

# Step 2 -- load the experiment dataset, then read the denial constraints
# file and register the parsed constraints on the session's dataset.
hc.load_data(
    'experiment_data',
    'datasets/experiment_data.csv',
)
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# Step 3 -- flag erroneous cells with a null detector and a
# constraint-violation detector.
detectors = [
    NullDetector(),
    ViolationDetector(),
]
hc.detect_errors(detectors)

# Step 4 -- set up the repair domain, then repair using the featurizers
# listed below.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

# Kept as the final bare expression so the notebook shows its return value.
hc.repair_errors(featurizers)

23:50:49 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
23:50:49 - [ INFO] - Loaded 32755 rows with 425815 cells
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 - [DEBUG] - Time to create index: 0.00 secs
23:50:55 -

00:20:33 - [DEBUG] - Time to store featurizer weights: 0.00 secs


'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.1097,min -0.2212,avg 0.1012,abs_avg 0.1420,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P -0.201\nREGION X SEX -0.221\nREGION X REGION -0.0\nREGION X PINCP -0.086\nREGION X ST 1.11\nREGION X COW -0.073\nREGION X CIT 0.349\nREGION X NATIVITY -0.22\nREGION X MSP -0.184\nREGION X SCHL -0.216\nREGION X DIS -0.22\nREGION X MIL -0.181\nPINCP 