This notebook uses HoloClean to clean the synthetic person data from the main section of the fairness_and_repair_exploratory_analysis notebook.

The code is based on the HoloClean repair example: https://github.com/HoloClean/holoclean/blob/master/examples/holoclean_repair_example.py

In [1]:
import sys
# Extend the module search path with the relative 'holoclean/' directory
# before importing the package; the append must precede the import below.
# NOTE(review): presumably this is a local checkout of the HoloClean
# repository sitting next to the notebook -- confirm.
sys.path.append('holoclean/')
import holoclean

In [2]:
from detect import NullDetector, ViolationDetector
from repair.featurize import *

# Step 1 -- open a HoloClean session.
# All session options are collected in one mapping and unpacked into the
# constructor as keyword arguments.
_session_options = {
    'db_name': 'holo',
    'domain_thresh_1': 0,
    'domain_thresh_2': 0,
    'weak_label_thresh': 0.99,
    'max_domain': 10000,
    'cor_strength': 0.6,
    'nb_cor_strength': 0.8,
    'epochs': 10,
    'weight_decay': 0.01,
    'learning_rate': 0.001,
    'threads': 1,
    'batch_size': 1,
    'verbose': True,
    # 3 * 60000 = 180000; the session log below shows a query being
    # abandoned three minutes after it started, i.e. the unit appears to be
    # milliseconds.
    'timeout': 3 * 60000,
    'feature_norm': False,
    'weight_norm': False,
    'print_fw': True,
}
hc = holoclean.HoloClean(**_session_options).session

# Step 2 -- load the dataset and its denial constraints, then register the
# parsed constraints on the session's dataset object.
hc.load_data('synthetic_data_version_1', 'synthetic_data_version_1.csv')
hc.load_dcs('synthetic_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# Step 3 -- flag erroneous cells with a null detector and a
# constraint-violation detector (instantiated in this order).
detectors = [detector_cls() for detector_cls in (NullDetector, ViolationDetector)]
hc.detect_errors(detectors)

# Step 4 -- build the candidate domain and run the repair using the four
# featurizers below (instantiated in this order).
hc.setup_domain()
featurizers = [
    featurizer_cls()
    for featurizer_cls in (
        InitAttrFeaturizer,
        OccurAttrFeaturizer,
        FreqFeaturizer,
        ConstraintFeaturizer,
    )
]

hc.repair_errors(featurizers)

03:15:17 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 10, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0, 'domain_thresh_2': 0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_epochs': 3, 'estimator_batch_size': 32, 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}
03:15:17 - [ INFO] - Loaded 32755 rows with 425815 cells
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 - [DEBUG] - Time to create index: 0.00 secs
03:15:26 -

03:36:24 - [DEBUG] - Starting to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'NATIVITY'   AND  t1."CIT"=t2."CIT"   AND  t3.rv_val <> t2."NATIVITY" GROUP BY _vid_, val_id with id 3
03:39:24 - [DEBUG] - Failed to execute query SELECT _vid_, val_id, count(*) violations FROM   "synthetic_data_version_1" as t1, "synthetic_data_version_1" as t2, pos_values as t3 WHERE  t1._tid_ != t2._tid_   AND  t1._tid_ = t3._tid_   AND  t3.attribute = 'NATIVITY'   AND  t1."CIT"=t2."CIT"   AND  t3.rv_val <> t2."NATIVITY" GROUP BY _vid_, val_id with id 3. Timeout reached.
03:39:24 - [DEBUG] - Starting to execute backup query SELECT _vid_, val_id, 1 violations FROM   "synthetic_data_version_1" as t1, pos_values as t3 WHERE  t1._tid_ = t3._tid_   AND  t3.attribute = 'NATIVITY'   AND EXISTS (SELECT t2._tid_               FROM   "syn

'featurizer InitAttrFeaturizer,size 12,max 1.0000,min 1.0000,avg 1.0000,abs_avg 1.0000,weights:\nRAC1P 1.0\nSEX 1.0\nREGION 1.0\nPINCP 1.0\nST 1.0\nCOW 1.0\nCIT 1.0\nNATIVITY 1.0\nMSP 1.0\nSCHL 1.0\nDIS 1.0\nMIL 1.0\nfeaturizer OccurAttrFeaturizer,size 144,max 1.4676,min -0.0000,avg 0.1294,abs_avg 0.1294,weights:\nRAC1P X RAC1P 0.0\nRAC1P X SEX 0.0\nRAC1P X REGION -0.0\nRAC1P X PINCP 0.0\nRAC1P X ST -0.0\nRAC1P X COW -0.0\nRAC1P X CIT -0.0\nRAC1P X NATIVITY -0.0\nRAC1P X MSP -0.0\nRAC1P X SCHL -0.0\nRAC1P X DIS -0.0\nRAC1P X MIL -0.0\nSEX X RAC1P 0.0\nSEX X SEX 0.0\nSEX X REGION -0.0\nSEX X PINCP 0.0\nSEX X ST -0.0\nSEX X COW -0.0\nSEX X CIT 0.0\nSEX X NATIVITY 0.0\nSEX X MSP 0.0\nSEX X SCHL -0.0\nSEX X DIS 0.0\nSEX X MIL 0.0\nREGION X RAC1P 0.097\nREGION X SEX 0.095\nREGION X REGION -0.0\nREGION X PINCP 1.355\nREGION X ST 0.102\nREGION X COW 0.063\nREGION X CIT 0.095\nREGION X NATIVITY 0.095\nREGION X MSP 0.083\nREGION X SCHL 0.093\nREGION X DIS 0.095\nREGION X MIL 0.081\nPINCP X RAC1