# Start a HoloClean Session

In [1]:
from holoclean.holoclean import HoloClean, Session, GlobalVariables

# Build the HoloClean environment; every keyword below is a tuning knob.
holo = HoloClean(
    holoclean_path="..",  # where the holoclean package lives
    verbose=False,
    # --- pruning of possible values for training data ---
    pruning_threshold1=0.1,    # limits the possible values
    pruning_clean_breakoff=6,  # caps them at fewer than k values
    # --- pruning of possible values for dirty data ---
    pruning_threshold2=0,      # limits the possible values (applied after threshold 1)
    pruning_dk_breakoff=6,     # caps them at fewer than k values
    # --- learning parameters ---
    learning_iterations=30,
    learning_rate=0.001,
    batch_size=5,
)

# Open a session on that environment and reset its database.
session = Session(holo)
session.holo_env.reset_database()

  """)


In [2]:
# CSV file that is loaded into the session.
data_path = "data/hospital.csv"

# Load the file through the session; `data` is reused later for `data.schema.names`.
data = session.load_data(data_path)

In [3]:
# Path to the labeled CSV that is passed to HoloDetect.
labeled_data_path = "data/hospital_clean.csv"

# Create a HoloDetect Object

In [4]:
from holoclean.errordetection.holo_detect import HoloDetect

In [5]:
# Build the error detector from the labeled data path and the active session.
# NOTE(review): the recorded output of this cell reports
# "Could not execute Query  SELECT * FROM Init_flat_... Check log for info";
# check the log before trusting anything computed from `detector`.
detector = HoloDetect(labeled_data_path, session)

Could not execute Query  SELECT * FROM Init_flat_295385779671 Check log for info


# Create featurizers
## Add DCs to the system

In [6]:
# Load the denial constraints (DCs) from a text file through the session.
dc_path = "data/hospital_constraints.txt"
dcs = session.load_denial_constraints(dc_path)
# Uncomment to display the loaded constraints:
#dcs

In [7]:
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection

In [8]:
# DC-based featurizer bound to the current session; later added to `detector.featurizers`.
dcd = SqlDCErrorDetection(session)

In [9]:
# Disabled spot check: call the DC featurizer on one hand-picked input.
# NOTE(review): the tuple looks like (row id, attribute, value) — confirm against forward().
#dcd.forward((514, "State","al" ))

## Co-Occurrence Featurizer

In [10]:
from holoclean.errordetection.co_occur import CoOccurED

# Co-occurrence featurizer bound to the current session; later added to `detector.featurizers`.
co_occur = CoOccurED(session)

In [11]:
# Disabled spot check: call the co-occurrence featurizer on one hand-picked input.
# NOTE(review): the tuple looks like (row id, attribute, value) — confirm against forward().
#co_occur.forward((514, "State","al" ))

## GRU featurizer: character level

In [12]:
from holoclean.errordetection.gru_character_detector import GRUCharacterDetector
# Character-level GRU featurizer, constructed from `data.schema.names`
# (presumably the column names of the loaded table — TODO confirm).
gru = GRUCharacterDetector(data.schema.names)

## GRU Featurizer: word embeddings

In [13]:
from holoclean.errordetection.gru_word_detector import FastTextGRU
# Word-embedding GRU featurizer.
# NOTE(review): the first positional argument is missing — `FastTextGRU(, 20, ...)`
# is a SyntaxError as written. Restore the original first argument (check the
# FastTextGRU constructor signature) before running this cell.
word_gru = FastTextGRU(, 20, "wiki.en.bin")

In [14]:
# Attach the featurizers to the HoloDetect instance by assigning its
# `featurizers` list: DC detector, word-embedding GRU, co-occurrence,
# and character-level GRU (all created in the cells above).
detector.featurizers = [dcd, word_gru, co_occur, gru]

## Detect errors

In [15]:
# Run error detection; the call returns two values, kept as `preds` and `truth`.
# NOTE(review): in the recorded run this cell printed raw debug tensors per item,
# progressed at roughly 6-14 s/it over 5700 items, and its output ends with an
# IPython traceback and "IndexError: string index out of range" — the run did
# not complete cleanly.
preds, truth = detector.find_errors()

  0%|          | 0/5700 [00:00<?, ?it/s]

10024
tensor([[[ 0.0117,  0.1414, -0.1062,  0.2758, -0.0655, -0.0806,  0.0886,
           0.1665, -0.0254, -0.3505,  0.1147,  0.0986, -0.1604,  0.2270,
          -0.1254, -0.1471, -0.0687, -0.1322,  0.0788,  0.1487]],

        [[ 0.0915,  0.1782,  0.0762,  0.2860, -0.1037, -0.0082, -0.0233,
           0.0004, -0.1870, -0.0712,  0.1582,  0.1937, -0.2397,  0.2159,
           0.0015, -0.0613,  0.1239, -0.3187, -0.0782,  0.0903]],

        [[ 0.1925, -0.0579, -0.2116,  0.0232,  0.1941,  0.0627, -0.0025,
           0.1085, -0.1479, -0.0640,  0.0544, -0.0717, -0.4917,  0.0571,
          -0.1807, -0.0949,  0.2680, -0.0277,  0.0617,  0.4286]]])
torch.Size([3, 1, 20])


  0%|          | 2/5700 [00:12<9:41:37,  6.12s/it]

al_pn-5c
tensor([[[-0.1287, -0.0744, -0.2373, -0.0288,  0.0075, -0.0140,  0.0045,
          -0.1554,  0.2258, -0.1028,  0.3365, -0.0664, -0.1090, -0.0254,
          -0.2706, -0.2324,  0.4148,  0.2647,  0.0128,  0.3928]],

        [[ 0.0049,  0.2722, -0.0633,  0.1093, -0.2983,  0.0966, -0.0396,
          -0.0203, -0.1676, -0.2204,  0.2737, -0.2484, -0.2141, -0.0463,
          -0.1359, -0.1282, -0.0507, -0.0668,  0.2144,  0.1502]],

        [[ 0.2030,  0.0770, -0.1371,  0.1661, -0.0897,  0.1351, -0.2359,
           0.1397, -0.3450, -0.1329, -0.0504, -0.2920, -0.0537, -0.0627,
          -0.1430, -0.2459, -0.0856, -0.1043,  0.3152,  0.4000]]])
torch.Size([3, 1, 20])


  0%|          | 3/5700 [00:33<16:39:18, 10.52s/it]

100x9
tensor([[[ 0.0845, -0.0076, -0.1920,  0.2968, -0.0222, -0.1847,  0.0369,
          -0.0307, -0.0696, -0.2405,  0.0453, -0.0778, -0.0938,  0.0535,
           0.0416, -0.0517,  0.0199, -0.0505,  0.2922, -0.0229]],

        [[ 0.0642,  0.0937,  0.0887,  0.3467, -0.1036,  0.1011, -0.0316,
          -0.2044, -0.2289, -0.2025,  0.1928,  0.0906, -0.1800,  0.0821,
           0.0586,  0.1377,  0.0436, -0.2600,  0.2463,  0.0526]],

        [[ 0.1740, -0.0935, -0.1998,  0.1753,  0.0130,  0.1644, -0.0888,
          -0.0915, -0.1732, -0.0366,  0.0812,  0.0830, -0.3374,  0.0502,
          -0.1333, -0.1647,  0.1817, -0.0281,  0.3819,  0.2018]]])
torch.Size([3, 1, 20])


  0%|          | 4/5700 [00:54<21:47:08, 13.77s/it]

fayette
tensor([[[-0.4668, -0.2072, -0.0737, -0.0620,  0.0836, -0.3425, -0.0735,
          -0.1554, -0.1494,  0.1262,  0.0694, -0.1588, -0.1856, -0.0668,
           0.0490, -0.0239, -0.0270,  0.2092, -0.0061,  0.0507]],

        [[-0.2809, -0.3020, -0.3240, -0.1515,  0.1168, -0.2974,  0.0188,
          -0.0440, -0.1575,  0.0454,  0.0067, -0.1884, -0.1016, -0.0662,
          -0.1362, -0.0787, -0.2417,  0.1370, -0.0935, -0.0014]],

        [[-0.1526,  0.0868, -0.1365, -0.1803, -0.0567, -0.2153, -0.0943,
          -0.1810, -0.3238,  0.0324,  0.2832, -0.1255,  0.0518,  0.1367,
          -0.2467, -0.1769,  0.0830, -0.1517,  0.0107, -0.1458]]])
torch.Size([3, 1, 20])





Traceback (most recent call last):
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/inspect.py", line 1051, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/inspect.py", line 1011, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/Users/joshmcgrath/anaconda2/envs/torch/lib/python2.7/ins

IndexError: string index out of range

In [None]:
# Scratch cell (not executed in this saved run): display the DC detector's `cache` attribute.
dcd.cache

In [None]:
import torch
# Scratch check: length of the first dimension of a 3x4 zero tensor.
torch.zeros(3, 4).size(0)