# Tutorial 4: Attaching a Dataset

In [1]:
import libspn as spn
import tensorflow as tf

### Building a Test Graph with Random Weights

In [2]:
iv_x = spn.IVs(num_vars=2, num_vals=2, name="iv_x")
sum_11 = spn.Sum((iv_x, [0,1]), name="sum_11")
sum_12 = spn.Sum((iv_x, [0,1]), name="sum_12")
sum_21 = spn.Sum((iv_x, [2,3]), name="sum_21")
sum_22 = spn.Sum((iv_x, [2,3]), name="sum_22")
prod_1 = spn.Product(sum_11, sum_21, name="prod_1")
prod_2 = spn.Product(sum_11, sum_22, name="prod_2")
prod_3 = spn.Product(sum_12, sum_22, name="prod_3")
root = spn.Sum(prod_1, prod_2, prod_3, name="root")
iv_y = root.generate_ivs(name="iv_y")
spn.generate_weights(root, init_value=spn.ValueType.RANDOM_UNIFORM(0, 1))

### Visualizing the SPN Graph

In [3]:
spn.display_spn_graph(root)

### Specify Training Data
So what does CSVFileDataset do? From the docs we have:
```
"""
    A dataset read from a CSV file. The file can contain labels, which will be
    returned in a separate tensor. The labels should be stored in the first
    ``num_labels`` columns of the CSV file.

    If ``num_labels>0``, the data is returned as a tuple of tensors ``(samples,
    labels)``, where ``labels`` is a tensor of shape ``[batch_size,
    num_labels]``, containing the first ``num_labels`` columns and ``samples``
    is a tensor ``[batch_size, ?]`` containing the data samples. If
    ``num_labels==0``, the data is returned as a single tensor ``samples``.

    This dataset can be overridden to customize the way the data is processed
    grouped and cast. For instance, to divide the batch into three tensors, with
    different dtypes in different columns, define custom dataset::

        class CustomCSVFileDataset(spn.CSVFileDataset):

            def process_data(self, data):
                return [data[0], tf.stack(data[1:3]), tf.stack(data[3:])]

    and then, give defaults of different type::

        dataset = CustomCSVFileDataset(...,
                                       defaults=[[1.0], [1], [1], [1.0], [1.0]])

    Args:
        files (str or list): A string containing a path to a file or a glob
                             matching multiple files, or a list of paths to
                             multiple files. When glob is used, the files will
                             be sorted, unless ``shuffle`` is set to ``True``.
        num_vals (int or list of int): Number of values of each variable. Can be
            a single value or a list of values, one for each of ``num_vars``
            variables. Use ``None``, to indicate that a variable is continuous,
            in the range ``[0, 1]``.
        defaults (list of Tensor): A list of tensors, one tensor per column of
                                   the input record, with a default value for
                                   that column.
        num_epochs (int): Number of epochs of produced data.
        batch_size (int): Size of a single batch.
        shuffle (bool): Shuffle data within each epoch.
        num_labels (int): The number of columns considered labels. If set to
                          ``0``, no labels are returned.
        min_after_dequeue (int): Min number of elements in the data queue after
                                 each dequeue. This is the minimum number of
                                 elements from which the shuffled batch will
                                 be drawn. Relevant only if ``shuffle``
                                 is ``True``.
        num_threads (int): Number of threads enqueuing the data queue. If
                           larger than ``1``, the performance will be better,
                           but examples might not be in order even if
                           ``shuffle`` is ``False``. If ``shuffle`` is ``True``,
                           this might lead to examples repeating in the same
                           batch.
        allow_smaller_final_batch(bool): If ``False``, the last batch will be
                                         omitted if it has less elements than
                                         ``batch_size``.
        seed (int): Optional. Seed used when shuffling.
    """
```

In [4]:
dataset=spn.CSVFileDataset('data.csv', num_vals=[2, 2], defaults=[[-1],[-1],[-1]],
                           num_labels=1, num_epochs=10, batch_size=10, shuffle=False)
samples, labels = dataset.get_data()
iv_x.attach_feed(samples)
iv_y.attach_feed(labels)

[INFO] [spn.Dataset:get_data] Building dataset operations


ValueError: string_input_producer requires a non-null input tensor

### Add Learning Ops

In [5]:
init_weights = spn.initialize_weights(root)
learning = spn.EMLearning(root, initial_accum_value=2)
init_learning = learning.reset_accumulators()
accumulate_updates = learning.accumulate_updates()
update_spn = learning.update_spn()
likelihood = tf.reduce_mean(learning.value.values[root])

### Run Learning

In [6]:
epoch = 0
with spn.session() as (sess, run):
    sess.run(init_weights)
    sess.run(init_learning)
    try:    
        while run():
            likelihood_arr, _ = sess.run([likelihood, accumulate_updates])
            print("Avg. Likelihood: %s" % (likelihood_arr))
            sess.run(update_spn)
            
    except tf.errors.OutOfRangeError:
        print("Done!")

InvalidArgumentError: Shape [-1,2] has negative dimensions
	 [[Node: iv_x/Placeholder = Placeholder[dtype=DT_INT32, shape=[?,2], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op 'iv_x/Placeholder', defined at:
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/jos/.local/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/jos/.local/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/jos/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/jos/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/jos/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/jos/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/jos/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/jos/.local/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-3aa4fcd869c9>", line 1, in <module>
    iv_x = spn.IVs(num_vars=2, num_vals=2, name="iv_x")
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/libspn/graph/ivs.py", line 37, in __init__
    super().__init__(feed, name)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/libspn/graph/node.py", line 757, in __init__
    super().__init__(InferenceType.MARGINAL, name)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/libspn/graph/node.py", line 207, in __init__
    self._create()
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/libspn/graph/node.py", line 786, in _create
    self._placeholder = self._create_placeholder()
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/libspn/graph/ivs.py", line 68, in _create_placeholder
    return tf.placeholder(tf.int32, [None, self._num_vars])
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1530, in placeholder
    return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1954, in _placeholder
    name=name)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/jos/anaconda3/envs/libspn/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Shape [-1,2] has negative dimensions
	 [[Node: iv_x/Placeholder = Placeholder[dtype=DT_INT32, shape=[?,2], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


In [7]:
with spn.session() as (sess, run):
    sess.run(init_weights)
    sess.run(init_learning)
    try:
        while run():
            likelihoods, _ =  sess.run([likelihood, accumulate_updates])
            print("Avg. Likelihood: %s" % (avg_likelihood))
            sess.run(update_spn)
            
    except tf.errors.OutOfRangeError:
        print("TRAINING DONE!")

NameError: name 'avg_likelihood' is not defined