In [94]:
import csv
import pandas as pd
from pathlib import Path
from collections import OrderedDict

In [95]:
pd.__version__

'1.1.5'

In [96]:
def _load_predictions(path):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t', quotechar='"',quoting=csv.QUOTE_ALL,escapechar='\\')
        headers = next(reader)
        table = pd.DataFrame(list(reader),columns=headers)
    return table

In [97]:
base_dir = Path('/workspace/hsiehcc/tapas/')
preds_dir = base_dir/'results/wtq/model'
split='random-split-1-dev'#'test','random-split-1-dev'

In [103]:
def _format_fl(fl,prec=1,style='%'):
    return f"{fl:.{prec}{style}}"

## ERROR ANALYSIS

In [104]:
# load preprocessed ref file
ref = _load_predictions(preds_dir/'..'/f'{split}.tsv')
ref.loc[:, 'answer_coordinates'] = ref["answer_coordinates"].apply(eval)
ref.loc[:,'answer_coordinates'] = ref["answer_coordinates"].apply(lambda x:set(map(eval,x)))
ref = ref.rename(columns={'answer_coordinates':'answer_coordinates_gold'})
ref = ref[['id','answer_coordinates_gold']]
print('Total tf preprocessed refs:',len(ref))

Total tf preprocessed refs: 2810


In [105]:
ref['answer_coordinates_gold'][0]

{(-1, -1)}

In [106]:
# load predictions marked by official evaluator
marks = _load_predictions(base_dir/'formatted_predictions'/f'{split}_marked.tsv')
print('Total', len(marks))
errlen = len(marks[marks['mark']=='false'])
print('Accuracy',_format_fl(1-errlen/len(marks)))
print('Total Error', errlen, _format_fl(errlen/len(marks)))
#marks.loc[:,'answer'] = marks['answer'].apply(eval)

Total 2810
Accuracy 52.5%
Total Error 1335 47.5%


In [107]:
# load original prediction file with prediction details
preds = _load_predictions(preds_dir/f'{split}.tsv')
missing = len(marks)-len(preds)
print('Errors that have no output:', missing, _format_fl(missing/errlen),'of all errors')

Errors that have no output: 363 27.2% of all errors


In [108]:
# merge the marks with the ones that are predicted
combined = pd.merge(preds,marks,left_on='id',right_on='id',how='inner')
combined = pd.merge(combined,ref,left_on='id',right_on='id',how='inner')
combined = combined.sort_values(by='id',ascending=True, key=lambda y:y.map(lambda x:int(x.split('-')[1])))
combined.loc[:, 'answers'] = combined["answers"].apply(eval)
combined.loc[:, 'answer_coordinates'] = combined["answer_coordinates"].apply(eval)
combined.loc[:,'answer_coordinates'] = combined["answer_coordinates"].apply(lambda x:set(map(eval,x)))
print('Has prediction:', len(combined))

Has prediction: 2447


In [109]:
combined['answer_coordinates'][0]

{(1, 1), (2, 1), (3, 1), (6, 1), (8, 1)}

In [110]:
# Analyze Error Subset with predictions
combined = combined[combined['mark']=='false']
print('Errors that have final output:', len(combined), f', {_format_fl(len(combined)/errlen)} of all errors')

Errors that have final output: 972 , 72.8% of all errors


In [114]:
# Coordinate Prediction Errors
# Didn't produce answer coordinates
empt_coor = len(combined[combined['answers'].map(len)==0]["answers"])
nonempt = combined[combined['answers'].map(len)!=0]
assert len(nonempt)+empt_coor == len(combined)
assert empt_coor==len(combined[combined['answer_coordinates'].map(len)==0]["answer_coordinates"])
print('Errors wo ans coordinate predictions:',empt_coor,_format_fl(empt_coor/errlen))
noempt_coor_err= len(nonempt[nonempt['answer_coordinates']!= nonempt['answer_coordinates_gold']])
all_coor_err=len(combined[combined['answer_coordinates']!= combined['answer_coordinates_gold']])
print('Total Coordinate Errors', all_coor_err,_format_fl(all_coor_err/errlen))
print('Errors w coor pred but wrong', noempt_coor_err,_format_fl(noempt_coor_err/errlen))

Errors wo ans coordinate predictions: 21 1.6%
Total Coordinate Errors 972 72.8%
Errors w coor pred but wrong 951 71.2%


In [112]:
55/errlen

0.04119850187265917

In [116]:
# Aggregation Errors
print('Total Aggregation Errors', len(combined[combined['pred_aggr']!= combined['gold_aggr']]))
wrong_op = len(nonempt[nonempt['pred_aggr']!= nonempt['gold_aggr']])
print('Aggregation Errors among the ans w coor predictions', wrong_op,_format_fl(wrong_op/errlen))

Total Aggregation Errors 495
Aggregation Errors among the ans w coor predictions 487 36.5%


In [16]:
combined['answer_coordinates'][2]

{(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)}

In [17]:
cols = combined.columns.difference(['id'])
combined[cols]

Unnamed: 0,annotator,answer,answer_coordinates,answer_coordinates_gold,answers,gold_aggr,mark,position,pred_aggr,question_id
940,0,[u'confey'],"{(5, 0)}","{(-1, -1)}","[{'column_index': 0, 'row_index': 5, 'begin_to...",0,false,0,0,nt-2-0_0
1101,0,[u'14749.0'],"{(0, 2)}","{(-1, -1)}","[{'column_index': 2, 'row_index': 0, 'begin_to...",0,false,0,1,nt-3-0_0
2257,0,[u'kert toobal'],"{(1, 1)}","{(-1, -1)}","[{'column_index': 1, 'row_index': 1, 'begin_to...",0,false,0,0,nt-9-0_0
998,0,[u'iran'],"{(7, 1)}","{(-1, -1)}","[{'column_index': 1, 'row_index': 7, 'begin_to...",0,false,0,0,nt-24-0_0
1290,0,[u'1.0'],"{(9, 3)}","{(-1, -1)}","[{'column_index': 3, 'row_index': 9, 'begin_to...",0,false,0,3,nt-40-0_0
...,...,...,...,...,...,...,...,...,...,...
791,0,[u'12.0'],"{(0, 1), (13, 1), (6, 1), (3, 1), (17, 1), (2,...","{(-1, -1)}","[{'column_index': 1, 'row_index': 0, 'begin_to...",0,false,0,3,nt-14076-0_0
792,0,[u'w 39-12'],"{(1, 6)}","{(-1, -1)}","[{'column_index': 6, 'row_index': 1, 'begin_to...",0,false,0,1,nt-14082-0_0
795,0,"[u""b'in music""]","{(8, 5)}","{(-1, -1)}","[{'column_index': 5, 'row_index': 8, 'begin_to...",0,false,0,0,nt-14097-0_0
796,0,[u'4.0'],"{(0, 1), (11, 1), (4, 1), (1, 1)}","{(-1, -1)}","[{'column_index': 1, 'row_index': 0, 'begin_to...",0,false,0,3,nt-14107-0_0


In [18]:
d = combined.set_index('id')[['pred_aggr','answer_coordinates','answer','mark']].to_dict(orient='index')

## View Preprocessed TF Record

In [None]:
import tensorflow as tf
raw_ds = tf.data.TFRecordDataset('results/wtq/tf_examples/random-split-1-dev.tfrecord', compression_type="GZIP")

# Read and Dump interactions to json for flask

In [2]:
from tapas.protos import interaction_pb2

In [17]:
from google.protobuf.json_format import MessageToDict

In [100]:
import tensorflow as tf

In [82]:
def _to_df(dic):
    columns = [f"{v}-{k}" for d in dic['columns'] for k,v in d.items()]
    rows = [{columns[i]: ' '.join(list(d.values())) for i, d in enumerate(row['cells'])} for row in dic['rows']]
    df = pd.DataFrame(rows, columns=columns)
    return df

In [101]:
# view interaction protobuf
dics = {}
split = 'random-split-1-dev'
for value in tf.data.TFRecordDataset(f'results/wtq/interactions/{split}.tfrecord'):
    interaction = interaction_pb2.Interaction()
    interaction.ParseFromString(value.numpy())
    d = MessageToDict(interaction)
    d['table_id'] = d['table']['tableId']
    d['table'] = _to_df(d['table']).to_dict()
    dics[d['id'].rsplit('-',1)[0]] = d

In [176]:
d['table'].keys()

dict_keys(['columns', 'rows', 'tableId'])

In [180]:
import json

with open(f'results/wtq/for_flask/{split}.json','w') as f:
    json.dump(dics,f)

In [159]:
print(len(dics))

2810


In [102]:
_to_df(dics[0])

Unnamed: 0,Outcome-text,Year-text,Championship-text,Surface-text,Opponent-text,Score-text
0,(runner-up),(2002),(canada),(hard),(guillermo canas),"(4-6, 5-7)"
1,(winner),(2003),(montreal),(hard),(david nalbandian),"(6-1, 6-3)"
2,(winner),(2003),(cincinnati),(hard),(mardy fish),"(4-6, 7-6(7-3), 7-6(7-4))"
3,(winner),(2004),(miami),(hard),(guillermo coria),"(6-7(2-7), 6-3, 6-1, ret)"
4,(runner-up),(2004),(toronto),(hard),(roger federer),"(5-7, 3-6)"
5,(runner-up),(2005),(cincinnati),(hard),(roger federer),"(3-6, 5-7)"
6,(winner),(2006),(cincinnati),(hard),(juan carlos ferrero),"(6-3, 6-4)"
7,(runner-up),(2010),(indian wells),(hard),(ivan ljubicic),"(6-7(3-7), 6-7(5-7))"
8,(winner),(2010),(miami),(hard),(tomas berdych),"(7-5, 6-4)"


# Analyze Preprocessing Errors

In [15]:
import json

In [2]:
with open ('results/wtq/tf_examples/random-split-1-dev_errors.json') as f:
    errors = json.load(f)

In [3]:
s=set()
t = 0
for k,v in errors.items():
    if k=='Invalid answer':
        s|=set(v) 
    print(k,len(v))
    t+=len(v)
print('total',t)
print(len(s))

Couldn't find all answers 98
Invalid answer 250
Too many rows 15
total 363
250


In [4]:
# Invalid Answers Breakdown
with open('results/wtq/interactions/err_ids.json') as f:
    bkdns = json.load(f)

In [5]:
se= set()
split = 'random-split-1-dev.tsv'
for k,v in bkdns[split].items():
    v = list(map(lambda x:x.split('_')[0],v))
    bkdns[split][k] = v
    print(k)
    print(len(v))
    se|=set(v)
print(f'{split} total',len(se))

Cannot parse answer: [float_value: Contains digits, but Unable to convert value to float]
75
Cannot parse answer: [float_value: Cannot convert to multiple answers to single float]
7
Cannot parse answer: [float_value: Unable to convert value to float]
169
Cannot parse answer: [answer_coordinates: Assignment is ambiguous][float_value: Cannot convert to multiple answers to single float]
1
random-split-1-dev.tsv total 252


In [6]:
print(se.difference(s))
'nt-11953-0' in errors["Too many rows"] and 'nt-3899-0' in errors["Too many rows"]

{'nt-3899-0', 'nt-11953-0'}


True

## Isolate the float value conversion errors
See if we can still convert them to tf examples without float values

In [7]:
err_key = "Cannot parse answer: [float_value: Unable to convert value to float]"
maj_err = set(bkdns['random-split-1-dev.tsv'][err_key])

In [72]:
from tapas.protos import interaction_pb2
import tensorflow.compat.v1 as tf

In [78]:
# view interaction protobuf
todo = []
split = 'random-split-1-dev'
corr = []

for value in tf.python_io.tf_record_iterator(f'results/wtq/interactions/{split}.tfrecord'):
    interaction = interaction_pb2.Interaction()
    interaction.ParseFromString(value)
    if interaction.id in maj_err:
        todo.append(interaction)
    elif len(corr)<3:
        corr.append(interaction)
# eager execution
# for value in tf.data.TFRecordDataset([f'results/wtq/interactions/{split}.tfrecord']):
#     interaction = interaction_pb2.Interaction()
#     interaction.ParseFromString(value.numpy())
#     if interaction.id in maj_err:
#         todo.append(interaction)
#     elif len(corr)<3:
#         corr.append(interaction)
        
#     d = MessageToDict(interaction)
#     d['table_id'] = d['table']['tableId']
#     d['table'] = _to_df(d['table']).to_dict()
#     dics[d['id'].rsplit('-',1)[0]] = d

In [79]:
from tapas.utils import tf_example_utils

In [80]:
from tapas.utils import pruning_utils
token_selector = pruning_utils.HeuristicExactMatchTokenSelector(
    'tapas_wtq_wikisql_sqa_inter_masklm_large_reset/vocab.txt', 
    512, 
    pruning_utils.SelectionType.COLUMN,
    # Only relevant for SQA where questions come in sequence
    use_previous_answer=True,
    use_previous_questions=True,
  )

In [81]:
config = tf_example_utils.ClassifierConversionConfig(
    vocab_file='tapas_wtq_wikisql_sqa_inter_masklm_large_reset/vocab.txt', 
    max_seq_length=512, 
    max_column_id=512, 
    max_row_id=512, 
    strip_column_names=False, 
    cell_trim_length=-1, 
    add_aggregation_candidates=False, 
    expand_entity_descriptions=False, 
    use_entity_title=False, 
    entity_descriptions_sentence_limit=5, 
    use_document_title=False, 
    update_answer_coordinates=False, 
    drop_rows_to_fit=False, 
    use_context_title=False, 
    trim_question_ids=False, 
    label_sampling_rate={}
)


In [82]:
converter = tf_example_utils.ToClassifierTensorflowExample(config)

In [83]:
examples = []
for ex in todo:
    ex = token_selector.annotated_interaction(ex)
    if len(ex.questions[0].text)>0:
        print(ex.questions[0].text)
    examples.append(converter.convert(ex,0))

In [84]:
features = MessageToDict(examples[0].features)

In [19]:
features['feature'].keys()

dict_keys(['input_ids', 'column_ids', 'input_mask', 'aggregation_function_id', 'segment_ids', 'table_id', 'numeric_relations', 'numeric_values_scale', 'numeric_values', 'row_ids', 'prev_label_ids', 'question_id_ints', 'question_id', 'answer', 'column_ranks', 'label_ids', 'classification_class_index', 'table_id_hash', 'inv_column_ranks'])

In [17]:
type(examples[0].features.feature['input_ids'])

tensorflow.core.example.feature_pb2.Feature

In [67]:
from tapas.models import tapas_classifier_model
from tapas.models.bert import modeling
from tapas.utils import tasks, hparam_utils
from tapas.utils import text_utils


task = tasks.Task.WTQ
hparams = hparam_utils.get_hparams(task)

In [24]:
checkpoint_dir = 'results/wtq/model'
print(tf.train.latest_checkpoint(
    checkpoint_dir
))


results/wtq/model/model.ckpt-0


In [25]:
bert_config_file = 'tapas_wtq_wikisql_sqa_inter_masklm_large_reset/bert_config.json'
bert_config = modeling.BertConfig.from_json_file(bert_config_file)

In [26]:
tapas_config = tapas_classifier_model.TapasClassifierConfig(
    bert_config= bert_config, 
    init_checkpoint='tapas_wtq_wikisql_sqa_inter_masklm_large_reset/model.ckpt', 
    learning_rate=1.93581e-05, 
    num_train_steps=50000, 
    num_warmup_steps=6448, 
    use_tpu=False, 
    positive_weight=10.0, 
    num_aggregation_labels=4, 
    num_classification_labels=0, 
    aggregation_loss_importance=1.0, 
    use_answer_as_supervision=True, 
    answer_loss_importance=1.0, 
    use_normalized_answer_loss=False, 
    huber_loss_delta=0.121194, 
    temperature=0.0352513, 
    agg_temperature=1.0, 
    use_gumbel_for_cells=False, 
    use_gumbel_for_agg=False, 
    average_approximation_function=tapas_classifier_model.AverageApproximationFunction.RATIO, 
    cell_select_pref=0.207951, 
    answer_loss_cutoff=0.664694, 
    grad_clipping=10.0, 
    max_num_rows=64, 
    max_num_columns=32, 
    average_logits_per_cell=False, 
    select_one_column=True, 
    allow_empty_column_selection=False, 
    disabled_features=[], 
    init_cell_selection_weights_to_zero=True, 
    disable_position_embeddings=False, 
    reset_position_index_per_cell=False, 
    disable_per_token_loss=False, 
    span_prediction=tapas_classifier_model.SpanPredictionMode(
          hparams.get('span_prediction',
                      tapas_classifier_model.SpanPredictionMode.NONE)),
    proj_value_length=None,
    reset_output_cls=False,
    classification_label_weight=None,
    mask_examples_without_labels=False,
    table_pruning_config_file=None)

In [27]:
model_fn = tapas_classifier_model.model_fn_builder(tapas_config)

In [28]:
from tapas.run_task_main import _predict

Instructions for updating:
non-resource variables are not supported in the long term


In [29]:
file_patterns='results/wtq/tf_examples/random-split-1-dev.tfrecord'
data_format='tfrecord'
compression_type='GZIP'
is_training=False
max_seq_length=512
max_predictions_per_seq=20
add_aggregation_function_id=True
add_classification_labels=False
add_answer=True
include_id=False
add_candidate_answers=False,
max_num_candidates=0

In [85]:
feature_types = {
      "input_ids":
          tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "input_mask":
          tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "segment_ids":
          tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "column_ids":
          tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "row_ids":
          tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "prev_label_ids":
          tf.io.FixedLenFeature([max_seq_length],
                             tf.int64,
                             default_value=[0] * max_seq_length),
      "column_ranks":
          tf.io.FixedLenFeature(
              [max_seq_length],
              tf.int64,
              default_value=[0] * max_seq_length,
          ),
      "inv_column_ranks":
          tf.io.FixedLenFeature(
              [max_seq_length],
              tf.int64,
              default_value=[0] * max_seq_length,
          ),
      "numeric_relations":
          tf.io.FixedLenFeature([max_seq_length],
                             tf.int64,
                             default_value=[0] * max_seq_length),
  }
feature_types.update({
        "label_ids":
            tf.io.FixedLenFeature(
                [max_seq_length],
                tf.int64,
                default_value=[0] * max_seq_length,
            ),
    })
feature_types.update({
        "question_id_ints":
            tf.io.FixedLenFeature([text_utils.DEFAULT_INTS_LENGTH],
                               tf.int64,
                               default_value=[0] *
                               text_utils.DEFAULT_INTS_LENGTH),
    })

if add_aggregation_function_id:
      feature_types.update({
          "aggregation_function_id": tf.io.FixedLenFeature([1], tf.int64),
      })
if add_classification_labels:
      feature_types.update({
          "classification_class_index": tf.io.FixedLenFeature([1], tf.int64),
      })
    # Features for the weakly supervised setting.
if add_answer:
      feature_types.update({
          "numeric_values":
              tf.io.FixedLenFeature(
                  [max_seq_length],
                  tf.float32,
                  default_value=[0] * max_seq_length,
              ),
          "numeric_values_scale":
              tf.io.FixedLenFeature(
                  [max_seq_length],
                  tf.float32,
                  default_value=[0] * max_seq_length,
              ),
          "answer":
              tf.io.FixedLenFeature(
                  [1],
                  tf.float32,
                  default_value=[0],
              ),
      })
  

In [86]:
def build_parser_function(feature_types,
                          params):
  """Returns a parse function that can be used by read_dataset."""
  del params

  def parse_examples(serialized_examples):
    features = tf.io.parse_single_example(serialized_examples, feature_types)
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in list(features.keys()):
      t = features[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
        features[name] = t
    return features

  return parse_examples

def _parse_fn(serialized_example):
    features = dict(
        build_parser_function(feature_types,
                                      params)(serialized_example))
    return features

In [87]:
example = examples[0]
ex_str = example.SerializeToString()

In [88]:
def cast_int(fts):
    for name in list(fts.keys()):
        t = fts[name]
        if t.dtype == tf.int64:
            t = tf.cast(t, tf.int32)
            fts[name] = t
    return fts

In [89]:
serialized = []
for example in examples:
    serialized.append(example.SerializeToString())
#fts = tf.io.parse_single_example(ex_str, feature_types)
fts = cast_int(tf.io.parse_example(serialized, feature_types))


In [90]:
parsed = _parse_fn(ex_str)

In [91]:
def input_fn(features, params):
    parse_fn = _parse_fn
    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset

In [92]:
import functools

In [93]:
predict_input_fn = functools.partial(input_fn,features=fts)

In [151]:
# To get a single element from tapas's V1 dataset
#('tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter')
# tf.data.experimental.get_single_element(
#     dataset
# )


In [150]:
# predict_input_fn = some sort of input fn that iterates through the dataset

taken from 
Traceback:
```
    tapas/models/tapas_classifier_model.py:1170
        tapas/datasets/table_dataset.py:190
            tapas/datasets/dataset.py:78
```        

In [34]:
ds = tf.data.Dataset.from_tensor_slices(fts)

In [71]:
run_config = tf.compat.v1.estimator.tpu.RunConfig(
      cluster=None,
      master=None,
      model_dir='results/wtq/model',
      tf_random_seed=None,
      save_checkpoints_steps=1000,
      keep_checkpoint_max=5,
      keep_checkpoint_every_n_hours=4.0,
      tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
          iterations_per_loop=1000,
          num_shards=8,
          per_host_input_for_training=3))

In [58]:
estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
      params={'gradient_accumulation_steps': 1},
      use_tpu=False,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=512 // 1,
      eval_batch_size=None,
      predict_batch_size=32)

INFO:tensorflow:Using config: {'_model_dir': 'results/wtq/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 4.0, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_inf

In [59]:
checkpoint = estimator.latest_checkpoint()

In [65]:
def test_input_fn(params):
    return tf.data.Dataset.from_tensor_slices(fts)

In [66]:
next(estimator.predict(input_fn=test_input_fn))

INFO:tensorflow:prediction_loop marked as finished


ValueError: Tensor("optimizations:0", shape=(3,), dtype=string) must be from the same graph as Tensor("TensorSliceDataset_1:0", shape=(), dtype=variant).

In [75]:
%debug

> [0;32m/opt/conda/envs/tapas/lib/python3.6/inspect.py[0m(1132)[0;36mgetfullargspec[0;34m()[0m
[0;32m   1130 [0;31m        [0;31m# else. So to be fully backwards compatible, we catch all[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1131 [0;31m        [0;31m# possible exceptions here, and reraise a TypeError.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1132 [0;31m        [0;32mraise[0m [0mTypeError[0m[0;34m([0m[0;34m'unsupported callable'[0m[0;34m)[0m [0;32mfrom[0m [0mex[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1133 [0;31m[0;34m[0m[0m
[0m[0;32m   1134 [0;31m    [0margs[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  U


*** NameError: name 'U' is not defined


ipdb>  u


> [0;32m/opt/conda/envs/tapas/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py[0m(257)[0;36mgetfullargspec[0;34m()[0m
[0;32m    255 [0;31m    [0;32mif[0m [0md[0m[0;34m.[0m[0mdecorator_argspec[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    256 [0;31m      [0;32mreturn[0m [0m_convert_maybe_argspec_to_fullargspec[0m[0;34m([0m[0md[0m[0;34m.[0m[0mdecorator_argspec[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 257 [0;31m  [0;32mreturn[0m [0m_getfullargspec[0m[0;34m([0m[0mtarget[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    258 [0;31m[0;34m[0m[0m
[0m[0;32m    259 [0;31m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tapas/lib/python3.6/site-packages/tensorflow/python/util/function_utils.py[0m(57)[0;36mfn_args[0;34m()[0m
[0;32m     55 [0;31m    [0;32mif[0m [0m_is_callable_object[0m[0;34m([0m[0mfn[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m      [0mfn[0m [0;34m=[0m [0mfn[0m[0;34m.[0m[0m__call__[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 57 [0;31m    [0margs[0m [0;34m=[0m [0mtf_inspect[0m[0;34m.[0m[0mgetfullargspec[0m[0;34m([0m[0mfn[0m[0;34m)[0m[0;34m.[0m[0margs[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     58 [0;31m    [0;32mif[0m [0m_is_bound_method[0m[0;34m([0m[0mfn[0m[0;34m)[0m [0;32mand[0m [0margs[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     59 [0;31m      [0;31m# If it's a bound method, it may or may not have a self/cls first[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tapas/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py[0m(2998)[0;36m_call_input_fn[0;34m()[0m
[0;32m   2996 [0;31m      [0mValueError[0m[0;34m:[0m [0;32mif[0m [0minput_fn[0m [0mtakes[0m [0minvalid[0m [0marguments[0m [0;32mor[0m [0mdoes[0m [0;32mnot[0m [0mhave[0m[0;31m [0m[0;31m`[0m[0mparams[0m[0;31m`[0m[0;34m.[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   2997 [0;31m    """
[0m[0;32m-> 2998 [0;31m    [0minput_fn_args[0m [0;34m=[0m [0mfunction_utils[0m[0;34m.[0m[0mfn_args[0m[0;34m([0m[0minput_fn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   2999 [0;31m    [0mconfig[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mconfig[0m  [0;31m# a deep copy.[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3000 [0;31m    [0mkwargs[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  type(input_fn)


<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


ipdb>  Q


*** NameError: name 'Q' is not defined


ipdb>  q


In [59]:
for k,v in ds[0].items():
    print(k,v.shape)

aggregation_function_id (1,)
answer (1,)
column_ids (512,)
column_ranks (512,)
input_ids (512,)
input_mask (512,)
inv_column_ranks (512,)
label_ids (512,)
numeric_relations (512,)
numeric_values (512,)
numeric_values_scale (512,)
prev_label_ids (512,)
question_id_ints (64,)
row_ids (512,)
segment_ids (512,)


In [58]:
expd = {k: v[None,:] for k,v in ds[0].items()}

# Modeling Debug Breakpoints

    run_task_main.py: 482
                    : 508
                    : 557
                    : 

## Preprocessing Breakpoints

major preprocessing error: 244/363 are bc "Unable to convert value to float"

```
Num Type         Disp Enb   Where
1   breakpoint   keep yes   at utils/interaction_utils_parser.py:254
2   breakpoint   keep yes   at utils/interaction_utils_parser.py:249
3   breakpoint   keep yes   at utils/interaction_utils_parser.py:243
4   breakpoint   keep yes   at run_task_main.py:279
        breakpoint already hit 1 time
5   breakpoint   keep no    at utils/tf_example_utils.py:231
        breakpoint already hit 1 time
6   breakpoint   keep yes   at utils/tf_example_utils.py:1067
```

* invalid answer -- 
    can't convert float value or no answer coordinate
    sqa_utils:\_parse_questions --> sets interaction.answer.is_valid=False
    
        |_interaction_utils_parser:parse_question
        
            |_ ..._parse_question :243,249,254
            
```
if not question.answer.answer_coordinates and not question.answer.HasField(
"float_value"):
    raise ValueError("Cannot parse answer: {}".format(error_message))
```
        
* Couldn't find all answers -- 

    b:tf_example_utils.py:226, 231
    
        run_task_main:279
    
            tf_example_utils.ToClassifierTensorflowExample.convert
                     :1227 -> 340 -> 242 -> 214
                 
* Too many rows
  tf_example_utils......convert:
  
      1177/1178 -> \_get_num_rows:1067