In [1]:
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

In [3]:
import pandas as pd
import torch

In [3]:
from tqdm import tqdm

In [4]:
import csv

In [5]:
import os

In [6]:
import gc

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq", drop_rows_to_fit=True)

In [11]:
model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")
#model.to(device)

In [12]:
paths = {}
paths['train'] = "../WikiTableQuestions/data/random-split-1-train.tsv"
#train_path = "../WikiTableQuestions/data/training.tsv"
paths['dev'] = "../WikiTableQuestions/data/random-split-1-dev.tsv"
paths['test'] = "../WikiTableQuestions/data/pristine-unseen-tables.tsv"
table_csv_path = "../WikiTableQuestions/"

In [13]:
def _load_table(path):
    with open(table_csv_path+path) as f:
        reader = csv.reader(f, delimiter=',', quotechar='"',quoting=csv.QUOTE_ALL,escapechar='\\')
        headers = next(reader)
        table = pd.DataFrame(list(reader),columns=headers).astype(str)
    return table


In [14]:
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        table = _load_table(os.path.join(table_csv_path,item.context))
        encoding = self.tokenizer(table=table,
                                  queries=item.utterance,
                                  truncation=True,
                                  padding="max_length",
                                  return_tensors="pt"
        )
        # remove the batch dimension which the tokenizer adds by default
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding['id'] = idx
        # add the float_answer which is also required (weak supervision for aggregation case)
        return encoding
    def __len__(self):
       return len(self.data)

In [15]:
def get_dataloader(path, batch_size = 4):
    with open(path) as f:
        reader = csv.reader(f,delimiter='\t',quotechar='"',quoting=csv.QUOTE_NONE,escapechar='\\')
        heads = next(reader)
        data = pd.DataFrame(list(reader),columns=heads)
    dataset = TableDataset(data, tokenizer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    return dataloader

In [16]:
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}

In [17]:
from tapas.utils import text_utils
import math

In [18]:
def _collect_cells_from_table(cell_coos, table):
    cell_values = []
    for cell in cell_coos:
        value = str(table.iloc[cell[0],cell[1]])
        cell_values.append(value)
    return cell_values

In [19]:
def _safe_convert_to_float(value):
    float_value = text_utils.convert_to_float(value)
    if math.isnan(float_value):
        raise ValueError('Value is NaN %s' % value)
    return float_value

def _parse_value(value):
  """Parses a cell value to a number or lowercased string."""
  try:
    return _safe_convert_to_float(value)
  except ValueError:
    try:
      return value.lower()
    except ValueError:
      return value


In [20]:
def execute(aggregation_type, cell_coos,
            table):
  """Executes predicted structure against a table to produce the denotation."""
  values = _collect_cells_from_table(cell_coos, table)
  values_parsed = [_parse_value(value) for value in values]
  values_parsed = tuple(values_parsed)
  if aggregation_type == "NONE":
    # In this case there is no aggregation
    return values_parsed, values
  else:  # Should perform aggregation.
    if not values and (aggregation_type == "AVERAGE" or
                       aggregation_type == "SUM"):
      # Summing or averaging an empty set results in an empty set.
      # NB: SQL returns null for sum over an empty set.
      return tuple(), values
    if aggregation_type == "COUNT":
      denotation = len(values)
    else:
      # In this case all values must be numbers (to be summed or averaged).
      try:
        values_num = [text_utils.convert_to_float(value) for value in values]
      except ValueError:
        return values_parsed, values
      if aggregation_type == "SUM":
        denotation = sum(values_num)
      elif aggregation_type == "AVERAGE":
        denotation = sum(values_num) / len(values_num)
      else:
        raise ValueError('Unknwon aggregation type: %s' % aggregation_type)
    return tuple([float(denotation)]), values

In [21]:
dl = get_dataloader(paths['dev'])
ds = dl.dataset

In [22]:
table = _load_table(os.path.join(table_csv_path,ds.data.iloc[0].context))

In [21]:
idl = iter(dl)
b = next(idl)

In [22]:
b=next(idl)

In [23]:
ids = b.pop('id')

In [24]:
ids

tensor([4, 5, 6, 7])

In [25]:
model.to(device)
b = {k:v.to(device) for k,v in b.items()}

In [26]:
outputs = model(**b)

In [20]:
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    {k:v.to('cpu') for k,v in b.items()},
    outputs.logits.cpu().detach(),
    outputs.logits_aggregation.cpu().detach()
)

NameError: name 'b' is not defined

In [28]:
coors = predicted_answer_coordinates
aggs = predicted_aggregation_indices

In [29]:
for i,coor in enumerate(coors):
    table = _load_table(os.path.join(table_csv_path, ds.data.iloc[int(ids[i])].context))
    print(execute(id2aggregation[aggs[i]],coor,table))

(('tatiana volosozhar / maxim trankov',), ['Tatiana Volosozhar / Maxim Trankov'])
(('new delhi, india',), ['New Delhi, India'])
(('sweden',), ['Sweden'])
((1694.0,), ['1694'])


In [80]:
device = 'cpu'
model.to(device)
out = {}
for split in ['dev','test']:
    dl = get_dataloader(paths[split], batch_size=32)
    ds = dl.dataset
    out[split] = []
    for b_num,batch in enumerate(tqdm(dl)):
        ids = batch.pop('id')
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.no_grad():
            try:
                outputs = model(**batch)
            except IndexError:
                continue
            coors, aggs = tokenizer.convert_logits_to_predictions(
                {k:v.to('cpu') for k,v in batch.items()},
                outputs.logits.cpu().detach(),
                outputs.logits_aggregation.cpu().detach()
            )
        for i, coor in enumerate(coors):
            table = _load_table(os.path.join(table_csv_path, ds.data.iloc[int(ids[i])].context))
            denos,res = execute(id2aggregation[aggs[i]],coor,table)
            out[split].append((denos,res))

    

 24%|████████████▎                                       | 21/89 [02:11<07:05,  6.25s/it]


IndexError: index out of range in self

In [29]:
%debug

> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/functional.py[0m(1852)[0;36membedding[0;34m()[0m
[0;32m   1850 [0;31m        [0;31m# remove once script supports set_grad_enabled[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1851 [0;31m        [0m_no_grad_embedding_renorm_[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mmax_norm[0m[0;34m,[0m [0mnorm_type[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1852 [0;31m    [0;32mreturn[0m [0mtorch[0m[0;34m.[0m[0membedding[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mpadding_idx[0m[0;34m,[0m [0mscale_grad_by_freq[0m[0;34m,[0m [0msparse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1853 [0;31m[0;34m[0m[0m
[0m[0;32m   1854 [0;31m[0;34m[0m[0m
[0m


ipdb>  type(self.embedding)


*** NameError: name 'self' is not defined


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/sparse.py[0m(124)[0;36mforward[0;34m()[0m
[0;32m    122 [0;31m[0;34m[0m[0m
[0m[0;32m    123 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0minput[0m[0;34m:[0m [0mTensor[0m[0;34m)[0m [0;34m->[0m [0mTensor[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 124 [0;31m        return F.embedding(
[0m[0;32m    125 [0;31m            [0minput[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mweight[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mpadding_idx[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mmax_norm[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    126 [0;31m            self.norm_type, self.scale_grad_by_freq, self.sparse)
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(327)[0;36mforward[0;34m()[0m
[0;32m    325 [0;31m        [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mnumber_of_token_type_embeddings[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    326 [0;31m            [0mname[0m [0;34m=[0m [0;34mf"token_type_embeddings_{i}"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 327 [0;31m            [0membeddings[0m [0;34m+=[0m [0mgetattr[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m)[0m[0;34m([0m[0mtoken_type_ids[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0;34m:[0m[0;34m,[0m [0mi[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    328 [0;31m[0;34m[0m[0m
[0m[0;32m    329 [0;31m        [0membeddings[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mLayerNorm[0m[0;34m([0m[0membeddings[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(906)[0;36mforward[0;34m()[0m
[0;32m    904 [0;31m        [0mhead_mask[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mget_head_mask[0m[0;34m([0m[0mhead_mask[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0mnum_hidden_layers[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    905 [0;31m[0;34m[0m[0m
[0m[0;32m--> 906 [0;31m        embedding_output = self.embeddings(
[0m[0;32m    907 [0;31m            [0minput_ids[0m[0;34m=[0m[0minput_ids[0m[0;34m,[0m [0mposition_ids[0m[0;34m=[0m[0mposition_ids[0m[0;34m,[0m [0mtoken_type_ids[0m[0;34m=[0m[0mtoken_type_ids[0m[0;34m,[0m [0minputs_embeds[0m[0;34m=[0m[0minputs_embeds[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    908 [0;31m        )
[0m


ipdb>  type(self.embeddings)


<class 'transformers.models.tapas.modeling_tapas.TapasEmbeddings'>


ipdb>  self.embeddings.num_embeddings


*** torch.nn.modules.module.ModuleAttributeError: 'TapasEmbeddings' object has no attribute 'num_embeddings'


ipdb>  self.embeddings.print_base()


*** torch.nn.modules.module.ModuleAttributeError: 'TapasEmbeddings' object has no attribute 'print_base'


ipdb>  type(self.embeddings.word_embeddings)


<class 'torch.nn.modules.sparse.Embedding'>


ipdb>  self.embeddings.word_embeddings.num_embeddings


30522


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(1147)[0;36mforward[0;34m()[0m
[0;32m   1145 [0;31m        [0mreturn_dict[0m [0;34m=[0m [0mreturn_dict[0m [0;32mif[0m [0mreturn_dict[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m [0;32melse[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0muse_return_dict[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1146 [0;31m[0;34m[0m[0m
[0m[0;32m-> 1147 [0;31m        outputs = self.tapas(
[0m[0;32m   1148 [0;31m            [0minput_ids[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1149 [0;31m            [0mattention_mask[0m[0;34m=[0m[0mattention_mask[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  torch.max(input_ids)


tensor(30521)


ipdb>  self.tapas.embeddings


TapasEmbeddings(
  (word_embeddings): Embedding(30522, 1024, padding_idx=0)
  (position_embeddings): Embedding(1024, 1024)
  (token_type_embeddings_0): Embedding(3, 1024)
  (token_type_embeddings_1): Embedding(256, 1024)
  (token_type_embeddings_2): Embedding(256, 1024)
  (token_type_embeddings_3): Embedding(2, 1024)
  (token_type_embeddings_4): Embedding(256, 1024)
  (token_type_embeddings_5): Embedding(256, 1024)
  (token_type_embeddings_6): Embedding(10, 1024)
  (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/tmp/ipykernel_3784/2165780.py[0m(15)[0;36m<module>[0;34m()[0m
[0;32m     13 [0;31m        [0mmodel[0m[0;34m.[0m[0meval[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m        [0;32mwith[0m [0mtorch[0m[0;34m.[0m[0mno_grad[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m            [0moutputs[0m [0;34m=[0m [0mmodel[0m[0;34m([0m[0;34m**[0m[0mbatch[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m            coors, aggs = tokenizer.convert_logits_to_predictions(
[0m[0;32m     17 [0;31m                [0;34m{[0m[0mk[0m[0;34m:[0m[0mv[0m[0;34m.[0m[0mto[0m[0;34m([0m[0;34m'cpu'[0m[0;34m)[0m [0;32mfor[0m [0mk[0m[0;34m,[0m[0mv[0m [0;32min[0m [0mbatch[0m[0;34m.[0m[0mitems[0m[0;34m([0m[0;34m)[0m[0;34m}[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  ids


tensor([672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685,
        686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699,
        700, 701, 702, 703])


ipdb>  ds


<__main__.TableDataset object at 0x7f39bf1fb880>


ipdb>  ds[672]


{'input_ids': tensor([  101,  1996,  2060,  2308,  1005,  1055,  3453,  2007,  1996,  2168,
         3926,  2051,  2004,  8183,  6643, 12417,  1999,  2262,   102,  3179,
         2095,  2273,  1005,  1055,  3453,  2051,  1006,  1049,  1024,  1055,
         2308,  1005,  1055,  3453,  2051,  1006,  1049,  1024,  1055,  3083,
         2901, 20481,  2072,  2702,  5736,  1006,  4700,  1024,  4720, 12684,
         2204,  2121,  3511,  1006,  5179,  1024,  5641,  3416,  2889,  2726,
         6583, 11475,  1006,  9092,  4700,  1024,  2340, 15585,  5416, 12069,
        16107,  1006,  5187,  1024,  2385,  3822,  2826,  8945,  4710, 17712,
         7856,  2100,  4700,  1024,  5840,  1045, 20922, 11265, 27390,  2050,
         5187,  1024,  2539,  4343,  2857,  5639, 21101,  2229,  1006, 16351,
         2099,  4805,  1024,  2340,  1045, 20922, 11265, 27390,  2050,  5187,
         1024,  5890,  4833,  2807,  5639, 21101,  2229,  1006, 16351,  2099,
         4700,  1024,  4002, 21025,  4674,  6382, 

ipdb>  model(**ds[672])


*** TypeError: forward() got an unexpected keyword argument 'id'


ipdb>  e = ds[672]
ipdb>  e.pop('id')


672


ipdb>  model(**e)


*** ValueError: Wrong shape for input_ids (shape torch.Size([512])) or attention_mask (shape torch.Size([512]))


ipdb>  batch.shape


*** AttributeError: 'dict' object has no attribute 'shape'


ipdb>  batch['input_ids'].shape


torch.Size([32, 512])


ipdb>  e = {k:v.unsqueeze(0) for k,v in e.items()}
ipdb>  e['input_ids'].shape


torch.Size([1, 512])


ipdb>  model(**e)


TableQuestionAnsweringOutput(loss=None, logits=tensor([[-1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04,
         -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04,
         -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04,
         -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0012e+04, -1.0031e+04,
         -1.0032e+04, -1.0034e+04, -1.0034e+04, -1.0034e+04, -1.0034e+04,
         -1.0033e+04, -1.0033e+04, -1.0033e+04, -1.0033e+04, -1.0033e+04,
         -1.4503e+02, -1.4503e+02, -1.4503e+02, -1.4503e+02, -1.0030e+04,
         -1.0030e+04, -1.0030e+04, -1.0030e+04, -1.0030e+04, -1.0024e+04,
         -1.0025e+04, -1.0025e+04, -1.0025e+04, -1.0025e+04, -1.0025e+04,
         -1.0025e+04, -1.0023e+04, -1.0023e+04, -1.0023e+04, -1.6582e+01,
         -1.6582e+01, -1.6582e+01, -1.6582e+01, -1.6582e+01, -1.0021e+04,
         -1.0021e+04, -1.0021e+04, -1.0022e+04, -1.0021e+04, -1.0023e+04,
         -1.0023e+04, -1.0023e+04, -1.0023e+04, -1.0023e+04, -1.0

ipdb>  e = {k:v.unsqueeze(0) for k,v in ds[673].items() if k!='id'}
ipdb>  model(**e)


TableQuestionAnsweringOutput(loss=None, logits=tensor([[-10025.8682, -10025.8682, -10025.8682, -10025.8682, -10025.8682,
         -10025.8682, -10024.9902,   -120.1096, -10028.1289, -10028.4385,
         -10032.3018, -10029.1934, -10001.0068,     40.5817, -10005.4443,
          -9999.3262,  -9999.6553, -10004.0439, -10026.6182,    -65.3360,
         -10027.0996, -10026.2617, -10026.5078, -10022.0879, -10020.7344,
            -63.4198, -10019.3984, -10021.7285, -10026.9883, -10020.3584,
         -10023.4551,    -25.8319, -10020.5820, -10021.1797, -10023.9072,
         -10020.4678, -10030.7500,    -53.3926, -10028.3330, -10026.6260,
         -10029.3369, -10026.8203, -10033.2549,    -95.9602,    -95.9602,
         -10031.9531, -10031.6836, -10034.7754, -10031.3086, -10032.2021,
            -62.5598, -10030.8506, -10030.9717, -10035.0566, -10029.4746,
         -10032.9395,   -112.8097, -10033.3906, -10031.9111, -10037.2568,
         -10031.5771, -10034.3984,    -95.6047, -10033.2139, -100

ipdb>  e = {k:v.unsqueeze(0) for k,v in ds[674].items() if k!='id'}
ipdb>  model(**e)


TableQuestionAnsweringOutput(loss=None, logits=tensor([[-10049.5928, -10049.5928, -10049.5928, -10049.5928, -10049.5928,
         -10049.5928, -10049.5928, -10049.5928, -10049.5928, -10049.5928,
         -10049.5928, -10049.5928, -10107.3887, -10107.4600, -10097.0869,
         -10169.3242,   -331.1098, -10062.0576, -10080.1152, -10073.7158,
         -10118.5996, -10080.6934, -10118.9717, -10076.7773,   -189.2221,
         -10061.2461, -10047.4688, -10037.3457, -10037.3457, -10037.3457,
         -10021.3682, -10006.5547, -10023.1514, -10019.2578,    280.4255,
         -10011.6885, -10011.6885, -10011.6885, -10007.5850, -10006.8662,
         -10070.0059, -10057.6396, -10075.5498, -10082.1406,   -162.9949,
         -10049.6045, -10048.4355, -10042.9238, -10073.2051, -10058.1211,
         -10069.7002, -10066.2783,   -134.9747, -10044.8047, -10056.9482,
         -10043.9834, -10088.7051, -10086.7275, -10095.8252, -10092.2129,
           -200.9371, -10070.9414, -10093.8027, -10050.2051, -101

ipdb>  e = {k:v.unsqueeze(0) for k,v in ds[675].items() if k!='id'}
ipdb>  model(**e)


TableQuestionAnsweringOutput(loss=None, logits=tensor([[-10010.7520, -10010.7520, -10010.7520, -10010.7520, -10010.7520,
         -10010.7520, -10010.7520, -10010.7520, -10010.7520, -10010.7520,
         -10010.7520, -10010.7520, -10010.7520, -10010.7520, -10010.7520,
         -10010.7520,   -115.3408, -10116.3398, -10106.7861, -10071.4707,
         -10080.2998, -10080.2998,   -148.1307,   -148.1307,   -148.1307,
         -10139.2559, -10139.2559, -10071.0430, -10071.0430, -10094.5801,
         -10094.5801, -10136.7051, -10136.7051, -10136.7051,    -35.1360,
            -35.1360, -10039.7480, -10057.6074, -10057.6074, -10042.0381,
         -10042.0381, -10044.1768, -10044.1768, -10044.1768,    -46.4297,
            -46.4297,    -46.4297, -10087.3633, -10087.3633, -10087.3633,
         -10075.0518, -10075.0518, -10049.7012, -10049.7012, -10055.7383,
         -10055.7383, -10055.7383,    -38.0143,    -38.0143, -10055.0879,
         -10055.0879, -10055.0879, -10055.4980, -10055.4980, -100

ipdb>  q


In [None]:
t = [672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685,
        686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699,
        700, 701, 702, 703]
error_idxs = []
outs = []
dl = get_dataloader(paths['dev'], batch_size=32)
ds = dl.dataset
for idx in range(len(ds)):
    e = {k:v.unsqueeze(0) for k,v in ds[idx].items() if k!='id'}
    try:
        o = model(**e)
        outs.append(o)
    except IndexError:
        print(idx)
        error_idxs.append(idx)

In [81]:
err = {k:v.unsqueeze(0) for k,v in ds[683].items() if k!='id'}

In [53]:
err['input_ids'].shape

torch.Size([1, 512])

In [23]:
item = ds.data.iloc[683]
table = _load_table(table_csv_path+item.context)

In [24]:
table.columns

Index(['Year', 'City', 'State, province, dept., etc.', 'Country', 'Notes'], dtype='object')

In [25]:
table

Unnamed: 0,Year,City,"State, province, dept., etc.",Country,Notes
0,4000 BC,Puerto Hormiga Culture,Cartagena,Colombia,
1,3710 BC,Aspero,Norte Chico,Peru,
2,2627 BC,Caral,Norte Chico,Peru,
3,700 BC,Ticul,Yucatán,Mexico,
4,500 BC,Cholula,Puebla,Mexico,
...,...,...,...,...,...
656,1960,Brasília,Distrito Federal,Brazil,Created in 1960 as the national capital.
657,1970,Belmopan,Cayo,Belize,
658,1970,Linden,Upper Demerara-Berbice,Guyana,City formed by combining the towns of Christia...
659,1970,Cancún,Quintana Roo,Mexico,


In [41]:
table = table[~table["Year"].str.contains('BC')][["Year"]]
table.reset_index(drop=True, inplace=True)
table
#table = table.drop(columns=['Year'])

Unnamed: 0,Year
0,200
1,524
2,500
3,600
4,1000
...,...
650,1960
651,1970
652,1970
653,1970


In [40]:
for i,row in table.iterrows():
    row = row.to_frame().transpose()
    row.reset_index(drop=True,inplace=True)
    inp = tokenizer(table=row,
          queries=item.utterance,
          truncation=True,
          padding="max_length",
          return_tensors="pt"
        )
    out = model(**inp)
    if torch.max(inp['token_type_ids'])>100:
        print(torch.max(inp['token_type_ids']))

In [34]:
len(table)

655

In [42]:
inp = tokenizer(table=table,
          queries=item.utterance,
          truncation=True,
          padding="max_length",
          return_tensors="pt"
        )

In [79]:
print(len(ds))

2831


In [44]:
torch.unique(inp['token_type_ids'][:,:,4])

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169])

In [45]:
len(inp['token_type_ids'][:,:,4])

1

In [39]:
model(**inp)

IndexError: index out of range in self

In [44]:
for k,v in err.items():
    print(k)
    print(torch.max(v))
    print(torch.min(v))

input_ids
tensor(29408)
tensor(0)
attention_mask
tensor(1)
tensor(0)
token_type_ids
tensor(341)
tensor(0)


In [82]:
model.train()
model(**err)

IndexError: index out of range in self

In [62]:
%debug

> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/functional.py[0m(1852)[0;36membedding[0;34m()[0m
[0;32m   1850 [0;31m        [0;31m# remove once script supports set_grad_enabled[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1851 [0;31m        [0m_no_grad_embedding_renorm_[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mmax_norm[0m[0;34m,[0m [0mnorm_type[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1852 [0;31m    [0;32mreturn[0m [0mtorch[0m[0;34m.[0m[0membedding[0m[0;34m([0m[0mweight[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mpadding_idx[0m[0;34m,[0m [0mscale_grad_by_freq[0m[0;34m,[0m [0msparse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1853 [0;31m[0;34m[0m[0m
[0m[0;32m   1854 [0;31m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/sparse.py[0m(124)[0;36mforward[0;34m()[0m
[0;32m    122 [0;31m[0;34m[0m[0m
[0m[0;32m    123 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0minput[0m[0;34m:[0m [0mTensor[0m[0;34m)[0m [0;34m->[0m [0mTensor[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 124 [0;31m        return F.embedding(
[0m[0;32m    125 [0;31m            [0minput[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mweight[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mpadding_idx[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mmax_norm[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    126 [0;31m            self.norm_type, self.scale_grad_by_freq, self.sparse)
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(327)[0;36mforward[0;34m()[0m
[0;32m    325 [0;31m        [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mnumber_of_token_type_embeddings[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    326 [0;31m            [0mname[0m [0;34m=[0m [0;34mf"token_type_embeddings_{i}"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 327 [0;31m            [0membeddings[0m [0;34m+=[0m [0mgetattr[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m)[0m[0;34m([0m[0mtoken_type_ids[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0;34m:[0m[0;34m,[0m [0mi[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    328 [0;31m[0;34m[0m[0m
[0m[0;32m    329 [0;31m        [0membeddings[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mLayerNorm[0m[0;34m([0m[0membeddings[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(906)[0;36mforward[0;34m()[0m
[0;32m    904 [0;31m        [0mhead_mask[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mget_head_mask[0m[0;34m([0m[0mhead_mask[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0mnum_hidden_layers[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    905 [0;31m[0;34m[0m[0m
[0m[0;32m--> 906 [0;31m        embedding_output = self.embeddings(
[0m[0;32m    907 [0;31m            [0minput_ids[0m[0;34m=[0m[0minput_ids[0m[0;34m,[0m [0mposition_ids[0m[0;34m=[0m[0mposition_ids[0m[0;34m,[0m [0mtoken_type_ids[0m[0;34m=[0m[0mtoken_type_ids[0m[0;34m,[0m [0minputs_embeds[0m[0;34m=[0m[0minputs_embeds[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    908 [0;31m        )
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(1147)[0;36mforward[0;34m()[0m
[0;32m   1145 [0;31m        [0mreturn_dict[0m [0;34m=[0m [0mreturn_dict[0m [0;32mif[0m [0mreturn_dict[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m [0;32melse[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0muse_return_dict[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1146 [0;31m[0;34m[0m[0m
[0m[0;32m-> 1147 [0;31m        outputs = self.tapas(
[0m[0;32m   1148 [0;31m            [0minput_ids[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1149 [0;31m            [0mattention_mask[0m[0;34m=[0m[0mattention_mask[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  l


[1;32m   1142 [0m            [0;34m>>[0m[0;34m>[0m [0mlogits[0m [0;34m=[0m [0moutputs[0m[0;34m.[0m[0mlogits[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1143 [0m            [0;34m>>[0m[0;34m>[0m [0mlogits_aggregation[0m [0;34m=[0m [0moutputs[0m[0;34m.[0m[0mlogits_aggregation[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1144 [0m        """
[1;32m   1145 [0m        [0mreturn_dict[0m [0;34m=[0m [0mreturn_dict[0m [0;32mif[0m [0mreturn_dict[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m [0;32melse[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0muse_return_dict[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1146 [0m[0;34m[0m[0m
[0;32m-> 1147 [0;31m        outputs = self.tapas(
[0m[1;32m   1148 [0m            [0minput_ids[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1149 [0m            [0mattention_mask[0m[0;34m=[0m[0mattention_mask[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1150 [0m            [0mtoken_type_ids[0m[0;34m

ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(906)[0;36mforward[0;34m()[0m
[0;32m    904 [0;31m        [0mhead_mask[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mget_head_mask[0m[0;34m([0m[0mhead_mask[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0mnum_hidden_layers[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    905 [0;31m[0;34m[0m[0m
[0m[0;32m--> 906 [0;31m        embedding_output = self.embeddings(
[0m[0;32m    907 [0;31m            [0minput_ids[0m[0;34m=[0m[0minput_ids[0m[0;34m,[0m [0mposition_ids[0m[0;34m=[0m[0mposition_ids[0m[0;34m,[0m [0mtoken_type_ids[0m[0;34m=[0m[0mtoken_type_ids[0m[0;34m,[0m [0minputs_embeds[0m[0;34m=[0m[0minputs_embeds[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    908 [0;31m        )
[0m


ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py[0m(327)[0;36mforward[0;34m()[0m
[0;32m    325 [0;31m        [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mnumber_of_token_type_embeddings[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    326 [0;31m            [0mname[0m [0;34m=[0m [0;34mf"token_type_embeddings_{i}"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 327 [0;31m            [0membeddings[0m [0;34m+=[0m [0mgetattr[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m)[0m[0;34m([0m[0mtoken_type_ids[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0;34m:[0m[0;34m,[0m [0mi[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    328 [0;31m[0;34m[0m[0m
[0m[0;32m    329 [0;31m        [0membeddings[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mLayerNorm[0m[0;34m([0m[0membeddings[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  getattr(self,token_type_embeddings_4)


*** NameError: name 'token_type_embeddings_4' is not defined


ipdb>  getattr(self,'token_type_embeddings_4')


Embedding(256, 1024)


ipdb>  self.token_type_embeddings_4


Embedding(256, 1024)


ipdb>  torch.max(token_type_id[:,:,i]


*** SyntaxError: unexpected EOF while parsing


ipdb>  torch.max(token_type_id[:,:,i])


*** NameError: name 'token_type_id' is not defined


ipdb>  torch.max(token_type_ids[:,:,i])


tensor(341)


ipdb>  token_type_ids[:,:,i].shape


torch.Size([1, 512])


ipdb>  self.token_type_embeddings_3


Embedding(2, 1024)


ipdb>  self.token_type_embeddings_2


Embedding(256, 1024)


ipdb>  self.token_type_embeddings_4(token_type_ids[:,:,i])


*** IndexError: index out of range in self


ipdb>  torch.max(self.token_type_embeddings_4)


*** TypeError: max(): argument 'input' (position 1) must be Tensor, not Embedding


ipdb>  torch.max(self.token_type_embeddings_4.weight)


tensor(0.1059, grad_fn=<MaxBackward1>)


ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/module.py[0m(727)[0;36m_call_impl[0;34m()[0m
[0;32m    725 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_slow_forward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    726 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 727 [0;31m            [0mresult[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    728 [0;31m        for hook in itertools.chain(
[0m[0;32m    729 [0;31m                [0m_global_forward_hooks[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  d


> [0;32m/opt/conda/envs/tp/lib/python3.8/site-packages/torch/nn/modules/sparse.py[0m(124)[0;36mforward[0;34m()[0m
[0;32m    122 [0;31m[0;34m[0m[0m
[0m[0;32m    123 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0minput[0m[0;34m:[0m [0mTensor[0m[0;34m)[0m [0;34m->[0m [0mTensor[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 124 [0;31m        return F.embedding(
[0m[0;32m    125 [0;31m            [0minput[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mweight[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mpadding_idx[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mmax_norm[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    126 [0;31m            self.norm_type, self.scale_grad_by_freq, self.sparse)
[0m


ipdb>  input


tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0, 341,   0,   0,   0,   0,
           0, 340, 340,   0,   0,   0,   0,   0, 339, 339,   0,   0,   0,   0,
           6,   0,   0,   0,   0,   0,   3,   0,   0,   0,   0,   0,   2,   0,
           0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   4,   4,
           0,   0,   0,   0,   0,   0,   0,   3,   0,   0,   0,   0,   0,   0,
           5,   0,   0,   0,   0,   0,   7,   0,   0,   0,   7,   0,   0,   0,
           0,   8,   8,   0,   0,   0,   0,   0,   9,   0,   0,   0,   0,   0,
           9,   0,   0,   0,   0,   0,   0,  10,   0,   0,   0,   0,   0,  11,
          11,   0,   0,   0,   0,   0,   0,   0,   0,   0,  12,  12,   0,   0,
           0,   0,   0,  13,  13,   0,   0,   0,   0,   0,   0,   0,   0,  14,
          14,   0,   0,   0,  15,  15,   0,   0,   0,  16,  16,   0,   0,   0,
           0,   0,   0,  17,  17,   0,   0,   0,  18

ipdb>  q


In [None]:
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
answers = []
for coordinates in predicted_answer_coordinates:
  if len(coordinates) == 1:
    # only a single cell:
    answers.append(table.iat[coordinates[0]])
  else:
    # multiple cells
    cell_values = []
    for coordinate in coordinates:
       cell_values.append(table.iat[coordinate])
    answers.append(", ".join(cell_values))

## Deprecated

In [176]:
with open(test_path) as f:
    headers = f.readline().strip('\n').split('\t')
    lines = [l.strip('\n').split('\t') for l in f]
data = pd.DataFrame(lines,columns=headers)
dataset = TableDataset(data, tokenizer)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

In [178]:
data.to_pickle('test.pkl')

In [177]:
len(data)

4344

In [150]:
from datasets import load_dataset

In [452]:
ds = load_dataset('pandas',data_files={
    #'train':'random-split-1-train.pkl',
    'dev':'random-split-1-dev.pkl',
    'test':'test.pkl'
})

Using custom data configuration default-618396cc63ee8cfa
Reusing dataset pandas (/root/.cache/huggingface/datasets/pandas/default-618396cc63ee8cfa/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


In [19]:
def load_table(example):
    return _load_table(example['context'])
def encode(example):
    table = load_table(example)
    question = example['utterance']
    encoding = tokenizer(table=table,queries=question, truncation=True, padding="max_length", return_tensors='pt')
    encoding = {key: val.squeeze(0) for key, val in encoding.items()}
    return encoding

In [None]:
ds=ds.map(encode)

  0%|          | 0/2831 [00:00<?, ?ex/s]

In [404]:
encode(ds['test'][0]).keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids'])

In [None]:
for split in ds:
    for i in range(len(ds[split])):
        inputs = encode(ds[split][i])

In [431]:
from multiprocessing import Pool,cpu_count

In [436]:
with Pool(cpu_count()//2) as p:
    ds['test'] = p.map(encode,ds['test'])


In [440]:
with Pool(cpu_count()) as p:
    ds['dev'] = p.map(encode,ds['dev'])

In [444]:
for split in ds:
    torch.save(ds[split],f'preprocessed_{split}.pt')

In [441]:
dstb = load_dataset('pandas',data_files={
    #'train':'random-split-1-train.pkl',
    'dev':'random-split-1-dev.pkl',
    'test':'test.pkl'
})

Using custom data configuration default-618396cc63ee8cfa
Reusing dataset pandas (/root/.cache/huggingface/datasets/pandas/default-618396cc63ee8cfa/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


In [445]:
dstb['test']

Dataset({
    features: ['id', 'utterance', 'context', 'targetValue'],
    num_rows: 4344
})

In [446]:
def load_table_wrap(ex):
    ex['context'] = load_table(ex)
    return ex

In [447]:
dstb = dstb.map(load_table_wrap)

  0%|          | 0/2831 [00:00<?, ?ex/s]

  0%|          | 0/4344 [00:00<?, ?ex/s]

In [None]:
def batch_encode(ex):
    table = load_table(example)
    question = example['utterance']
    encoding = tokenizer(table=table,queries=question, truncation=True, padding="max_length", return_tensors='pt')
    encoding = {key: val.squeeze(0) for key, val in encoding.items()}
    return encoding