In [1]:
from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering

In [2]:
import pandas as pd
import torch
from pathlib import Path
import csv

In [3]:
pd.__version__

'1.0.5'

In [4]:
tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq", drop_rows_to_fit=True)
model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")

In [5]:
from tapas.protos import interaction_pb2
from google.protobuf.json_format import MessageToDict

In [6]:
import tensorflow as tf

In [7]:
def _to_df(dic):
    columns = [f"{v}-{k}" for d in dic['columns'] for k,v in d.items()]
    rows = [{columns[i]: ' '.join(list(d.values())) for i, d in enumerate(row['cells'])} for row in dic['rows']]
    df = pd.DataFrame(rows, columns=columns)
    return df

In [8]:
from tapas.utils import text_utils
import math
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
def _collect_cells_from_table(cell_coos, table):
    cell_values = []
    for cell in cell_coos:
        value = str(table.iloc[cell[0],cell[1]])
        cell_values.append(value)
    return cell_values
def _safe_convert_to_float(value):
    float_value = text_utils.convert_to_float(value)
    if math.isnan(float_value):
        raise ValueError('Value is NaN %s' % value)
    return float_value

def _parse_value(value):
  """Parses a cell value to a number or lowercased string."""
  try:
    return _safe_convert_to_float(value)
  except ValueError:
    try:
      return value.lower()
    except ValueError:
      return value

def execute(aggregation_type, cell_coos,
            table):
  """Executes predicted structure against a table to produce the denotation."""
  values = _collect_cells_from_table(cell_coos, table)
  values_parsed = [_parse_value(value) for value in values]
  values_parsed = tuple(values_parsed)
  if aggregation_type == "NONE":
    # In this case there is no aggregation
    return values_parsed, values
  else:  # Should perform aggregation.
    if not values and (aggregation_type == "AVERAGE" or
                       aggregation_type == "SUM"):
      # Summing or averaging an empty set results in an empty set.
      # NB: SQL returns null for sum over an empty set.
      return tuple(), values
    if aggregation_type == "COUNT":
      denotation = len(values)
    else:
      # In this case all values must be numbers (to be summed or averaged).
      try:
        values_num = [text_utils.convert_to_float(value) for value in values]
      except ValueError:
        return values_parsed, values
      if aggregation_type == "SUM":
        denotation = sum(values_num)
      elif aggregation_type == "AVERAGE":
        denotation = sum(values_num) / len(values_num)
      else:
        raise ValueError('Unknwon aggregation type: %s' % aggregation_type)
    return tuple([float(denotation)]), values

In [9]:
dics = {}
split = 'random-split-1-dev'
for value in tf.data.TFRecordDataset(f'results/wtq/interactions/{split}.tfrecord'):
    interaction = interaction_pb2.Interaction()
    interaction.ParseFromString(value.numpy())
    d = MessageToDict(interaction)
    d['table_id'] = d['table']['tableId']
    d['table'] = _to_df(d['table']).to_dict()
    dics[d['id'].rsplit('-',1)[0]] = d

2021-09-22 02:43:54.155397: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-09-22 02:43:54.159938: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:82:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 836.37GiB/s
2021-09-22 02:43:54.161645: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 1 with properties: 
pciBusID: 0000:05:00.0 name: Tesla P4 computeCapability: 6.1
coreClock: 1.1135GHz coreCount: 20 deviceMemorySize: 7.43GiB deviceMemoryBandwidth: 178.99GiB/s
2021-09-22 02:43:54.161777: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-09-22 02:43:54.161863: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-09-

In [68]:
d = dics['nt-250']

In [93]:
l = list(map(lambda x:f"n{x}","t-1623\
nt-1624\
".split('n')))

In [92]:
for k in l:
    d = dics[k]
    table = pd.DataFrame.from_dict(d['table'])
    question = d['questions'][0]['originalText']
    encoding = tokenizer(table=table,
                                  queries=question,
                                  #truncation=True,
                                  padding="max_length",
                                  return_tensors="pt"
        )
    outputs = model(**encoding)
    coor, aggs = tokenizer.convert_logits_to_predictions(
    {k:v.to('cpu') for k,v in encoding.items()},
    outputs.logits.cpu().detach(),
    outputs.logits_aggregation.cpu().detach()
)
    print(execute(id2aggregation[aggs[0]],coor[0],table))

(('der liebeswalzer',), ['der liebeswalzer'])
(('guard',), ['guard'])
(('taylor jensen',), ['taylor jensen'])
((9.0,), ['3', '6', '7', '8', '9', '10', '11', '12', '16'])
((17.0,), ['farah', 'ghazni', 'helmand', 'herat', 'kabul', 'kandahar', 'khost', 'kunar', 'laghman', 'lowgar', 'nangarhar', 'nurestan', 'oruzgan', 'paktia', 'paktika', 'wardak', 'zabul'])
((4.0,), ['2', '4', '6', '15'])
((), [])
((12.0,), ['8', '47', '32', '4', '4', '9', '20', '15', '20', '16', '8', '41'])
((2.0,), ['federal republic of germany ortrud feickert karin fitzner ruth kasten', 'germany munkhbayar dorjsuren stefanie thurmann claudia verdicchio'])
((6.0,), ['huang qiuyan', 'anastasiya juravleva', 'zhang hao', 'wacharee ritthiwat', 'wang kuo-huei', 'fumiyo yoshida'])
((2.0,), ['james i 1231-1276', 'alfonso i 1286-1291'])
(('great britain',), ['great britain'])
(('87 850', '4 824'), ['87 850', '4 824'])
(('chinese hero',), ['chinese hero'])
((2.0,), ['1', '3'])
(('kolmonen',), ['kolmonen'])
((7.0,), ['the sea aro

In [69]:
d['questions'][0]['answer']['answerTexts']

['1']

In [81]:
table = pd.DataFrame.from_dict(d['table'])
question = d['questions'][0]['originalText']
#question = "what is baranov's nationality?"
#question = "which ships were launched in the year 2003?"
#question = 'how many players are drafted before dale armstrong' 
print(question)
table.head()

what is the number of points scored on 6 february 1922?


Unnamed: 0,Tie no-text,Home team-text,Score-text,Away team-text,Date-text
0,1,liverpool,0-1,west bromwich albion,28 january 1922
1,2,preston north end,3-1,newcastle united,28 january 1922
2,3,southampton,1-1,cardiff city,28 january 1922
3,replay,cardiff city,2-0,southampton,1 february 1922
4,4,leicester city,2-0,fulham,28 january 1922


In [71]:
new_table = pd.DataFrame.from_dict(table.sample(frac=1).reset_index(drop=True).to_dict())
new_table.head()

Unnamed: 0,Tie no-text,Home team-text,Score-text,Away team-text,Date-text
0,1,liverpool,0-1,west bromwich albion,28 january 1922
1,7,bolton wanderers,1-3,manchester city,28 january 1922
2,10,barnsley,3-1,oldham athletic,28 january 1922
3,replay,huddersfield town,2-0,brighton & hove albion,1 february 1922
4,5,nottingham forest,3-0,hull city,28 january 1922


In [72]:
table

Unnamed: 0,Tie no-text,Home team-text,Score-text,Away team-text,Date-text
0,1,liverpool,0-1,west bromwich albion,28 january 1922
1,2,preston north end,3-1,newcastle united,28 january 1922
2,3,southampton,1-1,cardiff city,28 january 1922
3,replay,cardiff city,2-0,southampton,1 february 1922
4,4,leicester city,2-0,fulham,28 january 1922
5,5,nottingham forest,3-0,hull city,28 january 1922
6,6,aston villa,1-0,luton town,28 january 1922
7,7,bolton wanderers,1-3,manchester city,28 january 1922
8,8,swindon town,0-1,blackburn rovers,28 january 1922
9,9,tottenham hotspur,1-0,watford,28 january 1922


In [73]:
#short_t = table.drop([4]).reset_index(drop=True)
short_t = table[table.columns.difference(['Definition-text'])]

In [74]:
target_table = table
#short_t.sample(frac=1).reset_index(drop=True)

In [75]:
target_table.head()

Unnamed: 0,Tie no-text,Home team-text,Score-text,Away team-text,Date-text
0,1,liverpool,0-1,west bromwich albion,28 january 1922
1,2,preston north end,3-1,newcastle united,28 january 1922
2,3,southampton,1-1,cardiff city,28 january 1922
3,replay,cardiff city,2-0,southampton,1 february 1922
4,4,leicester city,2-0,fulham,28 january 1922


In [82]:
encoding = tokenizer(table=target_table,
                                  queries=question,
                                  #truncation=True,
                                  padding="max_length",
                                  return_tensors="pt"
        )
        # remove the batch dimension which the tokenizer adds by default
        #encoding = {key: val.squeeze(0) for key, val in encoding.items()}

In [83]:
outputs = model(**encoding)

In [84]:
coor, aggs = tokenizer.convert_logits_to_predictions(
    {k:v.to('cpu') for k,v in encoding.items()},
    outputs.logits.cpu().detach(),
    outputs.logits_aggregation.cpu().detach()
)

In [85]:
print(id2aggregation[aggs[0]],coor[0])

COUNT [(17, 0)]


In [86]:
execute(id2aggregation[aggs[0]],coor[0],target_table)

((1.0,), ['replay'])

In [23]:
from tqdm import tqdm

In [24]:
ans = {}
err = set()
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
for key,d in tqdm(dics.items()):
    table = pd.DataFrame.from_dict(d['table'])
    question = d['questions'][0]['originalText']
    encoding = tokenizer(table=table,
          queries=question,
          truncation=True,
          padding="max_length",
          return_tensors="pt"
        )
    encoding = {k:v.to(device) for k,v in encoding.items()}
    model.to(device)
    with torch.no_grad():
        try:
            output = model(**encoding)
#         except IndexError:
#             continue
#         except RuntimeError:
        except:
            continue
        coor, agg = tokenizer.convert_logits_to_predictions(
            {k:v.to('cpu') for k,v in encoding.items()},
            output.logits.cpu().detach(),
            output.logits_aggregation.cpu().detach()
        )
        try:
            denos, _ = execute(id2aggregation[aggs[0]],coor[0],table)
            ans[key] = denos
        except IndexError:
            err.add(key)

100%|███████████████████████████████████████████████████████████████████████████████| 2810/2810 [57:12<00:00,  1.22s/it]


In [25]:
len(err)

0

In [33]:
ans['nt-378']

(1.0,)

In [30]:
with open('dev-out-iis2.tsv','w',newline='') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',quoting=csv.QUOTE_NONE,escapechar='\\')
    for k,deno in ans.items():
        deno = list(map(str,deno))
        row = [k]+[' '.join(deno)]
        writer.writerow(row)
        

In [32]:
for k, deno in list(ans.items())[:10]:
    deno = list(map(str,deno))
    row = [k]+[' '.join(deno)]
    print(row)


['nt-10001', '4.0']
['nt-10003', '1.0']
['nt-10021', '5.0']
['nt-10022', '2.0']
['nt-10024', '0.0']
['nt-10026', '1.0']
['nt-10030', '9.0']
['nt-10034', '1.0']
['nt-1004', '1.0']
['nt-10043', '1.0']


In [610]:
dics['nt-10001']

{'id': 'nt-10001-0',
 'table': {'Outcome-text': {0: 'runner-up',
   1: 'winner',
   2: 'winner',
   3: 'winner',
   4: 'runner-up',
   5: 'runner-up',
   6: 'winner',
   7: 'runner-up',
   8: 'winner'},
  'Year-text': {0: '2002',
   1: '2003',
   2: '2003',
   3: '2004',
   4: '2004',
   5: '2005',
   6: '2006',
   7: '2010',
   8: '2010'},
  'Championship-text': {0: 'canada',
   1: 'montreal',
   2: 'cincinnati',
   3: 'miami',
   4: 'toronto',
   5: 'cincinnati',
   6: 'cincinnati',
   7: 'indian wells',
   8: 'miami'},
  'Surface-text': {0: 'hard',
   1: 'hard',
   2: 'hard',
   3: 'hard',
   4: 'hard',
   5: 'hard',
   6: 'hard',
   7: 'hard',
   8: 'hard'},
  'Opponent-text': {0: 'guillermo canas',
   1: 'david nalbandian',
   2: 'mardy fish',
   3: 'guillermo coria',
   4: 'roger federer',
   5: 'roger federer',
   6: 'juan carlos ferrero',
   7: 'ivan ljubicic',
   8: 'tomas berdych'},
  'Score-text': {0: '4-6, 5-7',
   1: '6-1, 6-3',
   2: '4-6, 7-6(7-3), 7-6(7-4)',
   3: '6-7(