In [1]:
import jsonlines # !pip install jsonlines 해주기
import random

import pandas as pd
from transformers import TapasTokenizer
from transformers import TapasConfig, TapasForQuestionAnswering,AutoConfig, AutoTokenizer,  AutoModel

import torch

In [2]:
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")


In [3]:
def construct_table(tb):
    
    column_header = tb['column_header']
    column_header_rev = []
    for col in column_header:
        temp_col = ""
        for c in col:
            temp_col += c + " "
        column_header_rev.append(temp_col)
        
    row_header = tb['row_header']
    row_header_rev = []
    for row in row_header:
        temp_row = ""
        for r in row:
            temp_row += r + " "
        row_header_rev.append(temp_row)
        
#     print(row_header_rev)
    
    data_values = tb['data']
    
    if len(row_header)>0:
        
        df = pd.DataFrame(data_values, columns = column_header_rev, index = row_header_rev)
        
        
    else:
        df = pd.DataFrame(data_values, columns = column_header_rev)
        
        
    return df

        

In [19]:
class AITQA_Dataset(torch.utils.data.Dataset):
    def __init__(self, qa_path, tb_path, tokenizer):
        
        self.qa_list = []
        with jsonlines.open(qa_path) as read_file:
            for line in read_file.iter():
                self.qa_list.append(line)
                
        self.tb_list = []
        with jsonlines.open(tb_path) as read_file:
            for line in read_file.iter():
                self.tb_list.append(line)
 
        self.tokenizer = tokenizer


    def __len__(self):
        return len(self.qa_list)
    
    
    def __getitem__(self, idx):
        
        qa_item = self.qa_list[idx]
#         print(qa_item)
        question = qa_item['question']
        answers = qa_item['answers']
        table_id = int(qa_item['table_id'].split('-')[-1])
        
        
        table = construct_table(self.tb_list[table_id])
        
        encoding = self.tokenizer(table=table,
                                  queries=question,
                                  padding="max_length",
                                  return_tensors="pt")
        
        
#         # remove the batch dimension which the tokenizer adds by default
#         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
#         # add the float_answer which is also required (weak supervision for aggregation case)
#         encoding["float_answer"] = torch.tensor(item.float_answer)
        
        return {'question': question,
                'answers': answers,
                'table_id':table_id,
#                 'table': self.tb_list[table_id],
                'encoding': encoding}


In [20]:
def collate_fc(samples, pad_id = 0):
    
    if len(samples)==0:
        return {}
    
    batch = {
            'question': [s['question'] for s in samples],
            'answers' : [s['answers'] for s in samples],
            'table_id' : [s['table_id'] for s in samples],
        
            'input_ids' : collate_tokens([s['encoding']['input_ids'] for s in samples],0),
            'token_type_ids' : collate_tokens([s['encoding']['token_type_ids'] for s in samples],0),
            'attention_mask' : collate_tokens([s['encoding']['attention_mask'] for s in samples],0),
        
    }
    
    
    return batch


def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
    """Convert a list of 1d tensors into a padded 2d tensor."""
    if len(values[0].size()) > 1:
        values = [v.view(-1) for v in values]
    size = max(v.size(0) for v in values)
    res = values[0].new(len(values), size).fill_(pad_idx)

    def copy_tensor(src, dst):
        assert dst.numel() == src.numel()
        if move_eos_to_beginning:
            assert src[-1] == eos_idx
            dst[0] = eos_idx
            dst[1:] = src[:-1]
        else:
            dst.copy_(src)

    for i, v in enumerate(values):
        copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
    return res

In [21]:
qa_data_path = '/home/jk/Jupyter_works/AITQA-master/raw_data/dev_questions.jsonl'
tb_data_path = '/home/jk/Jupyter_works/AITQA-master/raw_data/dev_tables.jsonl'

AITQA_dataset = AITQA_Dataset(qa_data_path, tb_data_path, tokenizer)
print(len(AITQA_dataset))


batch_size = 1

AITQA_dataloader = torch.utils.data.DataLoader(
                                                AITQA_dataset,
                                                batch_size=batch_size,
                                                collate_fn=collate_fc,
                                                pin_memory=True,
                                                shuffle = True,
                                                num_workers=5
                                            )

212


In [22]:
tb_data_path = '/home/jk/Jupyter_works/AITQA-master/raw_data/dev_tables.jsonl'

tb_list = []
with jsonlines.open(tb_data_path) as read_file:
    for line in read_file.iter():
        tb_list.append(line)

In [23]:

id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}


for batch in AITQA_dataloader:
    
#     print(batch)
#     print(construct_table(tb_list[batch['table_id'].item()]))
#     table = construct_table(tb_list[batch['table_id'].item()])
    
    
    
#     inputs = batch['encoding']


    inputs = {'input_ids': batch['input_ids'],
              'token_type_ids': batch['token_type_ids'].reshape(batch_size, -1, 7),
              'attention_mask': batch['attention_mask']
            
                }
    outputs = model(**inputs)

    
#     outputs = model(batch['input_ids'],
#                    batch['token_type_ids'],
#                    batch['attention_mask'],)


    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach()
    )    
    

    aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

    
    table = construct_table(tb_list[batch['table_id'][0]])

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells
            cell_values = []
            for coordinate in coordinates:
                cell_values.append(table.iat[coordinate])
            answers.append(", ".join(cell_values))


    queries = batch['question']
    gt_answers  = batch['answers']
    
    count = 0
    for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
        print(query)
        if predicted_agg == "NONE":
            print("Predicted answer: " + answer)
        else:
            print("Predicted answer: " + predicted_agg + " > " + answer)
            
            
        print("GT answer: " + gt_answers[count][0])
        count+=1
        print(" ")

            
    
            
    assert(False)

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f87ad1955e0>
Traceback (most recent call last):
Exception ignored in:   File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f87ad1955e0>
Traceback (most recent call last):
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    Exception ignored in: self._shutdown_workers()    <function _MultiProcessingDataLoaderIter.__del__ at 0x7f87ad1955e0>
self._shutdown_workers()
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
        Exception ignored in: if w.is_alive():if w.is_alive():<function _MultiProcessingDataLoaderIter.__del__ at 0x7f87ad1955e0>

  File "/hom

    
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
self._shutdown_workers()  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers

  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
        if w.is_alive():if w.is_alive():    

if w.is_alive():  File "/home/jk/anaconda3/lib/python3.8/multiprocessing/process.py", line 160, in is_alive
  File "/home/jk/anaconda3/lib/python3.8/multiprocessing/process.py", line 160, in is_alive

        assert self._parent_pid == os.getpid(), 'can only test a child process'assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/home/jk/anaconda3/lib/python3.8/multiprocessing/process.py", line 160, in is_alive

AssertionError    : AssertionErrorassert self._parent_pid == os.getpid(), 'can only test a child process'can only test 

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/jk/anaconda3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-19-2e282fe562f1>", line 32, in __getitem__
    encoding = self.tokenizer(table=table,
  File "/home/jk/anaconda3/lib/python3.8/site-packages/transformers/models/tapas/tokenization_tapas.py", line 624, in __call__
    return self.encode_plus(
  File "/home/jk/anaconda3/lib/python3.8/site-packages/transformers/models/tapas/tokenization_tapas.py", line 990, in encode_plus
    return self._encode_plus(
  File "/home/jk/anaconda3/lib/python3.8/site-packages/transformers/models/tapas/tokenization_tapas.py", line 1044, in _encode_plus
    return self.prepare_for_model(
  File "/home/jk/anaconda3/lib/python3.8/site-packages/transformers/models/tapas/tokenization_tapas.py", line 1182, in prepare_for_model
    raw_table = add_numeric_table_values(raw_table)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/transformers/models/tapas/tokenization_tapas.py", line 2767, in add_numeric_table_values
    table.iloc[row_index, col_index] = Cell(text=cell)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py", line 692, in __setitem__
    iloc._setitem_with_indexer(indexer, value, self.name)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py", line 1637, in _setitem_with_indexer
    self._setitem_single_block(indexer, value, name)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py", line 1861, in _setitem_single_block
    self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py", line 568, in setitem
    return self.apply("setitem", indexer=indexer, value=value)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py", line 427, in apply
    applied = getattr(b, f)(**kwargs)
  File "/home/jk/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py", line 1035, in setitem
    values[indexer] = value
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


In [None]:
table

In [None]:
assert(False)

In [25]:


ait_qa_data_dir = '/home/jk/Jupyter_works/AITQA-master/raw_data/dev_questions.jsonl'

qa_list = []

with jsonlines.open(ait_qa_data_dir) as read_file:
    for line in read_file.iter():
        qa_list.append(line)
 
len(qa_list)

212

In [26]:


ait_tb_data_dir = '/home/jk/Jupyter_works/AITQA-master/raw_data/dev_tables.jsonl'

tb_list = []

with jsonlines.open(ait_tb_data_dir) as read_file:
    for line in read_file.iter():
        tb_list.append(line)
 
len(tb_list)

40

In [87]:
def construct_table2(tb):
    

        
    row_header = tb['row_header']
    row_header_rev = []
    for row in row_header:
        temp_row = ""
        for r in row:
            temp_row += r + " "
        row_header_rev.append(temp_row)
        
        
        
        
    column_header = tb['column_header']
    if len(row_header)>0:
        column_header_rev = ["Row Headers"]
    else:
        column_header_rev = []
    for col in column_header:
        temp_col = ""
        for c in col:
            temp_col += c + " "
        column_header_rev.append(temp_col)


    
    data_values = tb['data']

        
    
    if len(row_header)>0:
        
        print("len(row_header) >>> ", len(row_header))
        print("len(data_values)       >>> ", len(data_values))
        for i in range(len(data_values)):
            temp_list = [row_header_rev[i]]
            print(row_header_rev[i])
            print(data_values[i])
            print("====================================")
            data_values[i] =  temp_list + data_values[i]
        
#         df = pd.DataFrame(data_values, columns = column_header_rev, index = row_header_rev)
        df = pd.DataFrame(data_values, columns = column_header_rev)

        
    else:
        df = pd.DataFrame(data_values, columns = column_header_rev)
        
        
    return df

        

In [115]:
rand_idx = random.randint(0,len(qa_list)-1)
print("rand_idx >> ", rand_idx)
# rand_idx=0

item = qa_list[rand_idx]
question = item['question']
gt_answers = item['answers']
table_id = int(item['table_id'].split('-')[-1])

print(question)
print(gt_answers)

table = construct_table2(tb_list[table_id])
display(table)



inputs = tokenizer(table=table,
                   queries=question,
                   padding="max_length",
                   return_tensors="pt")




id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}

outputs = model(**inputs)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits.detach(),
    outputs.logits_aggregation.detach()
)

aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

answers = []
for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
        # only a single cell:
        answers.append(table.iat[coordinates[0]])
    else:
        # multiple cells
        cell_values = []
        for coordinate in coordinates:
            cell_values.append(table.iat[coordinate])
        answers.append(", ".join(cell_values))


for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
    print(query)
    if predicted_agg == "NONE":
        print("Predicted answer: " + answer)
        print(" ")
    else:
        print("Predicted answer: " + predicted_agg + " > " + answer)
        print(" ")

    print("Grond Truth: ", gt_answers[0])

rand_idx >>  118
how much was the operating expense for regional carriers in 2016?
['4,311']
len(row_header) >>>  13
len(data_values)       >>>  13
Salaries and related costs 
['$10,034', '$8,776', '$1,258', '14.3 %']
Aircraft fuel and related taxes 
['5,133', '6,544', '(1,411)', '(21.6)%']
Regional carriers expense 
['4,311', '4,241', '70', '1.7 %']
Contracted services 
['1,991', '1,848', '143', '7.7 %']
Depreciation and amortization 
['1,902', '1,835', '67', '3.7 %']
Aircraft maintenance materials and outside repairs 
['1,823', '1,848', '(25)', '(1.4)%']
Passenger commissions and other selling expenses 
['1,710', '1,672', '38', '2.3 %']
Landing fees and other rents 
['1,490', '1,493', '(3)', '(0.2)%']
Profit sharing 
['1,115', '1,490', '(375)', '(25.2)%']
Passenger service 
['907', '872', '35', '4.0 %']
Aircraft rent 
['285', '250', '35', '14.0 %']
Other 
['1,986', '2,033', '(47)', '(2.3)%']
Total operating expense 
['$32,687', '$32,902', '$(215)', '(0.7)%']


Unnamed: 0,Row Headers,"Year Ended December 31, 2016","Year Ended December 31, 2015",Increase (Decrease),% Increase (Decrease)
0,Salaries and related costs,"$10,034","$8,776","$1,258",14.3 %
1,Aircraft fuel and related taxes,5133,6544,"(1,411)",(21.6)%
2,Regional carriers expense,4311,4241,70,1.7 %
3,Contracted services,1991,1848,143,7.7 %
4,Depreciation and amortization,1902,1835,67,3.7 %
5,Aircraft maintenance materials and outside rep...,1823,1848,(25),(1.4)%
6,Passenger commissions and other selling expenses,1710,1672,38,2.3 %
7,Landing fees and other rents,1490,1493,(3),(0.2)%
8,Profit sharing,1115,1490,(375),(25.2)%
9,Passenger service,907,872,35,4.0 %


How much money did United spend for aircraft fuel in 2016?
Predicted answer: 4,311
 
Grond Truth:  4,311


In [83]:
a = ['Operating revenue']
b = ['$9,032', '$10,777', '$11,003', '$10,491']

a+b

['Operating revenue', '$9,032', '$10,777', '$11,003', '$10,491']

In [116]:
item

{'id': 'dev-118',
 'table_id': 'tab-21',
 'question': 'how much was the operating expense for regional carriers in 2016?',
 'answers': ['4,311'],
 'type': 'Table-driven',
 'row_hierarchy_needed': 'Yes',
 'paraphrase_group': ''}

In [117]:
table_id

21

In [118]:
tb = tb_list[table_id]
tb_list[table_id]

{'column_header': [['Year Ended December 31,', '2016'],
  ['Year Ended December 31,', '2015'],
  ['Increase (Decrease)'],
  ['% Increase (Decrease)']],
 'row_header': [['Salaries and related costs'],
  ['Aircraft fuel and related taxes'],
  ['Regional carriers expense'],
  ['Contracted services'],
  ['Depreciation and amortization'],
  ['Aircraft maintenance materials and outside repairs'],
  ['Passenger commissions and other selling expenses'],
  ['Landing fees and other rents'],
  ['Profit sharing'],
  ['Passenger service'],
  ['Aircraft rent'],
  ['Other'],
  ['Total operating expense']],
 'data': [['Salaries and related costs ',
   '$10,034',
   '$8,776',
   '$1,258',
   '14.3 %'],
  ['Aircraft fuel and related taxes ', '5,133', '6,544', '(1,411)', '(21.6)%'],
  ['Regional carriers expense ', '4,311', '4,241', '70', '1.7 %'],
  ['Contracted services ', '1,991', '1,848', '143', '7.7 %'],
  ['Depreciation and amortization ', '1,902', '1,835', '67', '3.7 %'],
  ['Aircraft maintenance 

In [122]:
row_header = tb['row_header']
row_header_rev = []
for row in row_header:
    temp_row = ""
    for r in row:
        temp_row += r + " "
    row_header_rev.append(temp_row)




column_header = tb['column_header']
if len(row_header)>0:
    column_header_rev = ["Row Headers"]
else:
    column_header_rev = []
for col in column_header:
    temp_col = ""
    for c in col:
        temp_col += c + " "
    column_header_rev.append(temp_col)
    
    

    
data_values = tb['data']



if len(row_header)>0:

    print("len(row_header) >>> ", len(row_header))
    print("len(data_values)       >>> ", len(data_values))
    for i in range(len(data_values)):
        temp_list = [row_header_rev[i]]
        print(row_header_rev[i])
        print(data_values[i])
        print("====================================")
        data_values[i] =  temp_list + data_values[i]

#         df = pd.DataFrame(data_values, columns = column_header_rev, index = row_header_rev)
    df = pd.DataFrame(data_values, columns = column_header_rev)


else:
    df = pd.DataFrame(data_values, columns = column_header_rev)

len(row_header) >>>  13
len(data_values)       >>>  13
Salaries and related costs 
['Salaries and related costs ', 'Salaries and related costs ', '$10,034', '$8,776', '$1,258', '14.3 %']
Aircraft fuel and related taxes 
['Aircraft fuel and related taxes ', 'Aircraft fuel and related taxes ', '5,133', '6,544', '(1,411)', '(21.6)%']
Regional carriers expense 
['Regional carriers expense ', 'Regional carriers expense ', '4,311', '4,241', '70', '1.7 %']
Contracted services 
['Contracted services ', 'Contracted services ', '1,991', '1,848', '143', '7.7 %']
Depreciation and amortization 
['Depreciation and amortization ', 'Depreciation and amortization ', '1,902', '1,835', '67', '3.7 %']
Aircraft maintenance materials and outside repairs 
['Aircraft maintenance materials and outside repairs ', 'Aircraft maintenance materials and outside repairs ', '1,823', '1,848', '(25)', '(1.4)%']
Passenger commissions and other selling expenses 
['Passenger commissions and other selling expenses ', 'Passe

ValueError: 5 columns passed, passed data had 7 columns

In [120]:
data_values

[['Salaries and related costs ',
  'Salaries and related costs ',
  '$10,034',
  '$8,776',
  '$1,258',
  '14.3 %'],
 ['Aircraft fuel and related taxes ',
  'Aircraft fuel and related taxes ',
  '5,133',
  '6,544',
  '(1,411)',
  '(21.6)%'],
 ['Regional carriers expense ',
  'Regional carriers expense ',
  '4,311',
  '4,241',
  '70',
  '1.7 %'],
 ['Contracted services ',
  'Contracted services ',
  '1,991',
  '1,848',
  '143',
  '7.7 %'],
 ['Depreciation and amortization ',
  'Depreciation and amortization ',
  '1,902',
  '1,835',
  '67',
  '3.7 %'],
 ['Aircraft maintenance materials and outside repairs ',
  'Aircraft maintenance materials and outside repairs ',
  '1,823',
  '1,848',
  '(25)',
  '(1.4)%'],
 ['Passenger commissions and other selling expenses ',
  'Passenger commissions and other selling expenses ',
  '1,710',
  '1,672',
  '38',
  '2.3 %'],
 ['Landing fees and other rents ',
  'Landing fees and other rents ',
  '1,490',
  '1,493',
  '(3)',
  '(0.2)%'],
 ['Profit sharing '

In [121]:
column_header_rev

['Row Headers',
 'Year Ended December 31, 2016 ',
 'Year Ended December 31, 2015 ',
 'Increase (Decrease) ',
 '% Increase (Decrease) ']

In [104]:
165, 62

165