#### TAPEX: a model (from Hugging Face) that understands tables

In [44]:
# REPO: https://github.com/microsoft/Table-Pretraining

# Hugging Face model link: https://huggingface.co/microsoft/tapex-base-finetuned-wikisql

# Paper link: https://arxiv.org/pdf/2107.07653

# My ChatGPT conversation about this model (confidential): https://chatgpt.com/c/696fa4c0-03b0-8322-a886-a24f4064854c

## Use Online

In [2]:
from transformers import TapexTokenizer, BartForConditionalGeneration
import pandas as pd


In [3]:
# Load the TAPEX tokenizer for the WikiSQL-finetuned checkpoint by its Hub id.
tokenizer = TapexTokenizer.from_pretrained(
    "microsoft/tapex-base-finetuned-wikisql",
)

In [4]:
# The TAPEX checkpoint is loaded through the BART conditional-generation class.
model = BartForConditionalGeneration.from_pretrained(
    "microsoft/tapex-base-finetuned-wikisql",
)

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

In [24]:
# Small example table: one row per Olympic Games, as (year, host city) columns.
data = dict(
    year=[1896, 1900, 1904, 2004, 2008, 2012],
    city=["athens", "paris", "st. louis", "athens", "beijing", "london"],
)

# Dict keys become the column names; each list becomes that column's values.
table = pd.DataFrame(data)


In [25]:
# TAPEX was pre-trained on an uncased corpus, so it accepts uncased input.
query = "In which year did beijing host the Olympic Games?"

# Tokenize the table together with the question; "pt" requests PyTorch tensors.
encoding = tokenizer(
    table=table,
    query=query,
    return_tensors="pt",
)

In [26]:
# Inspect the tokenizer output (the input_ids and attention_mask tensors).
encoding

{'input_ids': tensor([[    0,    11,    61,    76,   222,    28, 40049,  1482,     5,  1021,
         31434,   636,   426,   116, 11311,  4832,    76,  1721,   343,  3236,
           112,  4832, 42773,  1721,    23, 27859,  3236,   132,  4832, 23137,
          1721,  2242,   354,  3236,   155,  4832, 42224,  1721,  1690,     4,
         26120,   354,  3236,   204,  4832,  4482,  1721,    23, 27859,  3236,
           195,  4832,  2266,  1721,    28, 40049,  3236,   231,  4832,  1125,
          1721,   784, 24639,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [27]:
# Generate answer token ids from the encoded (question, table) pair.
outputs = model.generate(**encoding)

# Turn the generated ids back into text, dropping special tokens.
decoded_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded_answers)
# Expected output: [' 2008.0']

[' 2008.0']


## Download the Model to the Local Machine

In [17]:
from transformers import TapexTokenizer, BartForConditionalGeneration

# Hub id of the checkpoint to fetch, and the local folder it is written to.
model_name = "microsoft/tapex-base-finetuned-wikisql"
save_dir = "./tapex_local"

# Tokenizer: load by Hub id, then write its files into save_dir.
tokenizer = TapexTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(save_dir)

# Model weights: load by Hub id, then write them into the same folder.
model = BartForConditionalGeneration.from_pretrained(model_name)
model.save_pretrained(save_dir)

print("Saved model locally!")


Saved model locally!


### Load downloaded model

In [18]:
from transformers import TapexTokenizer, BartForConditionalGeneration

# Folder holding the previously saved tokenizer and model files.
local_dir = "./tapex_local"

# local_files_only=True restricts loading to files already on disk.
offline_kwargs = {"local_files_only": True}

tokenizer = TapexTokenizer.from_pretrained(local_dir, **offline_kwargs)
model = BartForConditionalGeneration.from_pretrained(local_dir, **offline_kwargs)

print("Offline TAPEX ready!")


Offline TAPEX ready!


### Use downloaded model

In [38]:
# Same Olympic host-city example as before, kept as plain column lists.
data = dict(
    year=[1896, 1900, 1904, 2004, 2008, 2012],
    city=["athens", "paris", "st. louis", "athens", "beijing", "london"],
)

# Build the frame and cast every cell to a string in one chained expression.
table = pd.DataFrame(data).astype(str)

question = "In which year did beijing host the Olympic Games?"

In [39]:
# Encode the question against the table, requesting PyTorch tensors.
encoding = tokenizer(
    table=table,
    query=question,
    return_tensors="pt",
)

# Generate answer token ids, then decode them to text without special tokens.
outputs = model.generate(**encoding)
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(answer)

[' 2008.0']
