In [1]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The tokenizer and the question-answering model are loaded from the same
# pretrained checkpoint, so the identifier is written once and reused.
checkpoint = "google/tapas-large-finetuned-wtq"
tokenizer = TapasTokenizer.from_pretrained(checkpoint)
model = TapasForQuestionAnswering.from_pretrained(checkpoint)

In [3]:
def apply_model(table, queries):
    """Answer natural-language questions about a table with TAPAS.

    Relies on the module-level ``tokenizer`` and ``model`` objects. As a side
    effect the queried table is shown with ``display`` before the report is
    returned.

    Args:
        table: pandas DataFrame to query; cells are looked up positionally
            with ``table.iat`` and converted to text with ``str``.
        queries: list of question strings, all asked against ``table``.

    Returns:
        str: one entry per query, made of a ``===> <query>`` header line, the
        answer line and a blank separator line. When the predicted
        aggregation is not ``NONE`` the answer line is prefixed with
        ``<AGGREGATION> > ``. The aggregation is only reported; it is not
        applied to the selected cell values.
    """
    inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
    outputs = model(**inputs)

    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
    )

    # Names for the aggregation indices returned above.
    # NOTE(review): assumes the loaded checkpoint uses exactly these four
    # operators in this order - confirm if the checkpoint is changed.
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

    # Each query yields a list of (row, column) coordinates. Joining handles
    # zero, one or many cells uniformly, and str() keeps a non-string cell
    # from raising a TypeError while the text is assembled.
    answers = [
        ", ".join(str(table.iat[coordinate]) for coordinate in coordinates)
        for coordinates in predicted_answer_coordinates
    ]

    # `display` is not imported in this block; presumably the IPython/Jupyter
    # builtin is expected to be available - verify when running elsewhere.
    display(table)

    entries = []
    for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
        if predicted_agg == "NONE":
            answer_formatted = answer
        else:
            answer_formatted = '{} > {}'.format(predicted_agg, answer)
        # Every entry ends with a blank line so the next question header never
        # runs directly on from the previous answer.
        entries.append('===> {}\n{}\n\n'.format(query, answer_formatted))

    return "".join(entries)
    

In [4]:
# Read the semicolon-separated GDP table and give it a fresh 0..n-1 index.
df = pd.read_csv('Countries by GDP.csv', sep=';').reset_index(drop=True)

# pandas infers numeric dtypes for these columns, but TAPAS only accepts
# string cells, so cast them back to text.
df = df.astype({'GDP': str, 'Year': str})

In [5]:
# Questions asked against the GDP table. They mix plain lookups with
# questions that call for a sum, an average, a count or a comparison.
queries_gdp = [
    # Single-value lookups
    "What is the GDP of the United States in 2022?",
    # Aggregations over a region or a year
    "What is the total GDP of Europe in 2022?",
    "What is the average GDP of all countries in 2022?",
    "What is the GDP of Italy in 2021?",
    "How many distinct countries are there in Asia?",
    "Total European GDP in 2021?",
    # Filtering and year-over-year comparisons
    "What countries have a GDP higher than 10 million in 2022?",
    "How much does the total GDP in 2022 differ from the total GDP in 2021?",
    "What is the percentual increase of the US 2022 GDP vs 2021?",
]

gdp_report = apply_model(df, queries_gdp)
print(gdp_report)

  segment_means = out.scatter_reduce(


Unnamed: 0,Country,Region,GDP,Year
0,United States,Americas,25035164,2022
1,China,Asia,18321197,2022
2,Japan,Asia,4300621,2022
3,Germany,Europe,4031149,2022
4,India,Asia,3468566,2022
5,United Kingdom,Europe,3198470,2022
6,France,Europe,2778090,2022
7,Canada,Americas,2200352,2022
8,Russia,Europe,2133092,2022
9,Italy,Europe,1996934,2022


===> What is the GDP of the United States in 2022?
SUM > 25035164

===> What is the total GDP of Europe in 2022?
SUM > 4031149, 3198470, 2778090, 2133092, 1996934

===> What is the average GDP of all countries in 2022?
AVERAGE > 25035164, 18321197, 4300621, 4031149, 3468566, 3198470, 2778090, 2200352, 2133092, 1996934

===> What is the GDP of Italy in 2021?
SUM > 2099880

===> How many distinct countries are there in Asia?
COUNT > China , Japan , India , China , Japan , India 

===> Total European GDP in 2021?
SUM > 4223116, 3186860, 2937473, 1775800, 2099880

===> What countries have a GDP higher than 10 million in 2022?
United States , China , Japan , Germany , India , United Kingdom , France 
===> How much does the total GDP in 2022 differ from the total GDP in 2021?
SUM > 25035164, 18321197, 4300621, 4031149, 3468566, 3198470, 2778090, 2200352, 2133092, 1996934, 22996100, 17734063, 4937422, 4223116, 3173398, 3186860, 2937473, 1990762, 1775800, 2099880

===> What is the percentual i