# 03. Model v0

Model v0 is an ML model for typical V-NLI, which accepts text and tabular data as input, and returns the corresponding figure.

We adopt [TAPEX](https://arxiv.org/abs/2107.07653), a pre-trained [BART](https://arxiv.org/abs/1910.13461) model, as a base model.
We use almost the same hyperparameters as TAPEX.

We fine-tune the model with the two nvBench datasets preprocessed in the previous notebook.
One model is for the user study, and the other is for real-world usage (see the previous notebook for details).

We use Hugging Face Transformers, one of the most famous NLP libraries, for implementation.
We primarily follow [this TAPEX example](https://github.com/huggingface/transformers/blob/main/examples/research_projects/tapex/run_wikisql_with_tapex.py).

As a result, the exact match ratio of the user study model reaches ~90% on the test dataset, and that of the model for real-world usage reaches ~60%.
These numbers are not directly comparable to existing work because we preprocess the nvBench dataset differently.
However, our goal is not to improve an ML model for V-NLI but to propose a novel UI for data visualization.

In the final user study, we use this model as the baseline model to compare our proposed interface with typical V-NLI.

## Setup

### Define Parameters


In [1]:
# Root directory that holds the SQLite databases, the preprocessed datasets and all outputs
data_dir: str = "../data/"
# If True, the model weights are loaded from the last checkpoint in the model
# output directory instead of from the base model
load_model_from_last_checkpoint: bool = False
# If True, the trainer is created with push_to_hub enabled and the final model is pushed
push_model_to_huggingface_hub: bool = True
# If True, trainer.train() is not called
skip_training: bool = False

# If user_study is True, the model is trained on the preprocessed dataset built with stratified sampling
# Otherwise, the group-shuffled dataset is used
# See the preprocess notebook for the details of the datasets
user_study: bool = True


### Load Modules

In [2]:
import functools
import multiprocessing
import os
import pandas as pd
import sqlite3

from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import evaluate
import datasets
import numpy as np
import torch
import transformers

from datasets import Dataset
from transformers import (
    BartConfig,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    EvalPrediction,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TapexTokenizer,
    trainer_utils,
)


In [3]:
# Fix the random seed so that repeated runs of the notebook are comparable
transformers.set_seed(123)


### Define Variables

In [4]:
# Paths

DATA_DIR: Path = Path(data_dir)
DATABASE_DIR: Path = DATA_DIR / "database"

# The user-study model reads the stratified split; the other model reads the
# grouped split. The model name is also reused for every output path below.
MODEL_NAME: str = "vxnli-v0-user-study" if user_study else "vxnli-v0"
PREPROCESSED_NVBENCH_DIR: Path = DATA_DIR / (
    "preprocessed-nvBench/stratified" if user_study else "preprocessed-nvBench/grouped"
)

DATASET_OUTPUT_DIR: Path = DATA_DIR / f"{MODEL_NAME}.hf"
MODEL_OUTPUT_DIR: Path = DATA_DIR / MODEL_NAME
PREDS_OUTPUT_PATH: Path = DATA_DIR / f"{MODEL_NAME}-preds.csv"

# Model Parameters

BASE_MODEL: str = "microsoft/tapex-base-finetuned-wtq"
# Token limits applied when tokenizing the inputs and the targets respectively
MAX_SOURCE_LENGTH: int = 1024
MAX_TARGET_LENGTH: int = 124


### Load Tokenizer


In [5]:
# Load the tokenizer stored alongside the base checkpoint
# NOTE(review): use_fast and add_prefix_space are forwarded as keyword
# arguments; confirm that TapexTokenizer actually honours both
tokenizer = TapexTokenizer.from_pretrained(
    BASE_MODEL, use_fast=True, add_prefix_space=True
)


In [6]:
# Example: encode a small table with the TAPEX tokenizer
# NOTE(review): the question is passed as `answer`, not `query`; the captured
# output appears to contain only the flattened table — confirm whether
# `query=` was intended here
tokenizer(
    table=pd.DataFrame.from_dict(
        {
            "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
            "Number of movies": ["87", "53", "69"],
        }
    ),
    answer="how many movies does Leonardo Di Caprio have?",
    return_tensors="pt",
)


{'input_ids': tensor([[    0, 11311,  4832,  5552,  1721,   346,     9,  4133,  3236,   112,
          4832,  5378,   625,   181,  2582,  1721,  8176,  3236,   132,  4832,
          2084,   261,  6782,  2269,  2927, 12834,  1721,  4268,  3236,   155,
          4832,  5473, 26875, 42771,  6071,  1721,  5913,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

### Load Model

In [7]:
# Start from the base model's configuration and override a few
# generation-related settings
# NOTE(review): max_length is set from MAX_SOURCE_LENGTH although the targets
# are limited to MAX_TARGET_LENGTH — confirm this is intended
model_config = BartConfig.from_pretrained(
    BASE_MODEL,
    no_repeat_ngram_size=0,
    max_length=MAX_SOURCE_LENGTH,
    early_stopping=False,
)

if load_model_from_last_checkpoint:
    # get_last_checkpoint returns None when the directory holds no checkpoint;
    # fail here with a clear message instead of handing None to from_pretrained
    model_name_or_path = trainer_utils.get_last_checkpoint(MODEL_OUTPUT_DIR)

    if model_name_or_path is None:
        raise FileNotFoundError(f"No checkpoint found in {MODEL_OUTPUT_DIR}")
else:
    model_name_or_path = BASE_MODEL

# Weights come from either the last checkpoint or the base model; the
# configuration above is used in both cases
model = BartForConditionalGeneration.from_pretrained(
    model_name_or_path,
    config=model_config,
)


## Preprocess Dataset


In [8]:
def load_table(db_id: str, table_name: str) -> pd.DataFrame:
    """Read a whole table of a SQLite database into a DataFrame.

    Args:
        db_id: Database name; the file is looked up at
            ``DATABASE_DIR/<db_id>/<db_id>.sqlite``.
        table_name: Name of the table to read. It is interpolated into the SQL
            text as-is, so it must come from a trusted source.

    Returns:
        All rows and columns of the table.

    Raises:
        FileNotFoundError: If the database file does not exist.
    """
    db_path = DATABASE_DIR.joinpath(f"{db_id}/{db_id}.sqlite")

    # sqlite3.connect silently creates an empty database for a missing file,
    # which would only surface later as a confusing "no such table" error
    if not db_path.is_file():
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    # Using a sqlite3 connection as a context manager only wraps a transaction;
    # it does not close the connection, so close it explicitly
    con = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", con)
    finally:
        con.close()


# Example: show the first rows of one table as stored in the database
load_table("customers_and_products_contacts", "products").head()


Unnamed: 0,product_id,product_type_code,product_name,product_price
0,1,Hardware,Apple,54753980.0
1,2,Clothes,jcrew,30590930.0
2,3,Hardware,Apple,10268.85
3,4,Hardware,Apple,22956670.0
4,5,Clothes,jcrew,5927022.0


In [9]:
def preprocess_table(df: pd.DataFrame) -> pd.DataFrame:
    """Lower-case a table and convert every cell to a string.

    Column names and string cells are lower-cased, then all columns are cast
    to ``str``. The input DataFrame is not modified.

    Args:
        df: Table to normalize. Column names must be strings.

    Returns:
        A new DataFrame in which every column holds strings.
    """
    df = df.rename(columns={col: col.lower() for col in df.columns})

    for col_name, col_dtype in zip(df.columns, df.dtypes):
        if pd.api.types.is_string_dtype(col_dtype):
            # Object columns may mix strings with other values (numbers, bytes).
            # The .str accessor would turn those into NaN, so lower-case only
            # the actual strings and leave the other values untouched.
            df[col_name] = df[col_name].map(
                lambda value: value.lower() if isinstance(value, str) else value
            )

    # The TAPEX tokenizer raises an error when the table contains non-str columns
    df = df.astype(str)

    return df


# Example: the same table after preprocessing (lower-cased text, every column cast to str)
preprocess_table(load_table("customers_and_products_contacts", "products").head())


Unnamed: 0,product_id,product_type_code,product_name,product_price
0,1,hardware,apple,54753982.574522
1,2,clothes,jcrew,30590929.528306
2,3,hardware,apple,10268.85297069
3,4,hardware,apple,22956668.699482
4,5,clothes,jcrew,5927021.8748021


In [10]:
# functools.cache is supported in python3.9+, but use lru_cache to support python3.7+
@functools.lru_cache(maxsize=None)
def load_and_preprocess_table(db_id: str, table_name: str) -> pd.DataFrame:
    """Return the preprocessed table, memoized per ``(db_id, table_name)``.

    The cached DataFrame object is shared between calls with the same
    arguments, so callers must not modify it in place.
    """
    return preprocess_table(load_table(db_id, table_name))


In [11]:
def preprocess_dataset(example: Dict[str, Any]) -> Dict[str, torch.Tensor]:
    """Tokenize one dataset row into model inputs and labels.

    Args:
        example: A row providing ``db_id``, ``table``, ``question`` and
            ``vega_zero``.

    Returns:
        The tokenizer output for the (table, question) pair with an extra
        ``labels`` entry holding the token ids of ``vega_zero``.

    NOTE(review): the target is also passed as ``answer`` when encoding the
    source, for every split — confirm this is intended for validation/test.
    NOTE(review): no ``return_tensors`` is requested, so the values are
    presumably plain lists rather than tensors — confirm the annotation.
    """
    table = load_and_preprocess_table(example["db_id"], example["table"])
    target = example["vega_zero"]

    # Source side: the table and the question, truncated to the source limit
    encoded = tokenizer(
        table=table,
        query=example["question"],
        answer=target,
        max_length=MAX_SOURCE_LENGTH,
        padding=True,
        truncation=True,
    )

    # Target side: only the token ids are kept, stored under "labels"
    encoded["labels"] = tokenizer(
        answer=target,
        max_length=MAX_TARGET_LENGTH,
        padding=True,
        truncation=True,
    )["input_ids"]

    return encoded


In [12]:
# Every path handed to the datasets library below is converted to str, since
# its load/save functions were found not to support pathlib.Path
if not DATASET_OUTPUT_DIR.exists():
    # First run: read the preprocessed CSV splits, tokenize every row and
    # cache the result on disk for later runs
    csv_files = {
        split: str(PREPROCESSED_NVBENCH_DIR.joinpath(file_name))
        for split, file_name in (
            ("train", "train.csv"),
            ("test", "test.csv"),
            ("validation", "val.csv"),
        )
    }

    dataset = datasets.load_dataset("csv", data_files=csv_files).map(
        preprocess_dataset,
        batched=False,
        num_proc=multiprocessing.cpu_count(),
    )

    dataset.save_to_disk(str(DATASET_OUTPUT_DIR))
else:
    # Later runs: reuse the tokenized dataset saved earlier
    dataset = datasets.load_from_disk(str(DATASET_OUTPUT_DIR))

dataset


DatasetDict({
    train: Dataset({
        features: ['db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'SQL', 'table', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12485
    })
    test: Dataset({
        features: ['db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'SQL', 'table', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1649
    })
    validation: Dataset({
        features: ['db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'SQL', 'table', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1592
    })
})

## Training


In [13]:
# Collator that batches the tokenized examples for the trainer
# Padded label positions are filled with -100; compute_metrics replaces -100
# with the pad token id before decoding the labels
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)


In [14]:
# Exact-match metric, used both during training and for the final test score
exact_match = evaluate.load("exact_match")


def compute_metrics(eval_pred: EvalPrediction):
    """Compute the exact-match score between generated and reference texts.

    Args:
        eval_pred: Pair of (predicted token ids, label token ids) produced by
            the trainer.

    Returns:
        The result of the ``exact_match`` metric on the decoded texts.
    """
    preds, labels = eval_pred

    # Predictions may arrive as a tuple whose first element holds the token ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the predictions and labels as we can't decode them.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    return exact_match.compute(predictions=decoded_preds, references=decoded_labels)


In [15]:
# Huggingface trainer uses environment variables to configure mlflow
# https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/integrations.py#L884

# MLFlow experiment name must be updated if you update training arguments
# The name is the model name followed by the current time (YYYYmmddHHMM), so
# runs started in different minutes get different experiment names
os.environ["MLFLOW_EXPERIMENT_NAME"] = f"{MODEL_NAME}-{datetime.now().strftime('%Y%m%d%H%M')}"


In [16]:
# Evaluate, log and save once per epoch, keep a single checkpoint, and reload
# the best model (by exact match) when training ends
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    predict_with_generate=True,
    num_train_epochs=50,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    do_eval=True,
    metric_for_best_model="exact_match",
    push_to_hub=push_model_to_huggingface_hub,
    report_to="mlflow",
)

# Training stops early through the callback configured with a patience of 5
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)


Cloning https://huggingface.co/kwkty/vxnli-v0-user-study into local empty directory.


In [17]:
# Training can be skipped via the skip_training parameter, e.g. to only
# evaluate a model loaded from a checkpoint
if not skip_training:
    trainer.train()


The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, SQL, vega_zero, question, query, hardness. If table, chart, db_id, SQL, vega_zero, question, query, hardness are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12485
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 19550
  Number of trainable parameters = 139420416
2022/11/25 08:26:49 INFO mlflow.tracking.fluent: Experiment with name 'vxnli-v0-user-study-202211250826' does not exist. Creating a new experiment.


Epoch,Training Loss,Validation Loss,Exact Match
1,0.2411,0.032693,0.900126
2,0.0231,0.026131,0.900126
3,0.0134,0.016617,0.944724
4,0.0094,0.018601,0.943467
5,0.0077,0.022604,0.93593
6,0.0066,0.016064,0.953518
7,0.0058,0.017232,0.953518
8,0.0053,0.02095,0.944724
9,0.0047,0.020135,0.93907
10,0.0052,0.022958,0.951005


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, SQL, vega_zero, question, query, hardness. If table, chart, db_id, SQL, vega_zero, question, query, hardness are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1592
  Batch size = 32
Saving model checkpoint to ../data/vxnli-v0-user-study/checkpoint-391
Configuration saved in ../data/vxnli-v0-user-study/checkpoint-391/config.json
Model weights saved in ../data/vxnli-v0-user-study/checkpoint-391/pytorch_model.bin
tokenizer config file saved in ../data/vxnli-v0-user-study/checkpoint-391/tokenizer_config.json
Special tokens file saved in ../data/vxnli-v0-user-study/checkpoint-391/special_tokens_map.json
tokenizer config file saved in ../data/vxnli-v0-user-study/tokenizer_config.json
Special tokens file saved in ../data/vxnli-v0-user-s

## Evaluation

In [18]:
# trainer.evaluate must be called for the model card
# The test split is evaluated here; the validation split was used during training

trainer.evaluate(dataset["test"])

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, SQL, vega_zero, question, query, hardness. If table, chart, db_id, SQL, vega_zero, question, query, hardness are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1649
  Batch size = 32


{'eval_loss': 0.014960955828428268,
 'eval_exact_match': 0.9405700424499697,
 'eval_runtime': 122.1686,
 'eval_samples_per_second': 13.498,
 'eval_steps_per_second': 0.426,
 'epoch': 11.0}

In [19]:
def predict(ds: Dataset) -> List[str]:
    """Generate predictions for a dataset and return them as stripped text.

    Args:
        ds: Tokenized dataset to run the trainer's prediction on.

    Returns:
        One decoded string per example, with special tokens removed and
        surrounding whitespace stripped.
    """
    # Generation is capped at the same length used for the targets
    output = trainer.predict(ds, max_length=MAX_TARGET_LENGTH)

    decoded = tokenizer.batch_decode(
        output.predictions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return list(map(str.strip, decoded))


In [20]:
preds = predict(dataset["test"])

# Show the first five predictions next to the first five reference vega_zero strings
preds[:5], dataset["test"]["vega_zero"][:5]


The following columns in the test set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, SQL, vega_zero, question, query, hardness. If table, chart, db_id, SQL, vega_zero, question, query, hardness are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1649
  Batch size = 32


(['mark bar encoding x county_name y aggregate none population',
  'mark bar encoding x nationality y aggregate count nationality transform group x sort y desc',
  'mark bar encoding x crs_code y aggregate count crs_code transform group x sort x asc',
  'mark bar encoding x name y aggregate none code transform filter price between 60 and 120 sort y asc',
  'mark arc encoding x affiliation y aggregate sum enrollment transform group x'],
 ['mark bar encoding x county_name y aggregate none population',
  'mark bar encoding x nationality y aggregate count nationality transform group x sort y desc',
  'mark bar encoding x crs_code y aggregate count crs_code transform group x sort x asc',
  'mark bar encoding x name y aggregate none code transform filter price between 60 and 120 sort y asc',
  'mark arc encoding x affiliation y aggregate sum enrollment transform group x'])

In [21]:
# Exact match of the stripped predictions against the raw vega_zero strings
# NOTE(review): this differs from the trainer's eval_exact_match, which
# compares against labels decoded from token ids — presumably the reason the
# two numbers differ; confirm
exact_match.compute(
    predictions=preds,
    references=dataset["test"]["vega_zero"],
)

{'exact_match': 0.9023650697392359}

In [22]:
# Test split as a DataFrame without the tokenizer columns, plus the prediction
preds_df = (
    dataset["test"]
    .to_pandas()
    .drop(columns=["input_ids", "attention_mask", "labels"])
    .assign(pred=preds)
)
# Flag the rows whose prediction equals the reference exactly
preds_df = preds_df.assign(exact_matched=preds_df["pred"] == preds_df["vega_zero"])

preds_df.to_csv(PREDS_OUTPUT_PATH)

preds_df


Unnamed: 0,db_id,chart,hardness,query,question,vega_zero,SQL,table,pred,exact_matched
0,election,Bar,Easy,"Visualize BAR SELECT County_name , Population ...",what are the name and population of each count...,mark bar encoding x county_name y aggregate no...,"SELECT County_name , Population FROM county",county,mark bar encoding x county_name y aggregate no...,True
1,swimming,Bar,Medium,"Visualize BAR SELECT Nationality , COUNT(Natio...",return a bar chart about the distribution of n...,mark bar encoding x nationality y aggregate co...,"SELECT Nationality , COUNT(Nationality) FROM s...",swimmer,mark bar encoding x nationality y aggregate co...,True
2,college_1,Bar,Medium,"Visualize BAR SELECT CRS_CODE , count(*) FROM ...",visualize a bar chart for how many sections do...,mark bar encoding x crs_code y aggregate count...,"SELECT CRS_CODE , count(*) FROM CLASS GROUP BY...",class,mark bar encoding x crs_code y aggregate count...,True
3,manufactory_1,Bar,Medium,"Visualize BAR SELECT Name , Code FROM products...",for those products with a price between 60 and...,mark bar encoding x name y aggregate none code...,"SELECT Name , Code FROM products WHERE price B...",products,mark bar encoding x name y aggregate none code...,True
4,university_basketball,Pie,Easy,"Visualize PIE SELECT Affiliation , sum(enrollm...",display a pie chart for what are the total enr...,mark arc encoding x affiliation y aggregate su...,"SELECT Affiliation , sum(enrollment) FROM univ...",university,mark arc encoding x affiliation y aggregate su...,True
...,...,...,...,...,...,...,...,...,...,...
1644,local_govt_in_alabama,Pie,Easy,"Visualize PIE SELECT Event_Details , COUNT(Eve...",group and count details for the events using a...,mark arc encoding x event_details y aggregate ...,"SELECT Event_Details , COUNT(Event_Details) FR...",events,mark arc encoding x event_details y aggregate ...,True
1645,bike_1,Line,Hard,"Visualize LINE SELECT date , COUNT(date) FROM ...",please show the trend about the number of days...,mark line encoding x date y aggregate count da...,"SELECT date , COUNT(date) FROM weather WHERE m...",weather,mark line encoding x date y aggregate count da...,True
1646,hr_1,Bar,Extra Hard,"Visualize BAR SELECT HIRE_DATE , COUNT(HIRE_DA...",for all employees who have the letters d or s ...,mark bar encoding x hire_date y aggregate coun...,"SELECT HIRE_DATE , COUNT(HIRE_DATE) FROM emplo...",employees,mark bar encoding x hire_date y aggregate coun...,True
1647,bike_1,Line,Hard,"Visualize LINE SELECT date , COUNT(date) FROM ...",please show the trend about the number of days...,mark line encoding x date y aggregate count da...,"SELECT date , COUNT(date) FROM weather WHERE m...",weather,mark line encoding x date y aggregate count da...,True


In [23]:
# Matched / unmatched counts per hardness level, one column per level
hardness_levels = ("Easy", "Medium", "Hard", "Extra Hard")

pd.concat(
    [
        preds_df.loc[preds_df["hardness"] == level, "exact_matched"]
        .value_counts()
        .rename(level)
        for level in hardness_levels
    ],
    axis=1,
)


Unnamed: 0,Easy,Medium,Hard,Extra Hard
True,527,677,183,101
False,42,58,22,39


In [24]:
# Matched / unmatched counts per chart type, one column per type in order of
# first appearance in the test split
chart_types = preds_df["chart"].unique()

pd.concat(
    [
        preds_df.loc[preds_df["chart"] == chart_type, "exact_matched"]
        .value_counts()
        .rename(chart_type)
        for chart_type in chart_types
    ],
    axis=1,
)


Unnamed: 0,Bar,Pie,Line,Scatter,Grouping Line,Stacked Bar,Grouping Scatter
True,1143,133,73,29,31,50,29
False,91,22,24,13,5,2,4


## Push Model


In [25]:
# Upload the trained model through the trainer when pushing is enabled
if push_model_to_huggingface_hub:
    # NOTE(review): presumably kept for interactive login when no Hub
    # credentials are available — confirm before removing
    # huggingface_hub.notebook_login()

    trainer.push_to_hub()


Saving model checkpoint to ../data/vxnli-v0-user-study
Configuration saved in ../data/vxnli-v0-user-study/config.json
Model weights saved in ../data/vxnli-v0-user-study/pytorch_model.bin
tokenizer config file saved in ../data/vxnli-v0-user-study/tokenizer_config.json
Special tokens file saved in ../data/vxnli-v0-user-study/special_tokens_map.json
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/kwkty/vxnli-v0-user-study
   1fdfca4..9dda6e8  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}
To https://huggingface.co/kwkty/vxnli-v0-user-study
   9dda6e8..c24914b  main -> main

