# 06. Model v1

Model v1 is an ML model for the proposed UI, which accepts "any" inputs.

We fine-tune model-v0 with the custom dataset annotated in the previous notebook.
We modify the tokenizer from the original one to support variable length and keyword arguments.

As a result, the exact match ratio reaches ~62% on the test dataset.
This model is uploaded to [Huggingface Hub](https://huggingface.co/kwkty/vxnli-v1).
We also fine-tune TAPEX itself, instead of model-v0, with the custom dataset.
However, its performance is slightly lower (~60%) than that of the former.

We use model-v1 in the final user study to compare our proposed interface with typical V-NLI.


## Setup

### Define Parameters


In [1]:
# Notebook-level parameters; everything tunable from outside lives in this cell.
# Root data directory: dataset, model and result paths are derived from it below.
data_dir: str = "../data/"
# When True, the trainer is created with push_to_hub enabled and the final model is pushed.
push_model_to_huggingface_hub: bool = True
# When True, training is reported to Weights & Biases; otherwise reporting is set to "none".
use_wandb: bool = True


### Load Modules

In [2]:
import functools
import multiprocessing
import os
import pandas as pd
import sqlite3

from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import evaluate
import datasets
import numpy as np
import torch
import transformers

from datasets import Dataset, DatasetDict
from transformers import (
    BartConfig,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    EvalPrediction,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TapexTokenizer,
    trainer_utils,
)


In [3]:
# Fix the random seed up front so that runs are repeatable.
transformers.set_seed(123)


### Define Variables

In [4]:
# Paths

DATA_DIR: Path = Path(data_dir)

# Used to name the dataset, model and result directories below
MODEL_NAME: str = "vxnli-v1"

# One sub-directory per database: <db_id>/<db_id>.sqlite (see load_table)
DATABASE_DIR: Path = DATA_DIR.joinpath("datasets/nvBench/database")
# Directory holding the {train,test,val}.ndjson files read by load_vxnli_dataset
DATASET_DIR: Path = DATA_DIR.joinpath(f"datasets/{MODEL_NAME}/")
# On-disk cache of the tokenized DatasetDict
DATASET_OUTPUT_DIR: Path = DATA_DIR.joinpath(f"datasets/{MODEL_NAME}.hf/")

# Trainer output directory (checkpoints and the final model)
MODEL_OUTPUT_DIR: Path = DATA_DIR.joinpath(f"models/{MODEL_NAME}/")
# Where prediction.csv is written
RESULT_OUTPUT_DIR: Path = DATA_DIR.joinpath(f"results/{MODEL_NAME}/")

# Model Parameters

# Alternative base checkpoint (plain TAPEX instead of model-v0):
# BASE_MODEL: str = "microsoft/tapex-base-finetuned-wtq"
BASE_MODEL: str = "kwkty/vxnli-v0"

# Token limit for the tokenized (table, query) input; also passed as max_length to the model config
MAX_SOURCE_LENGTH: int = 1024
# Token limit for the tokenized vega_zero labels and for generation in predict()
MAX_TARGET_LENGTH: int = 124


In [5]:
# parents=True: the intermediate "results/" directory may not exist yet on a fresh
# checkout, in which case a plain mkdir() raises FileNotFoundError.
RESULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

### Load Tokenizer


In [6]:
# Start from the tokenizer shipped with the base checkpoint.
tokenizer = TapexTokenizer.from_pretrained(
    BASE_MODEL, use_fast=True, add_prefix_space=True
)

# Register the markers emitted by preprocess_query ("[arg]", "[kwarg]", "[eq]") as
# special tokens. The cell output is the return value of add_special_tokens.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["[arg]", "[kwarg]", "[eq]"]}
)


0

### Load Model

In [7]:
# Load the base checkpoint's config with a few overrides.
# NOTE(review): max_length is set to MAX_SOURCE_LENGTH here, whereas predict() passes
# MAX_TARGET_LENGTH explicitly — confirm the source length is the intended value.
model_config = BartConfig.from_pretrained(
    BASE_MODEL,
    no_repeat_ngram_size=0,
    max_length=MAX_SOURCE_LENGTH,
    early_stopping=False,
)

model = BartForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    config=model_config,
)

# Keep the embedding matrix the same size as the tokenizer vocabulary
# (special tokens were registered on the tokenizer above).
model.resize_token_embeddings(len(tokenizer))


Embedding(50268, 768)

## Preprocess Dataset


In [8]:
def load_table(db_id: str, table_name: str) -> pd.DataFrame:
    """Read one whole table from a database into a DataFrame.

    Args:
        db_id: Database identifier; the file is expected at
            ``DATABASE_DIR/<db_id>/<db_id>.sqlite``.
        table_name: Name of the table to read. It is interpolated into the SQL
            text as-is, so it must come from a trusted source.

    Returns:
        All rows and columns of the table.
    """
    db_path = DATABASE_DIR.joinpath(f"{db_id}/{db_id}.sqlite")

    # Using a sqlite3 connection as a context manager only commits/rolls back the
    # transaction; it does not close the connection. Close it explicitly so that
    # loading many tables does not leak open database handles.
    con = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", con)
    finally:
        con.close()


# Example: show the first rows of one table to check that loading works
load_table("customers_and_products_contacts", "products").head()


Unnamed: 0,product_id,product_type_code,product_name,product_price
0,1,Hardware,Apple,54753980.0
1,2,Clothes,jcrew,30590930.0
2,3,Hardware,Apple,10268.85
3,4,Hardware,Apple,22956670.0
4,5,Clothes,jcrew,5927022.0


In [9]:
def preprocess_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return a lower-cased, all-string copy of ``df``.

    Column names are lower-cased, every value is converted to ``str`` and then
    lower-cased as well. The input frame is left untouched.
    """
    normalized = df.rename(columns=lambda name: name.lower())

    # The TAPEX tokenizer raises an error when the table contains non-str columns
    normalized = normalized.astype(str)

    for name in normalized.columns:
        normalized[name] = normalized[name].str.lower()

    return normalized


# Example: the same table as above after lower-casing and string conversion
preprocess_table(load_table("customers_and_products_contacts", "products").head())


Unnamed: 0,product_id,product_type_code,product_name,product_price
0,1,hardware,apple,54753982.574522
1,2,clothes,jcrew,30590929.528306
2,3,hardware,apple,10268.85297069
3,4,hardware,apple,22956668.699482
4,5,clothes,jcrew,5927021.8748021


In [10]:
# functools.cache would do the same on python3.9+; lru_cache keeps python3.7+ working
@functools.lru_cache(maxsize=None)
def load_and_preprocess_table(db_id: str, table_name: str) -> pd.DataFrame:
    """Load a table and normalise it, memoised per ``(db_id, table_name)``.

    Repeated calls with the same arguments return the same cached DataFrame
    object, so callers should treat the result as read-only.
    """
    return preprocess_table(load_table(db_id, table_name))


In [11]:
def preprocess_dataset(example: Dict[str, Any]) -> Dict[str, torch.Tensor]:
    """Tokenize a single example into model inputs plus ``labels``.

    Reads ``db_id``, ``table``, ``query`` and ``vega_zero`` from ``example``.
    The (table, query) pair is tokenized with a limit of MAX_SOURCE_LENGTH and
    the ``vega_zero`` target with a limit of MAX_TARGET_LENGTH.

    Returns:
        The tokenizer output for the source side, with the target's
        ``input_ids`` stored under ``"labels"``.
    """
    source_table = load_and_preprocess_table(example["db_id"], example["table"])
    source_query = example["query"]
    target = example["vega_zero"]

    shared_options = {"padding": True, "truncation": True}

    # NOTE(review): the target is also handed to the source-side call, and this
    # function is mapped over every split (validation/test included). Presumably the
    # tokenizer only uses it when truncating large tables — confirm that this does
    # not leak label information at evaluation time.
    encoded = tokenizer(
        table=source_table,
        query=source_query,
        answer=target,
        max_length=MAX_SOURCE_LENGTH,
        **shared_options,
    )

    target_encoded = tokenizer(
        answer=target,
        max_length=MAX_TARGET_LENGTH,
        **shared_options,
    )
    encoded["labels"] = target_encoded["input_ids"]

    return encoded


In [12]:
def preprocess_query(*args, **kwargs) -> str:
    """Serialise positional and keyword arguments into one lower-cased query string.

    Each positional argument is prefixed with ``[arg]`` and each keyword argument
    is rendered as ``[kwarg] <name> [eq] <value>``. Both prefixes are always
    emitted, even when there are no positional or keyword arguments.
    """
    positional_part = "[arg] " + " [arg] ".join(map(str, args))
    keyword_part = "[kwarg] " + " [kwarg] ".join(
        f"{name} [eq] {value}" for name, value in kwargs.items()
    )

    return (positional_part + " " + keyword_part).lower()


In [13]:
def load_vxnli_dataset(subset: str) -> Dataset:
    """Load ``<subset>.ndjson`` from DATASET_DIR and build the ``query`` column.

    The ``args`` and ``kwargs`` columns of each row are serialised with
    preprocess_query into ``query`` and then dropped.
    """

    def to_query(row) -> str:
        return preprocess_query(*row["args"], **row["kwargs"])

    # pandas is used for reading because datasets.load_dataset("json", PATH) raises a
    # JSON parse error, probably because it cannot handle the list/dict-typed
    # args and kwargs columns
    frame = pd.read_json(DATASET_DIR.joinpath(f"{subset}.ndjson"), lines=True)
    frame["query"] = frame.apply(to_query, axis=1)

    return Dataset.from_pandas(frame.drop(columns=["args", "kwargs"]))


In [14]:
# Build the tokenized dataset once and cache it on disk; later runs reuse the cache.
if not DATASET_OUTPUT_DIR.exists():
    dataset = DatasetDict(
        {
            "train": load_vxnli_dataset("train"),
            "test": load_vxnli_dataset("test"),
            "validation": load_vxnli_dataset("val"),
        }
    ).map(
        preprocess_dataset,
        batched=False,
        num_proc=multiprocessing.cpu_count(),
    )

    # save_to_disk doesn't support pathlib.Path
    dataset.save_to_disk(str(DATASET_OUTPUT_DIR))
else:
    # load_from_disk doesn't support pathlib.Path
    dataset = datasets.load_from_disk(str(DATASET_OUTPUT_DIR))

dataset


DatasetDict({
    train: Dataset({
        features: ['db_id', 'table', 'chart', 'hardness', 'vega_zero', 'query', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1050
    })
    test: Dataset({
        features: ['db_id', 'table', 'chart', 'hardness', 'vega_zero', 'query', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 225
    })
    validation: Dataset({
        features: ['db_id', 'table', 'chart', 'hardness', 'vega_zero', 'query', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 225
    })
})

## Training


In [15]:
# Batch collator for training/evaluation. Labels are padded with -100, the value that
# compute_metrics maps back to the pad token before decoding.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)


In [16]:
exact_match = evaluate.load("exact_match")


def compute_metrics(eval_pred: EvalPrediction):
    """Compute the exact-match score between generated and reference sequences.

    Args:
        eval_pred: Pair of (predictions, label ids) produced by the trainer when
            ``predict_with_generate`` is enabled.

    Returns:
        The result of the ``exact_match`` metric on the decoded, stripped strings.
    """
    preds, labels = eval_pred

    # Predictions may arrive as a tuple; the generated ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the predictions and labels as we can't decode them.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    # Strip surrounding whitespace, as predict() does, so that the score during
    # training is computed on the same normalised strings.
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return exact_match.compute(predictions=preds, references=labels)


In [17]:
# Fine-tune on the train split and evaluate on the validation split after every epoch.
trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        # Evaluate by generating sequences so that compute_metrics can decode them
        predict_with_generate=True,
        # Upper bound only; the early-stopping callback below can end training sooner
        num_train_epochs=50,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        # Restore the checkpoint with the best validation exact_match when training ends
        load_best_model_at_end=True,
        do_eval=True,
        metric_for_best_model="exact_match",
        push_to_hub=push_model_to_huggingface_hub,
        report_to="wandb" if use_wandb else "none",
    ),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5),
    ],
)


/home/jupyter/vxnli/notebooks/../data/models/vxnli-v1 is already a clone of https://huggingface.co/kwkty/vxnli-v1. Make sure you pull the latest changes with `repo.git_pull()`.


In [18]:
# Run fine-tuning; the cell output is the TrainOutput summary.
trainer.train()


The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, hardness, query, vega_zero. If table, chart, db_id, hardness, query, vega_zero are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1050
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6600
  Number of trainable parameters = 139422720
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mkwkty[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Exact Match
1,0.1159,0.193996,0.546667
2,0.0454,0.170474,0.573333
3,0.026,0.252346,0.555556
4,0.0198,0.232647,0.56
5,0.0185,0.211964,0.617778
6,0.0146,0.215054,0.555556
7,0.012,0.225278,0.56
8,0.0087,0.261872,0.564444
9,0.0047,0.259646,0.537778
10,0.0099,0.225496,0.564444


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, hardness, query, vega_zero. If table, chart, db_id, hardness, query, vega_zero are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 225
  Batch size = 8
Saving model checkpoint to ../data/models/vxnli-v1/checkpoint-132
Configuration saved in ../data/models/vxnli-v1/checkpoint-132/config.json
Model weights saved in ../data/models/vxnli-v1/checkpoint-132/pytorch_model.bin
tokenizer config file saved in ../data/models/vxnli-v1/checkpoint-132/tokenizer_config.json
Special tokens file saved in ../data/models/vxnli-v1/checkpoint-132/special_tokens_map.json
added tokens file saved in ../data/models/vxnli-v1/checkpoint-132/added_tokens.json
tokenizer config file saved in ../data/models/vxnli-v1/tokenizer_config.json
Special tokens file save

TrainOutput(global_step=1320, training_loss=0.02755795708208373, metrics={'train_runtime': 914.2834, 'train_samples_per_second': 57.422, 'train_steps_per_second': 7.219, 'total_flos': 6170953342095360.0, 'train_loss': 0.02755795708208373, 'epoch': 10.0})

## Evaluation

In [19]:
# Evaluate on the held-out test split.
# trainer.evaluate must be called for the model card

trainer.evaluate(dataset["test"])


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, hardness, query, vega_zero. If table, chart, db_id, hardness, query, vega_zero are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 225
  Batch size = 8


{'eval_loss': 0.2780759036540985,
 'eval_exact_match': 0.6222222222222222,
 'eval_runtime': 20.6895,
 'eval_samples_per_second': 10.875,
 'eval_steps_per_second': 1.402,
 'epoch': 10.0}

In [20]:
def predict(ds: Dataset) -> List[str]:
    """Generate a prediction for every example in ``ds`` and decode it to text.

    Generation is limited to MAX_TARGET_LENGTH. Special tokens are dropped while
    decoding and surrounding whitespace is stripped from each result.
    """
    output = trainer.predict(ds, max_length=MAX_TARGET_LENGTH)

    decoded = tokenizer.batch_decode(
        output.predictions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return list(map(str.strip, decoded))


In [21]:
# Predict on the test split
preds = predict(dataset["test"])

# Eyeball the first few predictions next to their references
preds[:5], dataset["test"]["vega_zero"][:5]


The following columns in the test set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: table, chart, db_id, hardness, query, vega_zero. If table, chart, db_id, hardness, query, vega_zero are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 225
  Batch size = 8


(['mark bar encoding x name y aggregate none weight transform sort y asc',
  'mark bar encoding x name y aggregate none weight transform sort x asc',
  'mark bar encoding x name y aggregate none weight transform sort x asc',
  'mark point encoding x investor_id y aggregate mean share_count transform group x',
  'mark point encoding x investor_id y aggregate mean share_count transform group x'],
 ['mark bar encoding x name y aggregate none weight transform sort x asc',
  'mark bar encoding x name y aggregate none weight transform sort x asc',
  'mark bar encoding x name y aggregate none weight transform sort x asc',
  'mark point encoding x investor_id y aggregate mean share_count transform group x',
  'mark point encoding x investor_id y aggregate mean share_count transform group x'])

In [22]:
# Exact match of the decoded predictions against the raw vega_zero strings
exact_match.compute(
    predictions=preds,
    references=dataset["test"]["vega_zero"],
)


{'exact_match': 0.6222222222222222}

In [23]:
# Attach the predictions to the test examples and save them for later analysis
preds_df = dataset["test"].to_pandas()
# Drop the tokenized columns; the remaining fields are kept
preds_df = preds_df.drop(columns=["input_ids", "attention_mask", "labels"])
preds_df["pred"] = preds
# Per-example flag: prediction is string-identical to the reference
preds_df["exact_matched"] = preds_df["pred"] == preds_df["vega_zero"]

preds_df.to_csv(RESULT_OUTPUT_DIR.joinpath("prediction.csv"))

preds_df


Unnamed: 0,db_id,table,chart,hardness,vega_zero,query,pred,exact_matched
0,candidate_poll,people,bar,Easy,mark bar encoding x name y aggregate none weig...,[arg] [kwarg] use_bar_chart [eq] true [kwarg]...,mark bar encoding x name y aggregate none weig...,False
1,candidate_poll,people,bar,Easy,mark bar encoding x name y aggregate none weig...,[arg] use a bar chart [kwarg] x [eq] name [kwa...,mark bar encoding x name y aggregate none weig...,True
2,candidate_poll,people,bar,Easy,mark bar encoding x name y aggregate none weig...,[arg] [kwarg] graph [eq] bar [kwarg] x [eq] n...,mark bar encoding x name y aggregate none weig...,True
3,tracking_share_transactions,transactions,point,Easy,mark point encoding x investor_id y aggregate ...,[arg] scatter chart [arg] investor id and mean...,mark point encoding x investor_id y aggregate ...,True
4,tracking_share_transactions,transactions,point,Easy,mark point encoding x investor_id y aggregate ...,[arg] [kwarg] graph_type [eq] scatter [kwarg]...,mark point encoding x investor_id y aggregate ...,True
...,...,...,...,...,...,...,...,...
220,train_station,station,bar,Medium,mark bar encoding x location y aggregate sum n...,[arg] [kwarg] chart [eq] bar [kwarg] x [eq] p...,mark bar encoding x main_home y aggregate sum ...,False
221,train_station,station,bar,Medium,mark bar encoding x location y aggregate sum n...,[arg] return sum(number_of_platforms) per loca...,mark bar encoding x location y aggregate sum n...,True
222,customers_and_invoices,accounts,line,Medium,mark line encoding x date_account_opened y agg...,[arg] line chart [arg] the number of accounts ...,mark line encoding x date_account_opened y agg...,True
223,customers_and_invoices,accounts,line,Medium,mark line encoding x date_account_opened y agg...,[arg] [kwarg] x [eq] date_account_opened (tim...,mark line encoding x date_account_opened y agg...,True


In [24]:
# Matched / unmatched counts per hardness level (one column per level)
pd.concat(
    [
        preds_df.loc[preds_df["hardness"] == level, "exact_matched"]
        .value_counts()
        .rename(level)
        for level in ("Easy", "Medium", "Hard", "Extra Hard")
    ],
    axis=1,
)


Unnamed: 0,Easy,Medium,Hard,Extra Hard
True,57,74,7,2
False,21,34,14,16


In [25]:
# Matched / unmatched counts per chart type (one column per type found in the data)
pd.concat(
    [
        preds_df.loc[preds_df["chart"] == chart_type, "exact_matched"]
        .value_counts()
        .rename(chart_type)
        for chart_type in preds_df["chart"].unique()
    ],
    axis=1,
)


Unnamed: 0,bar,point,arc,line
True,96,17,15.0,12
False,63,13,,9


## Push Model


In [26]:
# Publish the final model only when enabled in the parameters cell
if push_model_to_huggingface_hub:
    trainer.push_to_hub()


Saving model checkpoint to ../data/models/vxnli-v1
Configuration saved in ../data/models/vxnli-v1/config.json
Model weights saved in ../data/models/vxnli-v1/pytorch_model.bin
tokenizer config file saved in ../data/models/vxnli-v1/tokenizer_config.json
Special tokens file saved in ../data/models/vxnli-v1/special_tokens_map.json
added tokens file saved in ../data/models/vxnli-v1/added_tokens.json
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/kwkty/vxnli-v1
   0e00d00..8e20aa5  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}
To https://huggingface.co/kwkty/vxnli-v1
   8e20aa5..da6f21a  main -> main

