In [1]:
from typing import Dict, Any
from datasets import load_dataset, concatenate_datasets
import evaluate

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

## Prepare Data

In [None]:
# Load the WikiSQL dataset; the result holds the train / validation / test splits.
dataset = load_dataset("wikisql")

In [3]:
# Explore the dataset: show the available splits with their features and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 15878
    })
    validation: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 8421
    })
    train: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 56355
    })
})

In [4]:
# Show one raw training example (phase, question, table and sql fields).
dataset["train"][0]

{'phase': 1,
 'question': 'Tell me what the notes are for South Australia ',
 'table': {'header': ['State/territory',
   'Text/background colour',
   'Format',
   'Current slogan',
   'Current series',
   'Notes'],
  'page_title': '',
  'page_id': '',
  'types': ['text', 'text', 'text', 'text', 'text', 'text'],
  'id': '1-1000181-1',
  'section_title': '',
  'caption': '',
  'rows': [['Australian Capital Territory',
    'blue/white',
    'Yaa·nna',
    'ACT · CELEBRATION OF A CENTURY 2013',
    'YIL·00A',
    'Slogan screenprinted on plate'],
   ['New South Wales',
    'black/yellow',
    'aa·nn·aa',
    'NEW SOUTH WALES',
    'BX·99·HI',
    'No slogan on current series'],
   ['New South Wales',
    'black/white',
    'aaa·nna',
    'NSW',
    'CPX·12A',
    'Optional white slimline series'],
   ['Northern Territory',
    'ochre/white',
    'Ca·nn·aa',
    'NT · OUTBACK AUSTRALIA',
    'CB·06·ZZ',
    'New series began in June 2011'],
   ['Queensland',
    'maroon/white',
    'nnn·aaa

In [3]:
# Tokenizer of the same "google/flan-t5-base" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [4]:
table_prefix = "table:"
question_prefix = "question:"

def preprocess_function(examples: Dict[str, Any]):
    """Turn a batch of WikiSQL rows into tokenized model inputs and labels.

    Every prompt has the form
        {question_prefix} {natural_question} {table_prefix} {table_schema}
    where the table schema is the comma-joined list of column headers.
    The label of each prompt is the human-readable SQL statement.

    Args:
        examples (Dict[str, Any]): a batch of dataset rows (column name -> list
            of values), as passed by ``Dataset.map(..., batched=True)``

    Returns:
        output from tokenizer (inputs plus the tokenized targets)
    """
    # One comma-joined schema string per table.
    schemas = []
    for table in examples["table"]:
        schemas.append(",".join(table["header"]))

    # Non-breaking spaces in the questions are replaced by plain spaces.
    questions = []
    for raw_question in examples["question"]:
        questions.append(raw_question.replace("\xa0", " "))

    assert len(schemas) == len(questions)

    prompts = [
        f"{question_prefix} {question} {table_prefix} {schema}"
        for question, schema in zip(questions, schemas)
    ]
    sql_targets = [entry["human_readable"] for entry in examples["sql"]]
    return tokenizer(prompts, text_target=sql_targets, max_length=512, truncation=True)

In [None]:
# Tokenize every split with the same settings; the raw columns are dropped so
# that only the tokenizer outputs remain.
raw_columns = ("phase", "question", "table", "sql")
train_dataset, test_dataset, val_dataset = (
    dataset[split_name].map(
        preprocess_function,
        batched=True,
        remove_columns=list(raw_columns),
    )
    for split_name in ("train", "test", "validation")
)

## Training

In [7]:
# Load the pretrained "google/flan-t5-base" checkpoint as the starting point for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# Move the model to the GPU; this line needs a CUDA device to be available.
model = model.cuda()

In [8]:
# Collator that turns the tokenized examples into seq2seq batches.
# NOTE(review): preprocess_function does not pad, so batching presumably relies on this
# collator for padding - confirm against the DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [9]:
# Training configuration passed to the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # evaluate, log and checkpoint on the same 1000-step cadence
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=5e-6,
    # 8 samples per device x 4 accumulation steps = 32 samples per optimizer update (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    warmup_ratio=0.01,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    fp16=True,
    predict_with_generate=True,
    # generation_max_length=512,
    # generation_num_beams=None,
    lr_scheduler_type="cosine",
    # dataloader_num_workers=2,
    # checkpoints are compared by eval_loss, where lower is better
    # NOTE(review): load_best_model_at_end is not set here - confirm whether the final
    # weights or the best checkpoint are meant to be used after training.
    greater_is_better=False,
    metric_for_best_model="eval_loss",
)

In [10]:
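# NOTE: optional ROUGE-based compute_metrics, kept disabled to keep the training loop fast.
# Before uncommenting this cell, `rouge_score` and `sent_tokenize` must be defined/imported;
# neither of them is defined in the cells shown in this notebook.
# import numpy as np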


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     # Decode generated summaries into text
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     # Replace -100 in the labels as we can't decode them
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     # Decode reference summaries into text
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     # ROUGE expects a newline after each sentence
#     decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
#     decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
#     # Compute ROUGE scores
#     result = rouge_score.compute(
#         predictions=decoded_preds, references=decoded_labels, use_stemmer=True
#     )
#     # Extract the median scores
#     result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
#     return {k: round(v, 4) for k, v in result.items()}

In [11]:
# Build the trainer: fine-tune on the train split and evaluate on the validation split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
) # the commented-out compute_metrics function above can be enabled for evaluation; it is left out here for a faster training loop

In [None]:
# Start fine-tuning. To continue from a saved checkpoint, use the commented call below instead.
trainer.train()
# trainer.train(resume_from_checkpoint="./results/checkpoint-13000")

## Test Model

In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-13000")

In [14]:
from typing import List

# Prompt prefixes; they must stay identical to the ones used in preprocess_function.
table_prefix = "table:"
question_prefix = "question:"

def prepare_input(question: str, table: List[str], max_length: int = 700):
    """Build the prompt for one question/table pair and tokenize it.

    The prompt uses the same layout as the training data:
        {question_prefix} {question} {table_prefix} {comma-joined column names}

    Args:
        question (str): natural-language question
        table (List[str]): column names of the table the question refers to
        max_length (int): maximum number of input tokens; longer prompts are
            truncated. Defaults to 700.

    Returns:
        the ``input_ids`` of the tokenized prompt (requested as PyTorch
        tensors via ``return_tensors="pt"``)
    """
    print("question:", question)
    print("table:", table)
    # Same non-breaking-space cleanup that preprocess_function applies to the
    # training questions, so inference prompts match the training format.
    clean_question = question.replace("\xa0", " ")
    join_table = ",".join(table)
    inputs = f"{question_prefix} {clean_question} {table_prefix} {join_table}"
    # truncation=True makes the max_length cap explicit instead of relying on
    # the tokenizer's implicit fallback, which logs a warning.
    input_ids = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    return input_ids

def inference(question: str, table: List[str]) -> str:
    """Generate the SQL statement for a question over the given table columns.

    Args:
        question (str): natural-language question
        table (List[str]): column names of the table the question refers to

    Returns:
        str: the decoded model output with special tokens removed
    """
    input_data = prepare_input(question=question, table=table)
    input_data = input_data.to(model.device)
    # The model can still be in training mode right after trainer.train();
    # switch to eval mode so dropout does not perturb the generated output.
    model.eval()
    # Plain beam search. The previous top_k=10 argument was dropped because
    # top_k only takes effect when sampling (do_sample=True).
    outputs = model.generate(inputs=input_data, num_beams=10, max_length=512)
    result = tokenizer.decode(token_ids=outputs[0], skip_special_tokens=True)
    return result

In [26]:
test_id = 1000
# Fetch the test row once and reuse it for the prediction and the reference SQL.
test_example = dataset["test"][test_id]
predicted_sql = inference(test_example["question"], test_example["table"]["header"])
print("model result:", predicted_sql)
print("real result:", test_example["sql"]["human_readable"])

question: Who is the director of the episode that corresponds to the total episodes number 14? 
table: ['Total#', 'Series#', 'Title', 'Writer', 'Director', 'Original air date']
model result: SELECT Director FROM table WHERE Total# = 14
real result: SELECT Director FROM table WHERE Total# = 14


In [16]:
# Ad-hoc check with a hand-written column list instead of a WikiSQL table.
inference("what is id with name jui and age equal 25", ["id","name", "age"])

question: what is id with name jui and age equal 25
table: ['id', 'name', 'age']


'SELECT id FROM table WHERE name = jui AND age = 25'

In [17]:
# Another ad-hoc question against the hand-written id/name/age column list.
inference("get people name with age equal 25", ["id","name", "age"])

question: get people name with age equal 25
table: ['id', 'name', 'age']


'SELECT name FROM table WHERE age = 25'

## Upload Model

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook before the push_to_hub calls below.
notebook_login()

In [None]:
# Upload the model to the Hub repository.
model.push_to_hub(repo_id="juierror/flan-t5-text2sql-with-schema")

In [None]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub(repo_id="juierror/flan-t5-text2sql-with-schema")