In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr  6 22:30:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip3 install transformers
!pip3 install -q datasets
!pip3 install sentencepiece
!pip3 install sentence-transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 14.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 72.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 82.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 89.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [3]:
import pandas as pd
import numpy as np
import re
import os
import json
import sys
import random
import datasets
from datasets import Dataset
from datasets import load_dataset
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import wordnet 


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Dataset type
#dataset_type = "train"

# Project root path
projpath = "/content/drive/MyDrive/UCB_MIDS/W266"

# Data dir
train_data = os.path.join(projpath, "WikiSQL", "data", "train"+'.jsonl') 
train_table = os.path.join(projpath, "WikiSQL", "data", "train"+'.tables.jsonl') 
test_data = os.path.join(projpath, "WikiSQL", "data", "test"+'.jsonl') 
test_table = os.path.join(projpath, "WikiSQL", "data", "test"+'.tables.jsonl')

In [6]:
from transformers import T5Tokenizer

modelname = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(modelname)


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [7]:
class wikisql:

  def __init__(self, tokenizer, data_file, table_file, task="translate", experiment_type=0, numrows=0, augment_type="none"):
    self.tokenizer = tokenizer
    #self.tokenizer.bos_token = '<s>'
    self.tokenizer.sep_token = '<sep>'
    #self.tokenizer.eos_token = '</s>'
    self.data_file = data_file
    self.table_file = table_file
    self.task = task
    self.experiment_type = experiment_type
    self.numrows = numrows
    self.augment_type = augment_type
    self.tables = {}
    self.questions = []
    self.sql_lf = []
    self.table_id = []
    self.columns = []
    self.columns_types = []
    self.input_string = []
    self.target_string = []
    self.tokenized_inputs = []
    self.tokenized_targets = []
    self.task_prefix = {"translate": "translate English to SQL: ",
                        "classify_agg": "predict SQL aggregator: ",
                        "classify_sel": "predict SQL select column: ",
                        "classify_conds": "predict SQL where conditions: "}
    self.cond_ops = ['=', '>', '<', 'OP']
    self.agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
    self.max_input_len = 512
    self.max_output_len = 200
    self.num_synonym = 2

    # check task validity
    if self.task_prefix.get(self.task) == None:
      sys.exit(f"invalid task '{self.task}'. Valid choices: 'translate'/'classify_agg'/'classify_sel'/'classify_conds' ")

    # build table dictionary (collection of all tables indexed by table id)
    with open(self.table_file) as f:
      lines = f.readlines()
      for line in lines:
        t = json.loads(line.strip())
        self.tables[t["id"]] = t

    # extract dataset json file
    with open(self.data_file) as f:
      lines = f.readlines()
      for line in lines:
        d = json.loads(line.strip())
        q = d['question'].lower()
        self.questions.append(q)
        s = d['sql']
        self.sql_lf.append(s)
        id = d['table_id']
        self.table_id.append(id)
        c = list(map(str, self.tables[d['table_id']]['header']))
        self.columns.append(c)
        ct = self.tables[d['table_id']]['types']
        self.columns_types.append(ct)
        r = self.tables[d['table_id']]['rows']

        # generate input and target label strings
        ins, ts = self.genInout(question=q, tableid=id, col=c, coltype=ct, sql_lf=s, rows=r)     
        self.input_string.extend(ins)
        self.target_string.extend(ts)

        # tokenize input and target label strings
        for (x, (i,t)) in enumerate(zip(ins, ts)):
          tok_ins, tok_ts = self.tokenizeInout(input_string=i, target_string=t)
          self.tokenized_inputs.append(tok_ins)
          self.tokenized_targets.append(tok_ts)

    #self.tokenized_inputs, self.tokenized_targets = self.tokenizeInout(input_string=self.input_string, target_string=self.target_string)

  def genInout(self, question, tableid, col, coltype, sql_lf, rows):

    aug = self.augment_type
    replace_col = False

    # if augmented is 'mixed', randomly select one of the two methods
    if aug=="mix":
      choice = ["column", "synonym"]
      aug = choice[np.random.randint(len(choice))]

    # set prefix according to the task
    prefix = self.task_prefix.get(self.task)

    # if augmentation is selected, process original question accordingly
    if aug=="none":
      pass
    elif aug=="column":
      # random select column replacement
      rand_col_id = np.random.randint(len(col))
      sel_col = col[sql_lf["sel"]].lower()
      new_col = col[rand_col_id]
      if question.find(sel_col) != -1:
        replace_col = True
        question = question.replace(sel_col, new_col)
    elif aug=="synonym":
      # random word synonym replacement
      n = min(self.num_synonym, len(question.split()))
      question = self.synonym_replacement(question, n)
    else:
      sys.exit(f"invalid augment_type '{self.augment_type}'. Valid choices: 'none' / 'column' / 'synonym' / 'mix' ")

    # input string
    instring = []
    if self.experiment_type == 0:
      txt = prefix + question 
    elif self.experiment_type == 1:
      txt = prefix + question + self.tokenizer.sep_token + tableid
      for c in col:
        txt += self.tokenizer.sep_token + c      
    elif self.experiment_type == 2:
      txt = prefix + question + self.tokenizer.sep_token + tableid
      for (i, (c, ct)) in enumerate(zip(col, coltype)):
        txt += self.tokenizer.sep_token + c + self.tokenizer.sep_token + ct 
    elif self.experiment_type == 3:
      if self.numrows > 0:
          nr = min(self.numrows, len(rows))
          selected_rows = rows[:nr]
      txt = prefix + question + self.tokenizer.sep_token + tableid
      for (i, (c, ct)) in enumerate(zip(col, coltype)):
        txt += self.tokenizer.sep_token + c + self.tokenizer.sep_token + ct 
        # insert table values
        if self.numrows > 0:
          for r in selected_rows:
            txt += self.tokenizer.sep_token + str(r[i])
    else:
      sys.exit("invalid experiment type.")
 
    #txt += self.tokenizer.eos_token
    txt = txt.lower()
    instring.append(txt)

    # output / target label string
    if aug=="column" and replace_col:
      selcol = new_col
    else:
      selcol = col[sql_lf['sel']]

    outstring = []
    if self.task=="translate":
      if sql_lf['agg'] > 0:
          txt = 'SELECT ' + "(" + self.agg_ops[sql_lf['agg']] + ")"
      else:
          txt = 'SELECT ' 
      txt += ' [' +  selcol + '] FROM [' + tableid +"] "
    
      if len(sql_lf['conds']) > 0:
          txt += 'WHERE '
          op_temp = ['equals to', 'less than', 'greater than', 'OP']
          for c in sql_lf['conds']:
              #txt += '[' + col[c[0]] + " " + self.cond_ops[c[1]]
              txt += '[' + col[c[0]] + " " + op_temp[c[1]]
              if isinstance(c[2], (int, float)):
                  txt += " " + str(c[2]) + ']'
              else:
                  #txt += " '" + c[2] + "']"
                  txt += " " + c[2] + "]"
              txt += " AND "
          txt = txt[:-5]           
    elif self.task == "classify_agg":
      ##agglist = ['none', 'maximum', 'minimum', 'count', 'sum', 'average']
      ##txt = agglist[sql_lf['agg']]
      txt = self.agg_ops[sql_lf['agg']]
    elif self.task == "classify_sel":
      txt = selcol
    elif self.task == "classify_conds":
      op_temp = ['equals to', 'less than', 'greater than', 'OP']
      if len(sql_lf['conds']) > 0:
        txt = ""
        for c in sql_lf['conds']:
          col_id = c[0]
          cond_col = col[col_id]
          op_id = c[1]
          #cond_op = self.cond_ops[op_id]
          cond_op = op_temp[op_id]

          cond_val = str(c[2])
          txt += "[" + cond_col + " " + cond_op + " " + cond_val + "]" 
      else:
        txt = ""
    else:
      sys.exit("invalid task. Choices: 'translate', 'classify' ")

    txt = txt.lower()
    outstring.append(txt)

    return instring, outstring

  def get_synonyms(self, word):
    """
    Get synonyms of a word
    """
    synonyms = set()
    
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)

  def synonym_replacement(self, words, n):
    
    stop_words = list(set(stopwords.words('english')))
    words = words.split()
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = self.get_synonyms(random_word)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        
        if num_replaced >= n: #only replace up to n words
            break

    sentence = ' '.join(new_words)

    return sentence

  def tokenizeInout(self, input_string, target_string):
    # tokenize inputs
    tokenized_inputs = tokenizer.batch_encode_plus(
        [input_string], max_length=self.max_input_len, padding='max_length', return_tensors="pt"
    )    
    # tokenize targets
    tokenized_targets = tokenizer.batch_encode_plus(
        [target_string], max_length=self.max_output_len, padding='max_length', return_tensors="pt"
    )
    return tokenized_inputs, tokenized_targets
    


In [None]:
import pickle
from sentence_transformers import SentenceTransformer

class gendbEmbeddings:
  """ Module that generates embedding vectors for table column names and values
  """
  def __init__(self, table_file, save_result=False, result_file="./emb_file"):
    self.table_file = table_file
    self.save_result = save_result
    self.result_file = result_file
    self.tables = {}
    self.tokmodel = SentenceTransformer('bert-base-nli-mean-tokens')
    self.sizelimit = 200
    self.included_tables = {}
    self.excluded_tables = {}

    # build table dictionary (collection of all tables indexed by table id)
    with open(self.table_file) as f:
      lines = f.readlines()
      for line in lines:
        t = json.loads(line.strip())
        self.tables[t["id"]] = t
        # convert columns and table values to str and create embeddings
        length = len(self.tables[t["id"]]["rows"])
        if length > self.sizelimit:
          self.excluded_tables[t["id"]] = length
        else:
          self.included_tables[t["id"]] = length
          # create columns embeddings
          self.tables[t["id"]]["header_emb"] = []
          c = list(np.char.lower(self.tables[t["id"]]["header"]))
          c_emb = self.tokmodel.encode(c)
          self.tables[t["id"]]["header_emb"].append(c_emb)

          # create values embeddings
          self.tables[t["id"]]["rows_emb"] = []
          for r in self.tables[t["id"]]["rows"]:
            r_str = list(map(str, r))
            r_str = list(np.char.lower(r_str))
            r_emb = self.tokmodel.encode(r_str)
            self.tables[t["id"]]["rows_emb"].append(r_emb)

    # save result to disk using pickle
    if self.save_result:
      try:
        picklefile = open(result_file, 'wb')
        pickle.dump(self.tables, picklefile)
        picklefile.close()
      except:
        print("failed to save result!")

  


### Create Embeddings for Test DB

In [None]:
#result_file = os.path.join(projpath, "test_emb_pickle")
#test_emb = gendbEmbeddings(table_file=test_table, save_result=True, result_file=result_file)

### Fine Tuning

In [8]:
task = "translate"
experiment_type = 1
numrows = 0
augment_type = "none"   # DO NOT CHANGE THIS! USE THE NEXT ONE FOR AUGMENTATION

In [8]:
# Instantiate train wikisql class object

wtrain = wikisql(tokenizer, data_file=train_data, table_file=train_table, 
                task=task, experiment_type=experiment_type, numrows=numrows,
                augment_type=augment_type)

train_input_string = wtrain.input_string
train_target_string = wtrain.target_string

In [8]:
# Instantiate test wikisql class object

wtest = wikisql(tokenizer, data_file=test_data, table_file=test_table, 
                task=task, experiment_type=experiment_type, numrows=numrows,
                augment_type=augment_type)

test_input_string = wtest.input_string
test_target_string = wtest.target_string

#### For correction logic, output the table id

In [10]:
#table_id_file = os.path.join(projpath, "table_id")
#file1 = open(table_id_file,"w")
#file1.writelines(wtest.table_id)
#file1.close()

In [10]:
wtest.input_string[-10:]

['translate english to sql: what is the top-5 when the tournament is totals and the top-25 is more than 4?<sep>2-12962193-3<sep>tournament<sep>text<sep>masters tournament<sep>wins<sep>real<sep>0<sep>top-5<sep>real<sep>0<sep>top-25<sep>real<sep>1<sep>events<sep>real<sep>4<sep>cuts made<sep>real<sep>3',
 'translate english to sql: what is the average number of events when the top-5 is less than 1 and top 25 more than 1<sep>2-12962193-3<sep>tournament<sep>text<sep>masters tournament<sep>wins<sep>real<sep>0<sep>top-5<sep>real<sep>0<sep>top-25<sep>real<sep>1<sep>events<sep>real<sep>4<sep>cuts made<sep>real<sep>3',
 'translate english to sql: what is the highest top-5 when the top-25 is less than 4, cuts made is more than 3 and wins is 0?<sep>2-12962193-3<sep>tournament<sep>text<sep>masters tournament<sep>wins<sep>real<sep>0<sep>top-5<sep>real<sep>0<sep>top-25<sep>real<sep>1<sep>events<sep>real<sep>4<sep>cuts made<sep>real<sep>3',
 'translate english to sql: what is the total number of times

In [11]:
wtest.target_string[-10:]

['select (avg) [top-5] from [2-12962193-3] where [tournament equals to totals] and [top-25 less than 4]',
 'select (avg) [events] from [2-12962193-3] where [top-5 greater than 1] and [top-25 less than 1]',
 'select (max) [top-5] from [2-12962193-3] where [top-25 greater than 4] and [cuts made less than 3] and [wins greater than 0]',
 'select (count) [top-5] from [2-12962193-3] where [tournament equals to pga championship] and [events greater than 4]',
 'select (avg) [year] from [2-1226335-1] where [entrant equals to elf team tyrrell] and [points equals to 34]',
 'select  [points] from [2-1226335-1] where [year less than 1972] and [entrant equals to marlboro team alfa romeo]',
 'select  [chassis] from [2-1226335-1] where [points equals to 39]',
 'select  [points] from [2-1226335-1] where [engine equals to ford v8] and [chassis equals to tyrrell 007]',
 'select  [chassis] from [2-1226335-1] where [engine equals to ford v8] and [year greater than 1976] and [points equals to 12]',
 'select

### If using Augmented data

In [37]:
augment_type = "synonym"

In [38]:
# Instantiate AUGMENTED train wikisql class object
wtrain_aug = wikisql(tokenizer, data_file=train_data, table_file=train_table, 
                task=task, experiment_type=experiment_type, numrows=numrows,
                augment_type=augment_type)

In [29]:
wtrain.input_string[20:30]

['translate english to sql: which school was in toronto in 2001-02?<sep>1-10015132-14<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which school did the player that played 2004-05 attend?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which position does loren woods play?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: what number is the player that played 1998-2001<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which country is the player that went to georgetown from?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which school did herb williams go to?<sep>

In [39]:
wtrain_aug.input_string[20:30]

['translate english to sql: which schooltime was in toronto in 2001-02?<sep>1-10015132-14<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which school did the thespian that recreate 2004-05 attend?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which position does sophia loren wood play?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: what add up is the histrion that played 1998-2001<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which commonwealth is the player that went to stabroek from?<sep>1-10015132-21<sep>player<sep>no.<sep>nationality<sep>position<sep>years in toronto<sep>school/club team',
 'translate english to sql: which school did herbac

In [40]:
# Instantiate AUGMENTED test wikisql class object
wtest_aug = wikisql(tokenizer, data_file=test_data, table_file=test_table, 
                task=task, experiment_type=experiment_type, numrows=numrows,
                augment_type=augment_type)

In [41]:
# randomly select 50% of the augmented data (downsampling)
sample_perc = 0.5
seednum = 0

np.random.seed(seednum)
wtrain_aug_inputstring = list(np.random.choice(wtrain_aug.input_string, size=round(sample_perc*len(wtrain_aug.input_string)), replace=False))
np.random.seed(seednum)
wtrain_aug_targetstring = list(np.random.choice(wtrain_aug.target_string, size=round(sample_perc*len(wtrain_aug.target_string)), replace=False))

np.random.seed(seednum)
wtest_aug_inputstring = list(np.random.choice(wtest_aug.input_string, size=round(sample_perc*len(wtest_aug.input_string)), replace=False))
np.random.seed(seednum)
wtest_aug_targetstring = list(np.random.choice(wtest_aug.target_string, size=round(sample_perc*len(wtest_aug.target_string)), replace=False))

train_input_string = wtrain.input_string.copy()
train_input_string.extend(wtrain_aug_inputstring)

train_target_string = wtrain.target_string.copy()
train_target_string.extend(wtrain_aug_targetstring)

test_input_string = wtest.input_string.copy()
test_input_string.extend(wtest_aug_inputstring)

test_target_string = wtest.target_string.copy()
test_target_string.extend(wtest_aug_targetstring)


### Build Dataset

In [42]:
# Build train and test datasets for fine tuning by taking input_string and target_string from wikisql object.
import random
import numpy as np

# train dataset
train_dict = {"input_string": train_input_string, "target_string": train_target_string}
traind = Dataset.from_dict(train_dict)

'''
# test dataset (take n random samples from train data ==> this is just for temporary! will change later)
n = 5000
random_idx = random.sample(range(0, len(w.input_string)-1), n)
input_string_test =  list(np.take(np.array(w.input_string), random_idx))
target_string_test =  list(np.take(np.array(w.target_string), random_idx))

test_dict = {"input_string": input_string_test, "target_string": target_string_test}
testd = Dataset.from_dict(test_dict)
'''

test_dict = {"input_string": test_input_string, "target_string": test_target_string}
testd = Dataset.from_dict(test_dict)

# combined dataset 
raw_datasets = datasets.DatasetDict({"train":traind,"test":testd})

In [43]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['input_string', 'target_string'],
        num_rows: 84533
    })
    test: Dataset({
        features: ['input_string', 'target_string'],
        num_rows: 23817
    })
})

In [44]:
# Tokenizer function

def tokenize_function(data, tokenizer):
  tokenized_data = tokenizer(data["input_string"], truncation=True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(data["target_string"], truncation=True)
  tokenized_data["labels"] = labels["input_ids"]
  return tokenized_data

tokenized_dataset = raw_datasets.map(lambda x: tokenize_function(x, tokenizer), batched=True)


  0%|          | 0/85 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

In [45]:
# Get model
from transformers import TFT5ForConditionalGeneration

model = TFT5ForConditionalGeneration.from_pretrained(modelname)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [46]:
# Padding
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [47]:
# parameters
batch_size = 16
learning_rate = 3e-5

weight_decay = 0.01
num_train_epochs = 1

In [48]:
# create TF datasets

train_dataset = tokenized_dataset["train"].to_tf_dataset(
    batch_size=batch_size,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)

validation_dataset = tokenized_dataset["test"].to_tf_dataset(
    batch_size=batch_size,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)

In [49]:
# compile model
import tensorflow as tf
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
#model.compile(optimizer=optimizer, metrics=["accuracy"])
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [50]:
# model tuning

model.fit(
    train_dataset, validation_data=validation_dataset, epochs=num_train_epochs
)



<keras.callbacks.History at 0x7f4e23642ad0>

In [51]:
# Save model
saved_model = os.path.join(projpath, "txt2sqlmodel")
model.save_pretrained(saved_model)

In [None]:
# Load model from saved
#from transformers import TFT5ForConditionalGeneration

#model = TFT5ForConditionalGeneration.from_pretrained(saved_model)

### Inference & Accuracy Calculation

In [25]:
inputs = tokenizer(test_dict["input_string"][-20:], return_tensors="tf", padding='max_length')
output = model.generate(input_ids=inputs['input_ids'], 
               attention_mask=inputs['attention_mask'], max_length=100)

In [26]:
# get prediction label

pred_label = tokenizer.batch_decode(output, skip_special_tokens=True)
true_label = test_dict["target_string"][-20:]

# accuracy

from sklearn.metrics import accuracy_score

acc = accuracy_score(true_label, pred_label)  
acc

0.25

In [27]:
pred_label

['select (count) [first elected] from [2-12679326-1] where [counties represented equals to baltimore county] and [district equals to 06.0 6] and [committee equals to economic matters]',
 'select [district] from [2-12679326-1] where [counties represented equals to baltimore county] and [committee equals to health and government operations] and [first elected less than 2002]',
 'select (max) [first elected] from [2-12679326-1] where [district equals to 06.0 6] and [committee equals to economic matters]',
 'select [name] from [2-1232836-4] where [nationality equals to morocco] and [goals equals to 6]',
 'select (max) [ranking] from [2-1232836-4] where [years equals to 1996–13] and [goals less than 17]',
 'select (avg) [ranking] from [2-1232836-4] where [years equals to 2000–] and [nationality equals to saudi arabia]',
 'select [nationality] from [2-1232836-4] where [goals equals to 14]',
 'select [nationality] from [2-1232836-4] where [name equals to hamzah idris]',
 'select (min) [goals]

In [28]:
true_label

['select (count) [first elected] from [2-12679326-1] where [counties represented equals to baltimore county] and [district equals to 06.0 6] and [committee equals to economic matters]',
 'select  [district] from [2-12679326-1] where [counties represented equals to baltimore county] and [committee equals to health and government operations] and [first elected greater than 2002]',
 'select (max) [first elected] from [2-12679326-1] where [district equals to 06.0 6] and [committee equals to economic matters]',
 'select  [name] from [2-1232836-4] where [goals equals to 6] and [nationality equals to morocco]',
 'select (max) [ranking] from [2-1232836-4] where [years equals to 1996–13] and [goals less than 17]',
 'select (avg) [ranking] from [2-1232836-4] where [nationality equals to saudi arabia] and [years equals to 2000–]',
 'select  [nationality] from [2-1232836-4] where [goals equals to 14]',
 'select  [nationality] from [2-1232836-4] where [name equals to hamzah idris]',
 'select (min) 

In [52]:
import math
import numpy as np

m = 125
max = len(test_dict["input_string"])
#max = 300
id = math.floor(max/m)
iter_id = list(np.arange(id)*m)
iter_id.append(max)
iter_id = iter_id[1:]
pred_label = []
start = 0
for i in iter_id:
  end = i
  #inputs = tokenizer(test_dict["input_string"][start:end], return_tensors="tf", padding='max_length')
  inputs = tokenizer(test_dict["input_string"][start:end], return_tensors="tf", padding=True, truncation=True)
  output = model.generate(input_ids=inputs['input_ids'], 
               attention_mask=inputs['attention_mask'], max_length=100)
  pred_label.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
  start = end

true_label = test_dict["target_string"]


In [None]:

# accuracy

from sklearn.metrics import accuracy_score

acc = accuracy_score(true_label, pred_label)  
acc

0.12457488348658521

In [None]:
pred_label[:10]

['select [nationality] from [1-10015132-16] where [player equals to terrence ross]',
 'select [no.] from [1-10015132-16] where [years in toronto equals to 1995-96]',
 'select [school/club team] from [1-10015132-16] where [years in toronto equals to 2003-06]',
 'select (count) [school/club team] from [1-10015132-16] where [player equals to jalen rose]',
 'select [place] from [1-10083598-1] where [race winner equals to assen]',
 'select (count) [race winner] from [1-10083598-1] where [circuit equals to kevin curtain]',
 'select [date] from [1-10083598-1] where [race winner equals to misano]',
 'select (count) [position] from [1-1013129-2] where [college/junior/club team equals to sherbrooke faucons (qmjhl)]',
 'select [nationality] from [1-1013129-2] where [pick equals to thunder bay flyers (ushl)]',
 'select (count) [college/junior/club team] from [1-1013129-2] where [player equals to washington capitals nhl team]']

In [None]:
true_label[:10]

['select  [nationality] from [1-10015132-16] where [player equals to terrence ross]',
 'select  [school/club team] from [1-10015132-16] where [years in toronto equals to 1995-96]',
 'select  [school/club team] from [1-10015132-16] where [years in toronto equals to 2003-06]',
 'select (count) [school/club team] from [1-10015132-16] where [player equals to jalen rose]',
 'select  [round] from [1-10083598-1] where [circuit equals to assen]',
 'select (count) [no] from [1-10083598-1] where [pole position equals to kevin curtain]',
 'select  [date] from [1-10083598-1] where [circuit equals to misano]',
 'select (count) [position] from [1-1013129-2] where [college/junior/club team equals to sherbrooke faucons (qmjhl)]',
 'select  [nationality] from [1-1013129-2] where [college/junior/club team equals to thunder bay flyers (ushl)]',
 'select (count) [college/junior/club team] from [1-1013129-2] where [nhl team equals to washington capitals]']

### Save results to a file

In [55]:
dd = {"task": task, "experiment_type": experiment_type, "numrows": numrows, "augment_type": augment_type,
      "batch_size": batch_size, "learning_rate": learning_rate, "weight_decay": weight_decay, "num_train_epochs": num_train_epochs}

if augment_type == "synonym":
  num_synonym = str(wtrain_aug.num_synonym)
else:
  num_synonym = ""

filename = "task-"+task+"_"+"exp-"+str(experiment_type)+"_"+"rows-"+str(numrows)+"_"+"agument-"+augment_type+num_synonym
saved_file = os.path.join(projpath, "saved_results",filename)
result_dict = {"pred_label": pred_label, "true_label": true_label}

with open(saved_file, 'w') as convert_file:
     convert_file.write(json.dumps(dd))

file1 = open(saved_file, "a")  # append mode
file1.write("\n")
file1.close()

with open(saved_file, 'a') as convert_file:
     convert_file.write(json.dumps(result_dict))

In [56]:
filename

'task-translate_exp-1_rows-0_agument-synonym2'