# Install necessary dependencies

In [1]:
!pip install accelerate -U
!pip install -q -U datasets
!pip install scipy
!pip install ipywidgets
!pip install wandb
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install tqdm
!pip install rouge_score
!pip install evaluate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m266.2/270.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.8 MB/s[0m e

# Import dependencies




In [2]:
# Data libraries
from datasets import load_dataset, Dataset, concatenate_datasets, list_metrics, load_metric

# preprocessing Libraries
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Model libraries
import transformers
import ast
import torch
import sklearn
from transformers import (
    T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    DefaultDataCollator, T5ForQuestionAnswering, DataCollatorForSeq2Seq, pipeline, AutoModelForQuestionAnswering, BertTokenizer, BertForQuestionAnswering
)

# Metrics library
import evaluate
from evaluate import evaluator

# Visualization library
import wandb

# Warning libraries
import warnings
warnings.filterwarnings('ignore')

# Utility functions
import t5_utils as utils


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Global variables

In [3]:
DATASET = "Clinton/texttosqlv2_25000_v2"
PREFIX = "instruction"
INPUT_COL = "input"
OUTPUT_COL = "output"
MODEL = "google/flan-t5-base"
TRAIN_SPLIT = "train[:10000]"
VALIDATION_SPLIT = "train[10000:12500]"
TEST_SPLIT = "train[22500:25000]"


# Training args

MAX_STEPS = 1000
PER_DEVICE_TRAIN_BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH_SIZE = 8
SAVE_STEPS = 100
EVAL_STEPS = 100
LOGGING_STEPS = 100
CHECKPOINT_SAVED = 1000
OPTIMIZER = "adamw_torch"
LEARNING_RATE = 2.5e-5
COLLATOR = "seq2seq" # seq2seq or default

# for compute metrics
TASK = "text2text-generation"
METRICS = "rouge"

In [4]:
utils.metric = evaluate.load(METRICS)
utils.tokenizer = T5Tokenizer.from_pretrained(MODEL)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Data gathering and preparation

This function takes data from HuggingFace or local data, split ( train, test, validation, train[:slicing] ), input_col, target_col, prefix(column or text)

In [5]:
train_data = utils.prepare_data(
                                DATASET,
                                TRAIN_SPLIT,
                                INPUT_COL,
                                OUTPUT_COL,
                                PREFIX
                                )

eval_data = utils.prepare_data(
                               DATASET,
                               VALIDATION_SPLIT,
                               INPUT_COL,
                               OUTPUT_COL,
                               PREFIX
                               )

test_data = utils.prepare_data(
                               DATASET,
                               TEST_SPLIT,
                               INPUT_COL,
                               OUTPUT_COL,
                               PREFIX
                               )

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
input_data_first_train = train_data[0]["input_text"]
targer_data_first_train = train_data[0]["target_text"]
print(f"input_text : {input_data_first_train}")
print("--------------------------------")
print(f"target_text : {targer_data_first_train}")

input_text : tell me the number of times he placed 4th . : CREATE TABLE table_204_780 (
    id number,
    "year" number,
    "competition" text,
    "venue" text,
    "position" text,
    "notes" text
)
--------------------------------
target_text : SELECT COUNT(*) FROM table_204_780 WHERE "position" = 4


In [7]:
input_data_first_eval = eval_data[0]["input_text"]
targer_data_first_eval = eval_data[0]["target_text"]
print(f"input_text : {input_data_first_eval}")
print("--------------------------------")
print(f"target_text : {targer_data_first_eval}")

input_text : What is the lead percentage when the socialist is at 35.5%? : CREATE TABLE table_1183 (
    "Date Released" text,
    "Institute" text,
    "Socialist" text,
    "Social Democratic" text,
    "Green-Communist" text,
    "Left Bloc" text,
    "Peoples Party" text,
    "Lead" text
)
--------------------------------
target_text : SELECT "Lead" FROM table_1183 WHERE "Socialist" = '35.5%'


In [8]:
input_data_first_test = test_data[0]["input_text"]
targer_data_first_test = test_data[0]["target_text"]
print(f"input_text : {input_data_first_test}")
print("--------------------------------")
print(f"target_text : {targer_data_first_test}")

input_text : how many times patient 004-65311 has been prescribed for 500 ml flex cont excel : sodium chloride 0.9 % iv soln until 2104? : CREATE TABLE lab (
    labid number,
    patientunitstayid number,
    labname text,
    labresult number,
    labresulttime time
)

CREATE TABLE cost (
    costid number,
    uniquepid text,
    patienthealthsystemstayid number,
    eventtype text,
    eventid number,
    chargetime time,
    cost number
)

CREATE TABLE medication (
    medicationid number,
    patientunitstayid number,
    drugname text,
    dosage text,
    routeadmin text,
    drugstarttime time,
    drugstoptime time
)

CREATE TABLE treatment (
    treatmentid number,
    patientunitstayid number,
    treatmentname text,
    treatmenttime time
)

CREATE TABLE allergy (
    allergyid number,
    patientunitstayid number,
    drugname text,
    allergyname text,
    allergytime time
)

CREATE TABLE microlab (
    microlabid number,
    patientunitstayid number,
    culturesite te

In [9]:
max_source_length, max_target_length = utils.identify_max_lengths(
                                                                  train_data,
                                                                  eval_data,
                                                                  model = MODEL
                                                                  )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [10]:
print(f"Input max length : {max_source_length}")
print("--------------------------------")
print(f"Target max length : {max_target_length}")


Input max length : 512
--------------------------------
Target max length : 512


In [11]:
train_data_tokenized, eval_data_tokenized = utils.preprocessed_data(
                                                                    train_data,
                                                                    eval_data,
                                                                    max_source_length,
                                                                    max_target_length,
                                                                    model = MODEL)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make s

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Training

Look at utility functions to get in-depth understanding of training function

In [12]:
utils.train_model(
                  train_data_tokenized,
                  train_data_tokenized,
                  project = "Finetunning-t5",
                  model = MODEL,
                  base_model_name = "T5",
                  max_steps = MAX_STEPS,
                  per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
                  per_device_eval_batch_size = PER_DEVICE_EVAL_BATCH_SIZE,
                  save_steps = SAVE_STEPS,
                  eval_steps = EVAL_STEPS,
                  logging_steps = LOGGING_STEPS,
                  checkpoint_saved = CHECKPOINT_SAVED,
                  optimizer = OPTIMIZER,
                  learning_rate = LEARNING_RATE,
                  collator = COLLATOR
                  )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,1.5953,0.872786,74.1404,56.3763,72.8778,72.8182,512.0
200,1.0009,0.715405,78.3331,63.1608,77.217,77.1868,512.0
300,0.8943,0.637364,80.5476,66.6968,79.4904,79.4523,512.0
400,0.8833,0.58624,81.5554,68.3264,80.521,80.4823,512.0
500,0.8099,0.552417,82.043,69.216,81.0759,81.0384,512.0
600,0.7851,0.527347,82.5299,70.057,81.5879,81.5582,512.0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-9ed16be41bff>", line 1, in <cell line: 1>
    utils.train_model(
  File "/content/t5_utils.py", line 225, in train_model
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1922, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2271, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3011, in evaluate
    output = eval_loop(
  File "/usr/local/li

TypeError: object of type 'NoneType' has no len()

# Evaluate Model

# Inference

This function takes 3 argumnets : task(summarization, classification, etc), text(data), model(pretrained, fine tuned)


In [13]:
utils.inference("What is Years, when Nationality is United States, and when Position is PG / SG?",
                "CREATE TABLE table_name_83 ( years VARCHAR, nationality VARCHAR, position VARCHAR )",
                fined_tuned_path = "/content/T5-Finetunning-t5/checkpoint-100")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'SELECT YEARS FROM table_name_83 WHERE nationality = "United States" AND position = "PG/SG"'

In [14]:
utils.inference("What owner or owners have an operational description?",
                "CREATE TABLE table_name_34 ( owner_s_ VARCHAR, description VARCHAR )",
                fined_tuned_path = "/content/T5-Finetunning-t5/checkpoint-100")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'SELECT owner_s__description FROM table_name_34 WHERE owner_s__description = "operational"'

In [16]:
tokenizer = T5Tokenizer.from_pretrained("/content/T5-Finetunning-t5/checkpoint-600")
model = T5ForConditionalGeneration.from_pretrained("/content/T5-Finetunning-t5/checkpoint-600")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
tokenizer.push_to_hub("HuzaifaHPC/T5_Text_to_SQL")
model.push_to_hub("HuzaifaHPC/T5_Text_to_SQL")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HuzaifaHPC/T5_Text_to_SQL/commit/2eda43b44c9584f5582af2463b5a0c5e459d760e', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='2eda43b44c9584f5582af2463b5a0c5e459d760e', pr_url=None, pr_revision=None, pr_num=None)