In [1]:
import pandas as pd
import json
from tqdm.notebook import tqdm
import re
import os

from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from google.cloud import storage
from datetime import datetime

In [2]:
def blob(bucket_name, filename):
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(filename)
    return blob

def get_blobs(bucket_name, folder):
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = list(bucket.list_blobs(prefix=folder))
    return blob

In [3]:
model_run_date = datetime.today().strftime('%Y-%m-%d')
model_name = 't5-small'
model_folder = f'gs://data_tql/Model/{model_run_date}/{model_name}'

folder_items = get_blobs('data_tql', model_folder.replace("gs://data_tql/",""))

pattern = r'run_(\d+)'
# Initialize an empty list to store the extracted run numbers
run_numbers = []
# Iterate through the folder paths and extract run numbers
for folder_path in folder_items:
    match = re.search(pattern, folder_path.name)
    if match:
        run_number = int(match.group(1))
        run_numbers.append(run_number)

# Find the maximum run number
if run_numbers:
    run_number = max(run_numbers) + 1
else:
    run_number = 1
    
model_folder = model_folder + '/run_' + f'{run_number}'
model_folder

'gs://data_tql/Model/2023-11-05/t5-small/run_5'

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
queryData = pd.read_csv('gs://data_tql/spider/processed/spiderQueryData.csv')
tableData = pd.read_csv('gs://data_tql/spider/processed/Schemas/tablesSchemaSpider.csv')

display(queryData.head(1))
display(tableData.head(2))

Unnamed: 0,db_id,TQL,SQL,dataset,fileName,filePath,result
0,department_management,How many heads of the departments are older th...,SELECT count(*) FROM head WHERE age > 56,train,department_management.sqlite,sqliteDB/department_management.sqlite,{'count(*)': {0: 5}}


Unnamed: 0,schema_id,table_name,table_name_original,primary_key,column_list,column_list_original,column_datatypes,foreign_keys
0,perpetrator,perpetrator,perpetrator,Perpetrator_ID,"['perpetrator id', 'people id', 'date', 'year'...","['Perpetrator_ID', 'People_ID', 'Date', 'Year'...","['number', 'number', 'text', 'number', 'text',...",[]
1,perpetrator,people,people,People_ID,"['people id', 'name', 'height', 'weight', 'hom...","['People_ID', 'Name', 'Height', 'Weight', 'Hom...","['number', 'text', 'number', 'number', 'text']","[['perpetrator', 'People_ID', 'people', 'Peopl..."


In [6]:
def create_schema_natural_language(row):

    schema_id = row['schema_id']
    table_name = row['table_name']
    primary_key = row['primary_key']
    column_list = eval(row['column_list_original'])
    datatype_list = eval(row['column_datatypes'])
    foreign_key = eval(row['foreign_keys'])

    column_list_with_datatype = []
    for column, datatype in zip(column_list, datatype_list):
        column_list_with_datatype.append(' '.join([column, datatype]))
        
        
    schema_natural_language = f"CREATE TABLE {table_name} ({', '.join(column_list_with_datatype)}) which has {primary_key} as primary key"

    # schema_natural_language = f"Given the table {table_name} having columns as {', '.join(column_list_with_datatype)} which has {primary_key} as primary key"
    return schema_natural_language

In [7]:
tableData['schema_natural_language'] = tableData.apply(create_schema_natural_language, axis = 1)
tableData.head(3)

all_schemas = tableData['schema_id'].unique()
schema_table_query = {}
for schema in all_schemas:
    schema_details = ' and '.join(tableData[tableData['schema_id'] == schema]['schema_natural_language'].values)
    schema_table_query[schema] = schema_details

queryData['schema_natural_language'] = queryData['db_id'].map(schema_table_query)
queryData['final_TQL'] = queryData['TQL'] + ' with tables: ' + queryData['schema_natural_language']
queryData.head(2)

queryData['final_TQL'][0], queryData['SQL'][0]

('How many heads of the departments are older than 56 ? with tables: CREATE TABLE department (Department_ID number, Name text, Creation text, Ranking number, Budget_in_Billions number, Num_Employees number) which has Department_ID as primary key and CREATE TABLE head (head_ID number, name text, born_state text, age number) which has head_ID as primary key and CREATE TABLE management (department_ID number, head_ID number, temporary_acting text) which has department_ID as primary key',
 'SELECT count(*) FROM head WHERE age  >  56')

In [8]:
# Load the pretrained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = torch.nn.DataParallel(T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql')).to(device)

In [9]:
# Define a custom dataset for training
class SQLDataset(Dataset):
    def __init__(self, input_texts, target_queries, tokenizer, task_prefix):
        self.input_texts = input_texts
        self.target_queries = target_queries
        self.tokenizer = tokenizer
        self.task_prefix = task_prefix

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, index):
        
        input_text = self.task_prefix + self.input_texts[index]
        target_query = self.target_queries[index]

        input_encoding = self.tokenizer([input_text], return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        target_encoding = self.tokenizer([target_query], return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        
        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': target_encoding.input_ids.squeeze(0),
        }

In [10]:
# Load the labeled dataset
input_texts = queryData['final_TQL'].values # List of input texts
target_queries = queryData['SQL'].values  # List of corresponding target SQL queries

# Split the dataset into train and validation sets
train_input_texts, val_input_texts, train_target_queries, val_target_queries = train_test_split(input_texts, target_queries, test_size=0.2, random_state=42)

In [11]:
# Create instances of the custom dataset
task_prefix = 'SQL query for: '
train_dataset = SQLDataset(train_input_texts, train_target_queries, tokenizer, task_prefix)
val_dataset = SQLDataset(val_input_texts, val_target_queries, tokenizer, task_prefix)

In [12]:
# Define the training hyperparameters
BATCH_SIZE = 32
NUM_EPOCHS = 20
LEARNING_RATE = 0.004

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
criterion = torch.nn.CrossEntropyLoss()

# Set up TensorBoard writer
writer = SummaryWriter(f'{model_folder}/logs/')

In [13]:
# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# Training loop
for epoch in tqdm(range(NUM_EPOCHS)):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
        loss = outputs.loss
        loss.mean().backward()
        optimizer.step()
        
        # Write training loss to TensorBoard
        writer.add_scalar('Training Loss', loss.mean().item(), epoch)

    # Evaluation on validation set
    model.eval()
    total_val_loss = 0
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        with torch.no_grad():
            outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
            val_loss = outputs.loss
            total_val_loss += val_loss.mean().item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    
    # Write validation loss to TensorBoard
    writer.add_scalar('Validation Loss', avg_val_loss, epoch)
    
    # Print progress
    print(f'Epoch: {epoch+1}, Validation Loss: {avg_val_loss:.4f}')

# Close the TensorBoard writer
writer.close()

  0%|          | 0/20 [00:00<?, ?it/s]



Epoch: 1, Validation Loss: 0.0199
Epoch: 2, Validation Loss: 0.0161
Epoch: 3, Validation Loss: 0.0139
Epoch: 4, Validation Loss: 0.0127
Epoch: 5, Validation Loss: 0.0126
Epoch: 6, Validation Loss: 0.0112
Epoch: 7, Validation Loss: 0.0119
Epoch: 8, Validation Loss: 0.0109
Epoch: 9, Validation Loss: 0.0107
Epoch: 10, Validation Loss: 0.0109
Epoch: 11, Validation Loss: 0.0104
Epoch: 12, Validation Loss: 0.0148
Epoch: 13, Validation Loss: 0.0111
Epoch: 14, Validation Loss: 0.0102
Epoch: 15, Validation Loss: 0.0101
Epoch: 16, Validation Loss: 0.0102
Epoch: 17, Validation Loss: 0.0100
Epoch: 18, Validation Loss: 0.0102
Epoch: 19, Validation Loss: 0.0098
Epoch: 20, Validation Loss: 0.0100


In [15]:
import subprocess
import glob

local_download_path = "/home/jupyter/model"
model.module.save_pretrained(local_download_path)

gsutil_command = f"gsutil -m cp -r {local_download_path} {model_folder}"

# Run the gsutil command
try:
    subprocess.run(gsutil_command, shell=True, check=True, stdout=subprocess.PIPE)
    print("Model upload successful.")
except subprocess.CalledProcessError as e:
    print(f"Model upload failed. Error: {e}")

files = glob.glob(local_download_path+"/*")
for f in files:
    os.remove(f)    
os.rmdir(local_download_path)

Copying file:///home/jupyter/model/events.out.tfevents.1699209479.jasmeet.49623.0 [Content-Type=application/octet-stream]...
Copying file:///home/jupyter/model/model.pt [Content-Type=application/vnd.snesdev-page-table]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file:///home/jupyter/model/events.out.tfevents.1698965850.jasmeet.120493.0 [Content-

Model upload successful.


In [16]:
model_folder

'gs://data_tql/Model/2023-11-05/t5-small/run_5'

In [17]:
import os
import glob

files = glob.glob(local_download_path+"/*")
for f in files:
    os.remove(f)

os.rmdir(local_download_path)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/model'

In [18]:
model_path = f'{model_folder.replace("gs://data_tql/","")}/model.pt'
blob_model = blob('data_tql', model_path)
with blob_model.open("wb", ignore_flush=True) as f:
    torch.save(model, f)

In [19]:
folder_items = get_blobs('data_tql', model_folder.replace("gs://data_tql/",""))

# Create the local directory if it doesn't exist
os.makedirs(local_download_path, exist_ok=True)

# Download each object in the folder
for blob in folder_items:
    # Create the local file path by combining the local directory with the object name
    local_file_path = os.path.join(local_download_path, blob.name.split('/')[-1])

    # Download the object to the local file path
    blob.download_to_filename(local_file_path)

    print(f'Downloaded {blob.name} to {local_file_path}')

print(f'All files from {model_folder} have been downloaded to {local_download_path}')


Downloaded Model/2023-11-05/t5-small/run_5/logs/events.out.tfevents.1699226419.jasmeet.4186.0 to /home/jupyter/model/events.out.tfevents.1699226419.jasmeet.4186.0
Downloaded Model/2023-11-05/t5-small/run_5/model.pt to /home/jupyter/model/model.pt
Downloaded Model/2023-11-05/t5-small/run_5/model/config.json to /home/jupyter/model/config.json
Downloaded Model/2023-11-05/t5-small/run_5/model/events.out.tfevents.1698771135.jasmeet.26921.0 to /home/jupyter/model/events.out.tfevents.1698771135.jasmeet.26921.0
Downloaded Model/2023-11-05/t5-small/run_5/model/events.out.tfevents.1698954100.jasmeet.6221.0 to /home/jupyter/model/events.out.tfevents.1698954100.jasmeet.6221.0
Downloaded Model/2023-11-05/t5-small/run_5/model/events.out.tfevents.1698965850.jasmeet.120493.0 to /home/jupyter/model/events.out.tfevents.1698965850.jasmeet.120493.0
Downloaded Model/2023-11-05/t5-small/run_5/model/events.out.tfevents.1699200885.jasmeet.4952.0 to /home/jupyter/model/events.out.tfevents.1699200885.jasmeet.49

In [20]:
# Load the saved model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(local_download_path)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Move the model to GPU if available
model.to(device)

# Preprocess input text
input_text = val_input_texts[1]
sql = val_target_queries[1]

# Tokenize and encode input text
tokens = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
with torch.no_grad():
    outputs = model.generate(input_ids=tokens.input_ids.to(device), max_new_tokens=1024)
predicted_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print("Input Text: ", input_text)
print('-'*100)
print("Predicted Query: ", predicted_query)
print('-'*100)
print("Actual Query: ", sql)

----------------------------------------------------------------------------------------------------
Predicted Query:  SELECT DISTINCT COUNT ( DISTINCT t3.paperid ) FROM paperkeyphrase AS t1 JOIN keyphrase AS t4 ON t1.keyphraseid = t4.keyphraseid JOIN writes AS t3 ON t3.paperid = t1.paperid JOIN author AS t2 ON t3.authorid = t2.authorid WHERE t2.authorname = "Ed Desmond" AND t4.keyphrasename = "Semantic Parsing";
----------------------------------------------------------------------------------------------------
Actual Query:  SELECT DISTINCT COUNT ( DISTINCT t3.paperid ) FROM paperkeyphrase AS t1 JOIN keyphrase AS t4 ON t1.keyphraseid  =  t4.keyphraseid JOIN writes AS t3 ON t3.paperid  =  t1.paperid JOIN author AS t2 ON t3.authorid  =  t2.authorid WHERE t2.authorname  =  "Ed Desmond" AND t4.keyphrasename  =  "Semantic Parsing";
