In [4]:
# !pip install -r requirements.txt

In [5]:
# pip install --upgrade spacy


In [6]:
import ast
import json
from datetime import datetime

import pandas as pd
import torch
from google.cloud import storage
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
def blob(bucket_name, filename):
    """Return a blob handle for `filename` inside the bucket `bucket_name`.

    A fresh `storage.Client` is created on every call, the bucket is fetched
    through `get_bucket`, and the blob object for `filename` is returned.
    """
    gcs_bucket = storage.Client().get_bucket(bucket_name)
    return gcs_bucket.blob(filename)

In [8]:
# Load the question/SQL pairs and the per-table schema metadata from the gs:// bucket paths below.
queryData = pd.read_csv('gs://data_tql/spider/processed/spiderQueryData.csv')
tableData = pd.read_csv('gs://data_tql/spider/processed/Schemas/tablesSchemaSpider.csv')

# Preview the first row(s) of each frame to check which columns were read.
display(queryData.head(1))
display(tableData.head(2))

Unnamed: 0,db_id,TQL,SQL,dataset,fileName,filePath,result
0,department_management,How many heads of the departments are older th...,SELECT count(*) FROM head WHERE age > 56,train,department_management.sqlite,sqliteDB/department_management.sqlite,{'count(*)': {0: 5}}


Unnamed: 0,schema_id,table_name,table_name_original,primary_key,column_list,column_list_original,column_datatypes,foreign_keys
0,perpetrator,perpetrator,perpetrator,Perpetrator_ID,"['perpetrator id', 'people id', 'date', 'year'...","['Perpetrator_ID', 'People_ID', 'Date', 'Year'...","['number', 'number', 'text', 'number', 'text',...",[]
1,perpetrator,people,people,People_ID,"['people id', 'name', 'height', 'weight', 'hom...","['People_ID', 'Name', 'Height', 'Weight', 'Hom...","['number', 'text', 'number', 'number', 'text']","[['perpetrator', 'People_ID', 'people', 'Peopl..."


In [9]:
def create_schema_natural_language(row):
    """Describe one table of a schema as an English sentence.

    Parameters
    ----------
    row : mapping
        A row of ``tableData`` (or any mapping) providing ``table_name``,
        ``primary_key``, ``column_list_original`` and ``column_datatypes``.
        The last two are strings holding Python list literals, e.g.
        ``"['People_ID', 'Name']"``.

    Returns
    -------
    str
        ``"Given the Table <table> having columns as <col> has datatype
        <type>, ... which has <primary_key>"``.

    Raises
    ------
    ValueError, SyntaxError
        If a list column does not contain a valid Python literal.
    """
    table_name = row['table_name']
    primary_key = row['primary_key']

    # The list columns come from a CSV file. ast.literal_eval only parses
    # literals, so unlike eval() it cannot execute code embedded in a cell.
    column_list = ast.literal_eval(row['column_list_original'])
    datatype_list = ast.literal_eval(row['column_datatypes'])

    # zip pairs names and datatypes by position and stops at the shorter list.
    column_list_with_datatype = [
        f"{column} has datatype {datatype}"
        for column, datatype in zip(column_list, datatype_list)
    ]

    return (
        f"Given the Table {table_name} having columns as "
        f"{', '.join(column_list_with_datatype)} which has {primary_key}"
    )

In [10]:
# Render every table row as an English description of its columns.
tableData['schema_natural_language'] = tableData.apply(create_schema_natural_language, axis=1)
tableData.head(3)

# For each schema_id, join the descriptions of all its tables into one string.
all_schemas = tableData['schema_id'].unique()
schema_table_query = {
    schema_id: ' and '.join(
        tableData.loc[tableData['schema_id'] == schema_id, 'schema_natural_language'].values
    )
    for schema_id in all_schemas
}

# Look up the schema description by db_id, then prefix each question with it.
queryData['schema_natural_language'] = queryData['db_id'].map(schema_table_query)
queryData['final_TQL'] = queryData['schema_natural_language'] + ' ' + queryData['TQL']
queryData.head(2)

queryData.loc[0, 'final_TQL'], queryData.loc[0, 'SQL']

('Given the Table department having columns as Department_ID has datatype number, Name has datatype text, Creation has datatype text, Ranking has datatype number, Budget_in_Billions has datatype number, Num_Employees has datatype number which has Department_ID and Given the Table head having columns as head_ID has datatype number, name has datatype text, born_state has datatype text, age has datatype number which has head_ID and Given the Table management having columns as department_ID has datatype number, head_ID has datatype number, temporary_acting has datatype text which has department_ID How many heads of the departments are older than 56 ?',
 'SELECT count(*) FROM head WHERE age  >  56')

In [12]:
# List the columns queryData currently carries.
queryData.columns

Index(['db_id', 'TQL', 'SQL', 'dataset', 'fileName', 'filePath', 'result',
       'schema_natural_language', 'final_TQL'],
      dtype='object')

In [16]:
# Spot-check the generated description for the second row of tableData.
tableData['schema_natural_language'].iloc[1]

'Given the Table people having columns as People_ID has datatype number, Name has datatype text, Height has datatype number, Weight has datatype number, Home Town has datatype text which has People_ID'

In [18]:
# Spot-check the combined schema description attached to the second row of queryData.
queryData['schema_natural_language'].iloc[1]

'Given the Table department having columns as Department_ID has datatype number, Name has datatype text, Creation has datatype text, Ranking has datatype number, Budget_in_Billions has datatype number, Num_Employees has datatype number which has Department_ID and Given the Table head having columns as head_ID has datatype number, name has datatype text, born_state has datatype text, age has datatype number which has head_ID and Given the Table management having columns as department_ID has datatype number, head_ID has datatype number, temporary_acting has datatype text which has department_ID'

In [19]:
import pandas as pd
import json

# Path of the JSON Lines file to produce
output_file = "output.jsonl"

# One record per query row: the question, its schema description, and the SQL
json_objects = [
    {
        "input": row['TQL'],
        "context": row['schema_natural_language'],
        "SQL": row['SQL'],
    }
    for _, row in queryData.iterrows()
]

# Serialize each record as JSON on its own line
with open(output_file, 'w') as outfile:
    outfile.writelines(json.dumps(json_obj) + "\n" for json_obj in json_objects)

print("JSONL file created successfully.")


JSONL file created successfully.
