In this notebook I build the logic that converts the raw SQL from the text-to-SQL results into SQL that can be processed (parsed).

Example: 
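Convert the original SQL `SELECT (U.S. Open Cup) FROM 1-1046170-5 WHERE Regular Season = 4th, atlantic division` into a processable form such as `SELECT (U.S. Open Cup) FROM <table_name> WHERE <clean_column_name> = '4th, atlantic division'`, i.e. the table id is replaced by the table name, the WHERE column is replaced by its clean name, and the WHERE value is wrapped in single quotes.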

In [1]:
import json
import pandas as pd

In [2]:
"""
each entry in path_text2sql_results contains:

- the final sql
- all components of the sql (selects, conditions aggregations..)
- table-id for the sql
- the human-readable question for the sql

## [[3, 0, '4th, atlantic division']]}
3 = column index
0 = {1, 2} for {<, >}
"'4th, atlantic division'"  = content of where-clause

"""


path_text2sql_results = '/Users/bleopold/OneDrive/data-analysis/results_from_sqlova/results_dev.jsonl'

path_dev_tables = '/Users/bleopold/OneDrive/data-analysis/vendors/WikiSQL/data/dev.tables.jsonl'

# Get the data I need

In [3]:
# Collect the id of every table for which table data is available.
# The file holds one JSON document per line, so it is read line by line.
with open(path_dev_tables, 'r') as json_file:
    dev_tables_raw_file = json_file.readlines()

# Parse each line and keep only its "id" field.
tables_with_data = [
    json.loads(json_str)["id"]
    for json_str in dev_tables_raw_file
]

In [4]:
# Collect the distinct ids of all tables that have at least one text-to-SQL
# result. The results file holds one JSON document per line.
with open(path_text2sql_results, 'r') as json_file:
    text2sql_results_raw_file = list(json_file)

tables_with_text2sql_result = []

for json_str in text2sql_results_raw_file:
    result = json.loads(json_str)
    table_id = result["table_id"]

    tables_with_text2sql_result.append(table_id)

# De-duplicate while keeping the first-seen order of the ids.
# list(set(...)) was used here before; because string hashing is randomized
# per interpreter start, that produced a different order on every kernel
# restart, so this list (and anything built from it) was not reproducible.
# dict preserves insertion order (guaranteed since Python 3.7).
tables_with_text2sql_result = list(dict.fromkeys(tables_with_text2sql_result))

In [5]:
# Tables for which both table data and a text-to-SQL result exist, i.e. the
# intersection of tables_with_data and tables_with_text2sql_result, kept in
# the order of tables_with_text2sql_result.
# Membership is checked against a set: testing against the tables_with_data
# list scanned the whole list once per id (quadratic in the number of tables).
table_ids_with_data = set(tables_with_data)
ids_usable_tables = [
    table_id
    for table_id in tables_with_text2sql_result
    if table_id in table_ids_with_data
]

# Define functions

In [17]:
def replace_table_id_with_table_name_in_sql(sql, table_id, mapping_table_id_table_name=None):
    """
    Replace the table id inside a sql string with the name of that table.

    example: 'SELECT (Player) FROM 1-10015132-11 WHERE No. = 42' with
    table_id '1-10015132-11' becomes
    'SELECT (Player) FROM toronto_raptors_all-time_roster WHERE No. = 42'

    Parameters
    ----------
    sql : str
        the original sql, containing table_id as the table reference
    table_id : str
        id of the table the sql was generated for
    mapping_table_id_table_name : pandas.DataFrame, optional
        holds the table_id in column "table_id" and the table name in column
        "table_name". When omitted, the mapping is read from the tab-separated
        file mapping_table_id_table_name.csv (this is the original behaviour).
        Pass a preloaded frame when converting many statements, so the file
        is not read again for every call.

    Returns
    -------
    str
        the sql with every occurrence of table_id replaced by the table name

    Raises
    ------
    IndexError
        if the mapping has no row for table_id
    """
    if mapping_table_id_table_name is None:
        mapping_table_id_table_name = pd.read_csv(
            "../data/01_raw/wiki-sql/schema_infos/mapping_table_id_table_name.csv",
            sep="\t",
        )

    # get the table_name(s) registered for this table_id
    matching_table_names = mapping_table_id_table_name.loc[
        mapping_table_id_table_name["table_id"] == table_id, "table_name"
    ]

    # Same exception type as the former `list(...)[0]` lookup, but with a
    # message that says which id is missing.
    if matching_table_names.empty:
        raise IndexError(
            "no table_name found for table_id {!r}".format(table_id)
        )

    # if several rows match, the first one wins (as before)
    table_name = matching_table_names.iloc[0]

    # replace table_id with table_name in the original sql
    return sql.replace(table_id, table_name)

In [18]:
def _locate_condition_value(where_clause, value):
    """
    Return the index in where_clause at which `value` should be quoted, or -1.

    Preferred is an occurrence that directly follows a comparison operator
    (" = ", " > ", " < ") and is directly followed by the end of the clause
    or by " AND " -- that is the position of a condition value. Look-alike
    text inside a column name or inside an already quoted value does not
    satisfy this. If no occurrence satisfies it, the first plain occurrence
    is returned as a fallback.
    """
    first_hit = where_clause.find(value)
    hit = first_hit
    while hit != -1:
        end = hit + len(value)
        follows_operator = where_clause[max(hit - 3, 0):hit] in (" = ", " > ", " < ")
        closes_condition = end == len(where_clause) or where_clause.startswith(" AND ", end)
        if follows_operator and closes_condition:
            return hit
        hit = where_clause.find(value, hit + 1)
    return first_hit


def replace_enclosed_where_conditions_with_quotes(sql, condition_infos):
    """
    Wrap the value of every where-condition in single quotes.

    example for condition_infos: [[3, 0, '4th, atlantic division']]
    'SELECT (x) FROM t WHERE Regular Season = 4th, atlantic division'
    becomes
    "SELECT (x) FROM t WHERE Regular Season = '4th, atlantic division'"

    Only one occurrence per condition is quoted, and only inside the part of
    the statement after " WHERE ". A plain `sql.replace(value, quoted)` also
    rewrote the same text wherever else it appeared (table name, SELECT
    column, WHERE column names, previously quoted values).

    Parameters
    ----------
    sql : str
        the sql whose where-values are not quoted yet
    condition_infos : list or None
        one entry per condition; element [2] of an entry is the value.
        Values that are not strings are converted with str().
        None or an empty list leaves the sql unchanged.

    Returns
    -------
    str
        the sql with the condition values quoted

    NOTE(review): single quotes inside a value are not escaped here.
    """
    head, where_keyword, where_clause = sql.partition(" WHERE ")
    if not where_keyword:
        # No " WHERE " found: search the whole statement instead.
        head, where_clause = "", sql

    for condition_info in condition_infos or []:
        # str(): a numeric value would make the string operations fail
        condition_value_orig = str(condition_info[2])
        if not condition_value_orig:
            # an empty value has no text that could be located and quoted
            continue

        hit = _locate_condition_value(where_clause, condition_value_orig)
        if hit == -1:
            continue

        condition_value_quoted = "'{}'".format(condition_value_orig)
        where_clause = (
            where_clause[:hit]
            + condition_value_quoted
            + where_clause[hit + len(condition_value_orig):]
        )

    return head + where_keyword + where_clause

In [19]:
def insert_clean_column_names(sql, condition_infos, table_id, mapping_column_names=None):
    """
    Replace the original names of the condition columns with their clean names.

    For every condition the column is looked up by (table_id, column position)
    and the original column name is replaced by the clean column name.
    The replacement is applied to the whole sql string, so a SELECT column
    with the same original name is renamed as well. Columns that are not
    referenced by any condition are left untouched.

    Parameters
    ----------
    sql : str
        the sql containing the original column names
    condition_infos : list or None
        one entry per condition; element [0] of an entry is the column
        position that is matched against "column_order".
        None or an empty list leaves the sql unchanged.
    table_id : str
        id of the table the sql was generated for
    mapping_column_names : pandas.DataFrame, optional
        columns "table_id", "column_order", "column_name_original" and
        "column_name_clean". When omitted, the mapping is read from the
        tab-separated file mapping_column_names.csv (this is the original
        behaviour). Pass a preloaded frame when converting many statements,
        so the file is not read again for every call.

    Returns
    -------
    str
        the sql with the clean column names

    Raises
    ------
    IndexError
        if the mapping has no row for a (table_id, column position) pair
    """
    if not condition_infos:
        return sql

    if mapping_column_names is None:
        mapping_column_names = pd.read_csv(
            "../data/01_raw/wiki-sql/schema_infos/mapping_column_names.csv",
            sep="\t",
        )

    # The table filter is the same for every condition: apply it once.
    table_columns = mapping_column_names[mapping_column_names["table_id"] == table_id]

    for condition_info in condition_infos:
        column_order = condition_info[0]

        # One lookup yields both names (previously the filter ran twice).
        column_rows = table_columns[table_columns["column_order"] == column_order]

        # Same exception type as the former `list(...)[0]` lookup, but with a
        # message that says which column is missing.
        if column_rows.empty:
            raise IndexError(
                "no column mapping found for table_id {!r}, column_order {!r}".format(
                    table_id, column_order
                )
            )

        # if several rows match, the first one wins (as before)
        column_row = column_rows.iloc[0]

        sql = sql.replace(
            column_row["column_name_original"],
            column_row["column_name_clean"],
        )

    return sql

# Convert original sql to parse-able sql

In [23]:
# Try the conversion on the results of a handful of tables: print every
# original sql followed by its converted form.
table_ids_for_testing = (
    '1-1046170-5',
    '1-1061075-1',
    '1-10015132-9',
    '1-10015132-11',
    '1-10026563-1',
    '1-10295819-2',
    '1-10429820-13',
)

with open(path_text2sql_results, 'r') as json_file:
    json_list = json_file.readlines()

for json_str in json_list:
    result = json.loads(json_str)

    table_id = result["table_id"]

    # only the hand-picked tables are converted
    if table_id not in table_ids_for_testing:
        continue

    # not used further in this cell
    table_file_path = '../data/01_raw/wiki-sql/tables/{}.csv'.format(table_id)
    text_question = result["nlu"]

    sql_original = result["sql"]
    print(sql_original)

    # step 1: table id -> table name
    sql = replace_table_id_with_table_name_in_sql(sql_original, table_id)

    # steps 2 and 3 only apply when the query has where-conditions:
    # quote the condition values, then switch to the clean column names
    condition_infos = result["query"].get("conds")
    if condition_infos:
        sql = replace_enclosed_where_conditions_with_quotes(sql, condition_infos)
        sql = insert_clean_column_names(sql, condition_infos, table_id)

    print(sql)
    print('\n')

SELECT (Position) FROM 1-10015132-11 WHERE School/Club Team = butler cc (ks)
SELECT (Position) FROM toronto_raptors_all-time_roster WHERE schoolclub_team = 'butler cc (ks)'


SELECT count(School/Club Team) FROM 1-10015132-11 WHERE No. = 3
SELECT count(School/Club Team) FROM toronto_raptors_all-time_roster WHERE no = '3'


SELECT (School/Club Team) FROM 1-10015132-11 WHERE No. = 21
SELECT (School/Club Team) FROM toronto_raptors_all-time_roster WHERE no = '21'


SELECT (Player) FROM 1-10015132-11 WHERE No. = 42
SELECT (Player) FROM toronto_raptors_all-time_roster WHERE no = '42'


SELECT (Player) FROM 1-10015132-11 WHERE Position = guard AND Years in Toronto = 1996-97
SELECT (Player) FROM toronto_raptors_all-time_roster WHERE position = 'guard' AND years_in_toronto = '1996-97'


SELECT (Player) FROM 1-10015132-9 WHERE School/Club Team = westchester high school
SELECT (Player) FROM toronto_raptors_all-time_roster WHERE schoolclub_team = 'westchester high school'


SELECT (School/Club Team