In [1]:
!pip install sentence_transformers




In [2]:
import re
import pandas as pd
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import ast

  from tqdm.autonotebook import tqdm, trange




**Step 1: main_data_prep**



In [3]:


def extract_sql_details(sql_query):
    # Extract table names from FROM and JOIN clauses
    table_pattern = r"FROM\s+([a-zA-Z0-9_]+)(?:\s+AS\s+\w+)?|JOIN\s+([a-zA-Z0-9_]+)(?:\s+AS\s+\w+)?"
    tables = re.findall(table_pattern, sql_query)
    tables = [table for table_tuple in tables for table in table_tuple if table]  # Flatten and filter out empty strings

    # Extract column names from SELECT, ON, WHERE, and ORDER BY clauses
    column_pattern = r"SELECT\s+(.*?)\s+FROM|ON\s+([\w\.]+)\s*=\s*([\w\.]+)|WHERE\s+([\w\.]+)\s*=\s*['\"]?(.*?)['\"]?|ORDER\s+BY\s+([\w\.]+)"
    columns = re.findall(column_pattern, sql_query)
    columns = [item for sublist in columns for item in sublist if item]  # Flatten and filter out empty strings

    # Extract JOIN clause
    join_pattern = r"(JOIN\s+[a-zA-Z0-9_]+\s+(?:AS\s+\w+\s+)?ON\s+[\w\.]+\s*=\s*[\w\.]+)"
    join_clause = re.findall(join_pattern, sql_query)

    # Extract encoded values from WHERE clause
    where_pattern = r"WHERE\s+([\w\.]+\s*=\s*['\"].*?['\"]|\d+)"
    encoded_values = re.findall(where_pattern, sql_query)

    # Output formatting
    print("Table Names:\n")
    for table in sorted(set(tables)):
        print(f"- {table}")

    print("\nColumn Names:\n")
    for column in sorted(set(columns)):
        print(f"- {column}")

    if join_clause:
        print("\nJoin Clause:\n")
        for join in join_clause:
            print(f"{join} (This joins the tables using the specified ON condition.)")

    if encoded_values:
        print("\nEncoded Values:\n")
        for value in encoded_values:
            print(f"- {value.strip()} (This filters the results to include only those rows where {value.split('=')[0].strip()} is equal to {value.split('=')[-1].strip()})")
    else:
        print("\nEncoded Values:\n- None")

# Example 1: With JOIN and WHERE clause
sql_query_with_join = """
SELECT ic.pcinstnm
FROM adm2022 a
JOIN ic2022campuses ic
ON a.UNITID = ic.UNITID
WHERE admcon3 = 1;
"""

# Example 2: With JOIN, WHERE, and ORDER BY clause
sql_query_ex3 = """
SELECT ic.pcinstnm
FROM ic2022campuses AS ic
JOIN gr2022 AS gr
ON ic.unitid = gr.unitid
WHERE ic.pcstabbr = 'TX'
ORDER BY gr.grtotlm DESC
LIMIT 1;
"""

# Test the function with both SQL queries
print("Example 1 Output:")
extract_sql_details(sql_query_with_join)
print("\n\nExample 2 Output:")
extract_sql_details(sql_query_ex3)

Example 1 Output:
Table Names:

- adm2022
- ic2022campuses

Column Names:

- a.UNITID
- admcon3
- ic.UNITID
- ic.pcinstnm

Encoded Values:
- None


Example 2 Output:
Table Names:

- gr2022
- ic2022campuses

Column Names:

- gr.grtotlm
- gr.unitid
- ic.pcinstnm
- ic.pcstabbr
- ic.unitid

Join Clause:

JOIN gr2022 AS gr
ON ic.unitid = gr.unitid (This joins the tables using the specified ON condition.)

Encoded Values:

- ic.pcstabbr = 'TX' (This filters the results to include only those rows where ic.pcstabbr is equal to 'TX')


In [4]:
# Example SQL query
sql_query_ex3 = """
SELECT ic.pcinstnm
FROM ic2022campuses AS ic
JOIN gr2022 AS gr
ON ic.unitid = gr.unitid
WHERE ic.pcstabbr = 'TX'
ORDER BY gr.grtotlm DESC
LIMIT 1;
"""

extract_sql_details(sql_query_ex3)

Table Names:

- gr2022
- ic2022campuses

Column Names:

- gr.grtotlm
- gr.unitid
- ic.pcinstnm
- ic.pcstabbr
- ic.unitid

Join Clause:

JOIN gr2022 AS gr
ON ic.unitid = gr.unitid (This joins the tables using the specified ON condition.)

Encoded Values:

- ic.pcstabbr = 'TX' (This filters the results to include only those rows where ic.pcstabbr is equal to 'TX')


In [5]:
# Example 1: With JOIN and WHERE clause
sql_query_with_join = """
SELECT ic.pcinstnm
FROM adm2022 a
JOIN ic2022campuses ic
ON a.UNITID = ic.UNITID
WHERE admcon3 = 1;
"""

# Test the function with different SQL queries
print("Example 1 Output:")
extract_sql_details(sql_query_with_join)


Example 1 Output:
Table Names:

- adm2022
- ic2022campuses

Column Names:

- a.UNITID
- admcon3
- ic.UNITID
- ic.pcinstnm

Encoded Values:
- None


In [6]:
# Example 2: Without JOIN, with WHERE clause
sql_query_without_join = """
SELECT webaddr
FROM hd2022
WHERE instnm = 'Boston University';
"""
print("\n\nExample 2 Output:")
extract_sql_details(sql_query_without_join)



Example 2 Output:
Table Names:

- hd2022

Column Names:

- instnm
- webaddr

Encoded Values:

- instnm = 'Boston University' (This filters the results to include only those rows where instnm is equal to 'Boston University')


In [7]:


def extract_sql_details(sql_query):
    # Extract table names from FROM and JOIN clauses
    table_pattern = r"FROM\s+([a-zA-Z0-9_]+)(?:\s+AS\s+\w+)?|JOIN\s+([a-zA-Z0-9_]+)(?:\s+AS\s+\w+)?"
    tables = re.findall(table_pattern, sql_query)
    tables = [table for table_tuple in tables for table in table_tuple if table]  # Flatten and filter out empty strings

    # Extract column names from SELECT, ON, WHERE, and ORDER BY clauses
    column_pattern = r"SELECT\s+(.*?)\s+FROM|ON\s+([\w\.]+)\s*=\s*([\w\.]+)|WHERE\s+([\w\.]+)\s*=\s*['\"]?(.*?)['\"]?|ORDER\s+BY\s+([\w\.]+)"
    columns = re.findall(column_pattern, sql_query)
    columns = [item for sublist in columns for item in sublist if item]  # Flatten and filter out empty strings

    # Extract JOIN clause
    join_pattern = r"JOIN\s+[a-zA-Z0-9_]+\s+(?:AS\s+\w+\s+)?ON\s+[\w\.]+\s*=\s*[\w\.]+"
    join_clause = re.findall(join_pattern, sql_query)

    # Extract encoded values from WHERE clause
    where_pattern = r"WHERE\s+([\w\.]+\s*=\s*['\"].*?['\"]|\d+)"
    encoded_values = re.findall(where_pattern, sql_query)

    # Output formatting
    print("Table Names:\n")
    for table in sorted(set(tables)):
        print(f"- {table}")

    print("\nColumn Names:\n")
    for column in sorted(set(columns)):
        print(f"- {column}")

    if join_clause:
        print("\nJoin Clause:\n")
        for join in join_clause:
            print(f"{join} (This joins the tables using the specified ON condition.)")

    if encoded_values:
        print("\nEncoded Values:\n")
        for value in encoded_values:
            key_value = value.split('=')
            print(f"- {key_value[0].strip()} = {key_value[1].strip()} (This filters the results to include only those rows where {key_value[0].strip()} is equal to {key_value[1].strip()})")
    else:
        print("\nEncoded Values:\n- None")

# Example 1: With JOIN and WHERE clause
sql_query_with_join = """
SELECT ic.pcinstnm
FROM adm2022 a
JOIN ic2022campuses ic
ON a.UNITID = ic.UNITID
WHERE admcon3 = 1;
"""

# Example 2: With JOIN, WHERE, and ORDER BY clause
sql_query_ex3 = """
SELECT ic.pcinstnm
FROM ic2022campuses AS ic
JOIN gr2022 AS gr
ON ic.unitid = gr.unitid
WHERE ic.pcstabbr = 'TX'
ORDER BY gr.grtotlm DESC
LIMIT 1;
"""

# Example 3: Without JOIN, with WHERE clause
sql_query_without_join = """
SELECT webaddr
FROM hd2022
WHERE instnm = 'Boston University';
"""

# Test the function with the SQL queries
print("Example 1 Output:")
extract_sql_details(sql_query_with_join)
print("\n\nExample 2 Output:")
extract_sql_details(sql_query_ex3)
print("\n\nExample 3 Output:")
extract_sql_details(sql_query_without_join)


Example 1 Output:
Table Names:

- adm2022
- ic2022campuses

Column Names:

- a.UNITID
- admcon3
- ic.UNITID
- ic.pcinstnm

Encoded Values:
- None


Example 2 Output:
Table Names:

- gr2022
- ic2022campuses

Column Names:

- gr.grtotlm
- gr.unitid
- ic.pcinstnm
- ic.pcstabbr
- ic.unitid

Join Clause:

JOIN gr2022 AS gr
ON ic.unitid = gr.unitid (This joins the tables using the specified ON condition.)

Encoded Values:

- ic.pcstabbr = 'TX' (This filters the results to include only those rows where ic.pcstabbr is equal to 'TX')


Example 3 Output:
Table Names:

- hd2022

Column Names:

- instnm
- webaddr

Encoded Values:

- instnm = 'Boston University' (This filters the results to include only those rows where instnm is equal to 'Boston University')




**Step 2: Updated langchain context**



In [11]:
file_path = '/content/Langchain_context.csv'
df = pd.read_csv('/content/Langchain_context.csv')

df.head()

Unnamed: 0,Human Questions,Langchain created Context
0,Which schools require high school grades,"{\n ""documents"": [\n {\n ""page_conten..."
1,What are the top 5 universities in Massachuset...,"{\n ""documents"": [\n {\n ""page_conten..."
2,What is the Boston University website?,"{\n ""documents"": [\n {\n ""page_conten..."
3,List the universities in California state alon...,"{\n ""documents"": [\n {\n ""page_conten..."
4,"Provide details on Boston University, Boston.","{\n ""documents"": [\n {\n ""page_conten..."


In [12]:
# Function to extract table info
def extract_table_info(json_str):
    try:
        data = json.loads(json_str)
        output_dict = {}
        for document in data.get("documents", []):
            metadata = document.get("metadata", {})
            table_name = metadata.get("Table_Name", "N/A")
            table_description = metadata.get("Table_Description", "N/A")
            output_dict[table_name] = table_description
        return json.dumps(output_dict, indent=4)
    except Exception as e:
        return str(e)


In [13]:
# Apply the function to the "Langchain created Context" column
df['Table_info'] = df['Langchain created Context'].apply(extract_table_info)

# Save the updated dataframe to a new Excel file
output_file_path = '/content/updated_langchain_context.csv'
df.to_csv(output_file_path, index=False)

print("Updated CSV file saved to:", output_file_path)


Updated CSV file saved to: /content/updated_langchain_context.csv


In [59]:
from google.colab import files
files.download(output_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Step 3: Embedding scoring**

In [14]:
"""
 asymmetric semantic search, you usually have a short query (like a question or some keywords)
 and you want to find a longer paragraph answering the query. An example would be a query like
 “What is Python” and you want to find the paragraph “Python is an interpreted, high-level and
 general-purpose programming language. Python’s design philosophy …”.
  For asymmetric tasks, flipping the query and the entries in your corpus usually does not make sense.

"""

'\n asymmetric semantic search, you usually have a short query (like a question or some keywords)\n and you want to find a longer paragraph answering the query. An example would be a query like\n “What is Python” and you want to find the paragraph “Python is an interpreted, high-level and\n general-purpose programming language. Python’s design philosophy …”.\n  For asymmetric tasks, flipping the query and the entries in your corpus usually does not make sense.\n\n'

In [15]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
# use the updated csv file
#file_path = '/content/updated_langchain_context.csv'
#df = pd.read_csv(file_path)

# Function to calculate asymmetric similarity between a question and table info
def calculate_asymmetric_similarity(question, table_info):
    # Ensure inputs are strings
    question= str(question) # Convert question to string if it's not
    table_info= str(table_info) # Convert table_info to string if it's not

    # Encode the question and table info
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    table_info_embedding = embedder.encode(table_info, convert_to_tensor=True)

    # Calculate cosine similarity
    similarity_score = torch.nn.functional.cosine_similarity(question_embedding, table_info_embedding, dim=0)

    return similarity_score.item()

# Apply the similarity function for each row
df['Similarity_Score'] = df.apply(lambda row: calculate_asymmetric_similarity(row['Human Questions'], row['Table_info']), axis=1)

# Save the updated dataframe to a new Excel file
output_file_path = '/content/updated_langchain_context_with_similarity.csv'
df.to_csv(output_file_path, index=False)

print("Updated CSV file saved to:", output_file_path)



Updated CSV file saved to: /content/updated_langchain_context_with_similarity.csv


In [17]:
df.head()

Unnamed: 0,Human Questions,Langchain created Context,Table_info,Similarity_Score
0,Which schools require high school grades,"{\n ""documents"": [\n {\n ""page_conten...","{\n ""ic2022campuses"": ""This table contains ...",0.318869
1,What are the top 5 universities in Massachuset...,"{\n ""documents"": [\n {\n ""page_conten...","{\n ""ic2022campuses"": ""This table contains ...",0.488724
2,What is the Boston University website?,"{\n ""documents"": [\n {\n ""page_conten...","{\n ""ic2022campuses"": ""This table contains ...",0.286951
3,List the universities in California state alon...,"{\n ""documents"": [\n {\n ""page_conten...","{\n ""ic2022campuses"": ""This table contains ...",0.397723
4,"Provide details on Boston University, Boston.","{\n ""documents"": [\n {\n ""page_conten...","{\n ""ic2022campuses"": ""This table contains ...",0.318588


**Step 4: semantic search results with actual score**

In [18]:
# Load the pre-trained model (optimized for GPU usage)
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')

# Load the updated Excel file
#file_path = '/content/drive/MyDrive/ALY6080/FINAL_folder/updated_langchain_context_with_similarity.xlsx'
#df = pd.read_csv(file_path)

# Encode all Table_info entries into embeddings
table_infos = df['Table_info'].tolist()
table_info_embeddings = model.encode(table_infos, convert_to_tensor=True)

# Function to find top 3 semantically similar Table_info entries and the actual table info score for a given question
def find_top3_similar_tables(question, actual_table_info, table_info_embeddings, table_infos):
    # Ensure inputs are strings
    question= str(question) # Convert question to string if it's not

    question_embedding = model.encode(question, convert_to_tensor=True)
    actual_table_info_embedding = model.encode(actual_table_info, convert_to_tensor=True)

    # Compute cosine similarities between the question and all table_infos
    cosine_scores = util.pytorch_cos_sim(question_embedding, table_info_embeddings)[0]

    # Compute similarity for the actual table info
    actual_table_info_score = util.pytorch_cos_sim(question_embedding, actual_table_info_embedding)[0].item()

    # Find the top 3 matches
    top_results = torch.topk(cosine_scores, k=3)

    # Retrieve the top 3 similar table infos
    top_table_infos = [table_infos[idx] for idx in top_results.indices]
    top_scores = [cosine_scores[idx].item() for idx in top_results.indices]

    return actual_table_info_score, top_table_infos, top_scores

# List to store results
results = []

# Iterate through each question and find top 3 similar Table_info entries
for index, row in df.iterrows():
    question = row['Human Questions']
    actual_table_info = row['Table_info']

    actual_table_info_score, top_table_infos, top_scores = find_top3_similar_tables(question, actual_table_info, table_info_embeddings, table_infos)

    results.append({
        "Question": question,
        "Actual Table Info": actual_table_info,
        "Actual Table Info Score": actual_table_info_score,
        "Top 1 Similar Table Info": top_table_infos[0],
        "Top 1 Score": top_scores[0],
        "Top 2 Similar Table Info": top_table_infos[1],
        "Top 2 Score": top_scores[1],
        "Top 3 Similar Table Info": top_table_infos[2],
        "Top 3 Score": top_scores[2]
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save the results to a new Excel file
output_file_path = '/content/semantic_search_results_with_actual_score.csv'
results_df.to_csv(output_file_path, index=False)

print("Results saved to:", output_file_path)


Results saved to: /content/semantic_search_results_with_actual_score.csv


**Step 5: Streamlit_data_prep**

In [19]:
# use the results DataFrame from the provided Excel file
#file_path = '/content/semantic_search_results_with_actual_score.csv'
#df = pd.read_csv(file_path)
df= results_df
# Function to extract table names from the JSON-like string in the specified columns
def extract_table_names(table_info):
    try:
        table_dict = ast.literal_eval(table_info)
        table_names = ','.join(table_dict.keys())
        return table_names
    except (ValueError, SyntaxError):
        return ''

# Apply the function to create new columns 'Actual Table Names', 'Top 1 Table Names', etc.
df['Actual Table Names'] = df['Actual Table Info'].apply(extract_table_names)
df['Top 1 Table Names'] = df['Top 1 Similar Table Info'].apply(extract_table_names)
df['Top 2 Table Names'] = df['Top 2 Similar Table Info'].apply(extract_table_names)
df['Top 3 Table Names'] = df['Top 3 Similar Table Info'].apply(extract_table_names)

# Save the updated DataFrame to a new Excel file
output_path = '/content/final_app.csv'
df.to_csv(output_path, index=False)
print("Results saved to:", output_path)


Results saved to: /content/final_app.csv
