# Calculate Phrase Similarity

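This notebook computes the semantic similarity between CTE names (both the original names and the ones generated by Mistral-7B-Instruct-v0.3) and the natural-language summary of each query, and writes the resulting scores back into the JSON data files.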

In [1]:
import re
import json

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained sentence-embedding model once at module level so that
# calculate_semantic_similarity_to_phrase reuses the same instance on every call.
st_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def calculate_semantic_similarity_to_phrase(variable_name, phrase):
    """Return the cosine similarity between a name and a phrase.

    Both texts are embedded with the module-level ``st_model`` and the two
    embeddings are compared with ``util.cos_sim``.

    Args:
        variable_name: The identifier text to embed.
        phrase: The descriptive text to compare it against.

    Returns:
        The cosine-similarity score, extracted as a scalar via ``.item()``.
    """
    # Embed the name first, then the phrase.
    name_vector, phrase_vector = [
        st_model.encode(text) for text in (variable_name, phrase)
    ]
    return util.cos_sim(name_vector, phrase_vector).item()


In [None]:
# Example usage: score a single name/phrase pair.
# Alternative pair to try (swap in for the assignments below):
# variable_name = "MostExpensive"
# phrase = "This code ranks items by price and selects the highest in each category."

variable_name = 'MostExpensive'
phrase = 'The query identifies and returns the locations (country, region, city) where there are employees but no customers in the Northwind database.'

# Cosine similarity between the embeddings of the name and the phrase.
similarity_score = calculate_semantic_similarity_to_phrase(variable_name, phrase)

print(f"Similarity between '{variable_name}' and phrase: {similarity_score}")

In [4]:
def remove_special_characters(input_string):
    """Strip everything except ASCII letters, digits and spaces.

    Args:
        input_string: The text to clean.

    Returns:
        ``input_string`` with every run of characters outside
        ``A-Z``, ``a-z``, ``0-9`` and the space character removed.
    """
    return re.sub(r'[^A-Za-z0-9 ]+', '', input_string)

In [None]:
def calculate_phrase_similarities(json_file):
    """Score every CTE name against its query summary and save the results.

    Reads a JSON array of objects from ``json_file``. For each object it
    computes the semantic similarity between the summary and (a) the original
    CTE name and (b) the Mistral-generated CTE name (with any leading
    ``CTE_`` prefix removed), stores both scores on the object, and rewrites
    the file in place.

    Args:
        json_file: Path to a JSON file containing a list of objects that may
            carry the keys ``"cte-name"``,
            ``"Mistral-7B-Instruct-v0.3-cte-name"`` and ``"summary"``.

    Side effects:
        Overwrites ``json_file`` with the same objects plus the keys
        ``"phrase_similarity"`` and
        ``"Mistral-7B-Instruct-v0.3-phrase_similarity"``. A score of 0 means
        the score was not computed because a required input was missing or
        empty.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for item in data:
        cte_name = item.get("cte-name")
        llm_cte_name = item.get("Mistral-7B-Instruct-v0.3-cte-name")
        summary = item.get("summary")

        similarity = 0
        llm_similarity = 0
        # Scores are only computed for items that have both an original CTE
        # name and a summary; everything else keeps the default of 0.
        if cte_name and summary:
            similarity = calculate_semantic_similarity_to_phrase(cte_name, summary)
            # The LLM-generated name can be absent or null in the data; calling
            # .removeprefix on None would raise, so leave that score at 0.
            if llm_cte_name:
                llm_similarity = calculate_semantic_similarity_to_phrase(
                    llm_cte_name.removeprefix('CTE_'), summary
                )

        # Append the similarities to the object
        item["phrase_similarity"] = similarity
        item["Mistral-7B-Instruct-v0.3-phrase_similarity"] = llm_similarity

    with open(json_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

#### Comparing LLM Similarities to SPIDER 2 LITE and Curated GitHub Data

In [9]:
# Forward slashes are used because "\c" and "\s" are invalid string escape
# sequences, and "/" works as a path separator on Windows as well as POSIX.
calculate_phrase_similarities('data_files/curated-ctes.json')
calculate_phrase_similarities('data_files/spider2-lite-ctes.json')