# Similarity measures and word subtraction
Letícia Marçal Russo 0664618

In [1]:
import ast
import itertools
from collections import Counter
from itertools import combinations

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.metrics.distance import edit_distance
from nltk.tree import Tree
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, normalize
from zss import simple_distance, Node

In [2]:
### SBERT similarity on questions ###

# Load the pretrained SBERT model once at module level; the similarity
# function defined below reuses this single instance for every encode() call.
model_BERT = SentenceTransformer('bert-base-nli-mean-tokens')

# Define a function to calculate the sentence similarity using SBERT
def calculate_sentence_sim_SBERT(sent1, sent2):
    """Return the SBERT similarity of two sentences, rescaled to [0, 1].

    Both sentences are encoded together with the module-level ``model_BERT``
    and compared with cosine similarity.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        The cosine similarity of the two embeddings, mapped linearly from
        the cosine range [-1, 1] onto [0, 1].
    """
    sentence_vectors = model_BERT.encode([sent1, sent2])

    # cosine_similarity returns the full pairwise matrix for the two
    # embeddings; the off-diagonal entry is sentence 1 versus sentence 2.
    pairwise_matrix = cosine_similarity(sentence_vectors)
    raw_cosine = pairwise_matrix[0][1]

    # Shift and halve so that -1 maps to 0 and +1 maps to 1
    return (raw_cosine + 1) / 2

In [3]:
### Tree Edit Distance on parse trees ###

# Define function to convert a NLTK tree to node objects
def parse_tree_to_nodes(tree):
    """Recursively convert an NLTK tree (or a leaf string) into zss ``Node`` objects.

    Args:
        tree: Either a leaf given as a plain string, or a tree object that
            exposes ``label()`` and iterates over its children.

    Returns:
        A ``Node`` labelled with the leaf string or the tree label, with one
        converted child per child of ``tree``.
    """
    # Leaves are plain strings and become childless nodes
    if isinstance(tree, str):
        return Node(tree)

    # Inner nodes: convert every subtree first, then attach them to the label
    converted_children = list(map(parse_tree_to_nodes, tree))
    return Node(tree.label(), converted_children)

def count_nodes(node):
    """Return the number of nodes in the tree rooted at ``node``.

    Args:
        node: An object with a ``children`` iterable, or ``None``.

    Returns:
        0 when ``node`` is ``None``; otherwise 1 for the node itself plus the
        node counts of all of its children.
    """
    if node is None:
        return 0
    # The node itself plus everything below each child
    return 1 + sum(count_nodes(kid) for kid in node.children)


# Define a function to calculate the Tree Edit Distance
def calculate_tree_edit_distance(tree1, tree2):
    """Return the size-normalised tree edit distance between two parse trees.

    Args:
        tree1: First parse tree as a bracketed string readable by
            ``Tree.fromstring``.
        tree2: Second parse tree in the same format.

    Returns:
        The zss ``simple_distance`` between the two trees divided by the
        total number of nodes in both trees.
    """
    # Bracketed strings -> NLTK Tree objects
    nltk_trees = [Tree.fromstring(text) for text in (tree1, tree2)]

    # NLTK Tree objects -> zss Node objects
    zss_root1, zss_root2 = [parse_tree_to_nodes(parsed) for parsed in nltk_trees]

    raw_distance = simple_distance(zss_root1, zss_root2)

    # Normalise by the combined size so that larger trees are not penalised
    # simply for having more nodes to edit.
    combined_size = count_nodes(zss_root1) + count_nodes(zss_root2)
    return raw_distance / combined_size

In [4]:
### Word subtraction ### 

# Define function for word subtraction 
def subtract_nodes(nodes1, nodes2):
    """Return the multiset difference ``nodes1 - nodes2`` as a list.

    Surrounding whitespace is stripped from every name before comparing.
    Each occurrence in ``nodes2`` cancels one occurrence of the same name in
    ``nodes1``; names only present in ``nodes2`` are ignored.

    Args:
        nodes1: Iterable of node-name strings to subtract from.
        nodes2: Iterable of node-name strings to subtract.

    Returns:
        The remaining names, grouped by name in order of first appearance in
        ``nodes1``, repeated as many times as they are left over.
    """
    leftover = Counter(name.strip() for name in nodes1)
    leftover.subtract(name.strip() for name in nodes2)

    # elements() skips every name whose count dropped to zero or below
    return list(leftover.elements())

In [5]:
### Function to create pairwise comparisons and apply the three analyses ###

def pairwise_analysis(df):
    """Compare every pair of questions within each task.

    For each task, every unordered pair of rows is compared once (in index
    order) and described by: SBERT question similarity, normalised tree edit
    distance of the parse trees, the words of question 1 that do not occur
    in question 2, and the workflow nodes of row 1 that are left after
    subtracting the nodes of row 2.

    Args:
        df: DataFrame with the columns 'task', 'question', 'parse_tree' and
            'nodes' ('nodes' being a comma-separated string).

    Returns:
        A DataFrame with one row per compared pair. The columns are always
        present, even when no task has two or more questions.
    """
    output_columns = [
        'task',
        'question1',
        'question2',
        'parse_tree1',
        'parse_tree2',
        'nodes1',
        'nodes2',
        'question_sim_SBERT',
        'parse_tree_dist',
        'question_subtraction',
        'nodes_subtraction',
    ]
    output_rows = []

    # Iterate over unique tasks
    for task in df['task'].unique():
        task_df = df[df['task'] == task]

        # Perform pairwise comparison within the task
        for idx1, idx2 in itertools.combinations(task_df.index, 2):
            row1 = task_df.loc[idx1]
            row2 = task_df.loc[idx2]

            # Calculate question similarity SBERT
            question_sim_SBERT = calculate_sentence_sim_SBERT(row1['question'], row2['question'])

            # Calculate TED
            parse_tree_dist = calculate_tree_edit_distance(row1['parse_tree'], row2['parse_tree'])

            # Question subtraction: words of question 1 absent from question 2
            # (a set makes each membership test constant-time)
            question2_words = set(row2['question'].split())
            question_subtraction = [
                word for word in row1['question'].split()
                if word not in question2_words
            ]

            # Nodes subtraction: multiset difference nodes1 - nodes2
            nodes_subtraction = subtract_nodes(
                row1['nodes'].split(','),
                row2['nodes'].split(','),
            )

            output_rows.append({
                'task': row1['task'],
                'question1': row1['question'],
                'question2': row2['question'],
                'parse_tree1': row1['parse_tree'],
                'parse_tree2': row2['parse_tree'],
                'nodes1': row1['nodes'],
                'nodes2': row2['nodes'],
                'question_sim_SBERT': question_sim_SBERT,
                'parse_tree_dist': parse_tree_dist,
                'question_subtraction': ' '.join(question_subtraction),
                'nodes_subtraction': nodes_subtraction,
            })

    # Passing the columns explicitly keeps the schema stable when there are no pairs
    return pd.DataFrame(output_rows, columns=output_columns)

In [6]:
# Read the questions + parse trees from the XLSX file into a pandas df
# (pairwise_analysis reads its 'task', 'question', 'parse_tree' and 'nodes' columns)
question_df = pd.read_excel("questions_parsetree_final.xlsx", engine = "openpyxl")

In [7]:
# Run the pairwise comparison over all questions, task by task
output_df = pairwise_analysis(question_df)

# Slice off (at most) the first 60 rows and display them as the cell output
first_60_rows = output_df[:60]
first_60_rows 

Unnamed: 0,task,question1,question2,parse_tree1,parse_tree2,nodes1,nodes2,question_sim_SBERT,parse_tree_dist,question_subtraction,nodes_subtraction
0,access,What is the percentage of the rural population...,"What is the total rural population in Shikoku,...",(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (aggre aggregate 0) (c...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.904799,0.414634,percentage of within 2 kilometers of all-seaso...,"[Clip, /roads, Buffer, /roads_buffer, /rural_2..."
1,access,What is the percentage of the rural population...,What is the rural population within 2 kilomete...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.990467,0.098039,percentage,"[AddFields, CalculateField, CalculateField, Su..."
2,access,What is the percentage of the rural population...,What is the population within 2 kilometers of ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.98585,0.132075,percentage,"[AddFields, CalculateField, CalculateField, Ca..."
3,access,What is the percentage of the rural population...,What is the proportion of population within 2 ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC genpro 0 ira_) ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.989975,0.034483,percentage,"[AddFields, CalculateField, CalculateField, Ca..."
4,access,What is the percentage of the rural population...,What is the area of rural district within 2 ki...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC covamount 0 era...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.97583,0.090909,percentage population,"[AddFields, CalculateField, CalculateField, Ca..."
5,access,What is the percentage of the rural population...,What areas are within 2 kilometers of the all-...,(start what is (measure (coreC genpro 0 ira_) ...,(start (measure (location what areas)) are (co...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.977657,0.27451,is percentage population,"[AddFields, AddFields, /rural_2km_addfields, C..."
6,access,What is the percentage of the rural population...,What is the population for each rural district...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.899259,0.45,percentage of within 2 kilometers of all-seaso...,"[Clip, /roads, Buffer, /roads_buffer, /rural_2..."
7,access,What is the percentage of the rural population...,What are the areas within 2 kilometers of the ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what are (measure (coreC region 0)) (co...,"/urbanization, SelectLayerByAttribute, /urbani...","/roads, Buffer, /roads_buffer",0.920201,0.16,is percentage rural population,"[/urbanization, SelectLayerByAttribute, /urban..."
8,access,What is the percentage of the rural population...,"Where are the all-season roads in Shikoku, Japan",(start what is (measure (coreC genpro 0 ira_) ...,(start (measure (location where are (coreC obj...,"/urbanization, SelectLayerByAttribute, /urbani...",/roads,0.843195,0.578947,What is percentage of rural population within ...,"[/urbanization, SelectLayerByAttribute, /urban..."
9,access,What is the percentage of the rural population...,What is the population for each district in Sh...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...",/chochomoku,0.797812,0.45,percentage of rural within 2 kilometers of all...,"[/urbanization, SelectLayerByAttribute, /urban..."


In [8]:
# Save the df to an Excel file; index=False leaves the row index out of the file
output_df.to_excel("analysis_df.xlsx", index = False)

In [9]:
# Re-open the saved analysis so that it can be filtered.
# NOTE(review): judging by the displayed output, the list-valued
# 'nodes_subtraction' column comes back from Excel as text - confirm.
subworkflows_df = pd.read_excel("analysis_df.xlsx", engine = "openpyxl")

In [10]:
# Split a 'nodes' cell into its individual node names
def _split_nodes(value):
    """Return the non-empty, whitespace-stripped node names held in ``value``.

    A string is treated as a comma-separated list (for example
    "/roads, Buffer, /roads_buffer"); any other iterable is taken as already
    split into items.
    """
    if isinstance(value, str):
        value = value.split(',')
    names = (str(item).strip() for item in value)
    return [name for name in names if name]


# Define a function to check if one workflow is subset of another
def is_subset(row):
    """Return True when every node of ``row['nodes2']`` occurs in ``row['nodes1']``.

    The comparison is done on whole node names. Iterating the raw strings
    would compare single characters and wrongly accept, for instance,
    "/roads" as a subset of "/roads_buffer".

    Args:
        row: Mapping or DataFrame row with 'nodes1' and 'nodes2' entries,
            each a comma-separated string (or an iterable) of node names.

    Returns:
        True if the set of nodes2 names is contained in the set of nodes1
        names (an empty nodes2 is trivially contained), otherwise False.
    """
    available = set(_split_nodes(row['nodes1']))
    return all(node in available for node in _split_nodes(row['nodes2']))

# Apply the function row by row (axis=1) to create a new boolean column 'subset_check'
subworkflows_df['subset_check'] = subworkflows_df.apply(is_subset, axis=1)

# Use the boolean column as a mask: keep only the rows where 'subset_check' is True
filtered_df = subworkflows_df[subworkflows_df['subset_check']]

In [11]:
# Display the rows that passed the subset check (notebook cell output)
filtered_df

Unnamed: 0,task,question1,question2,parse_tree1,parse_tree2,nodes1,nodes2,question_sim_SBERT,parse_tree_dist,question_subtraction,nodes_subtraction,subset_check
0,access,What is the percentage of the rural population...,"What is the total rural population in Shikoku,...",(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (aggre aggregate 0) (c...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.904799,0.414634,percentage of within 2 kilometers of all-seaso...,"['Clip', '/roads', 'Buffer', '/roads_buffer', ...",True
1,access,What is the percentage of the rural population...,What is the rural population within 2 kilomete...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.990467,0.098039,percentage,"['AddFields', 'CalculateField', 'CalculateFiel...",True
2,access,What is the percentage of the rural population...,What is the population within 2 kilometers of ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.985850,0.132075,percentage,"['AddFields', 'CalculateField', 'CalculateFiel...",True
3,access,What is the percentage of the rural population...,What is the proportion of population within 2 ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC genpro 0 ira_) ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.989975,0.034483,percentage,"['AddFields', 'CalculateField', 'CalculateFiel...",True
4,access,What is the percentage of the rural population...,What is the area of rural district within 2 ki...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC covamount 0 era...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.975830,0.090909,percentage population,"['AddFields', 'CalculateField', 'CalculateFiel...",True
...,...,...,...,...,...,...,...,...,...,...,...,...
243,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in Wh/m2 for each ...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...",/solar_rad,0.890799,0.586957,are cells with slope lower than 45 degrees and...,"['RasterCalculator', '/solar_rad_kwh', '/dsm',...",True
244,solar,What are the rooftop cells with slope lower th...,"What are the slopes in Glover Park, Washington...",(start what are (measure (coreC object 0)) (co...,(start what are (measure (coreC field 0 rat_))...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...","/dsm, Slope, /dsm_slope",0.816133,0.387097,rooftop cells with slope lower than 45 degrees,"['/solar_rad', 'RasterCalculator', '/solar_rad...",True
246,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in KWh/m2 for each...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...","/solar_rad, RasterCalculator, /solar_rad_kwh",0.820802,0.454545,are cells with slope lower than 45 degrees,"['/dsm', 'Slope', '/dsm_slope', 'Con', '/dsm_r...",True
247,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in Wh/m2 for each ...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...",/solar_rad,0.822046,0.454545,are cells with slope lower than 45 degrees,"['RasterCalculator', '/solar_rad_kwh', '/dsm',...",True


In [12]:
##### Eliminate the data nodes (their names start with a '/') and keep only the tools ####

# Define a function to remove items starting with '/'
def remove_slash_items(row):
    """Return the items of ``row`` that do not start with '/'.

    Args:
        row: Iterable of strings.

    Returns:
        A new list with the remaining items in their original order.
    """
    kept_items = []
    for entry in row:
        # Skip every name that begins with a slash
        if entry.startswith('/'):
            continue
        kept_items.append(entry)
    return kept_items

# Create a copy of the df so that the filtered frame itself is left unmodified
filtered_df_copy = filtered_df.copy()

# Convert the items in the 'nodes_subtraction' column from text back to lists.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot
# execute code that happens to be stored in a spreadsheet cell.
filtered_df_copy['nodes_subtraction'] = filtered_df_copy['nodes_subtraction'].apply(ast.literal_eval)

# Apply the function to the 'nodes_subtraction' column using .loc
filtered_df_copy.loc[:, 'nodes_subtraction'] = filtered_df_copy['nodes_subtraction'].apply(remove_slash_items)

In [13]:
# Display the filtered rows after the slash-prefixed items were removed (notebook cell output)
filtered_df_copy

Unnamed: 0,task,question1,question2,parse_tree1,parse_tree2,nodes1,nodes2,question_sim_SBERT,parse_tree_dist,question_subtraction,nodes_subtraction,subset_check
0,access,What is the percentage of the rural population...,"What is the total rural population in Shikoku,...",(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (aggre aggregate 0) (c...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.904799,0.414634,percentage of within 2 kilometers of all-seaso...,"[Clip, Buffer, AddFields, CalculateField, Calc...",True
1,access,What is the percentage of the rural population...,What is the rural population within 2 kilomete...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.990467,0.098039,percentage,"[AddFields, CalculateField, CalculateField, Su...",True
2,access,What is the percentage of the rural population...,What is the population within 2 kilometers of ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC objconamount 0 ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.985850,0.132075,percentage,"[AddFields, CalculateField, CalculateField, Ca...",True
3,access,What is the percentage of the rural population...,What is the proportion of population within 2 ...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC genpro 0 ira_) ...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.989975,0.034483,percentage,"[AddFields, CalculateField, CalculateField, Ca...",True
4,access,What is the percentage of the rural population...,What is the area of rural district within 2 ki...,(start what is (measure (coreC genpro 0 ira_) ...,(start what is (measure (coreC covamount 0 era...,"/urbanization, SelectLayerByAttribute, /urbani...","/urbanization, SelectLayerByAttribute, /urbani...",0.975830,0.090909,percentage population,"[AddFields, CalculateField, CalculateField, Ca...",True
...,...,...,...,...,...,...,...,...,...,...,...,...
243,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in Wh/m2 for each ...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...",/solar_rad,0.890799,0.586957,are cells with slope lower than 45 degrees and...,"[RasterCalculator, Slope, Con, Con]",True
244,solar,What are the rooftop cells with slope lower th...,"What are the slopes in Glover Park, Washington...",(start what are (measure (coreC object 0)) (co...,(start what are (measure (coreC field 0 rat_))...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...","/dsm, Slope, /dsm_slope",0.816133,0.387097,rooftop cells with slope lower than 45 degrees,"[RasterCalculator, Con]",True
246,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in KWh/m2 for each...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...","/solar_rad, RasterCalculator, /solar_rad_kwh",0.820802,0.454545,are cells with slope lower than 45 degrees,"[Slope, Con]",True
247,solar,What are the rooftop cells with slope lower th...,What is the solar radiation in Wh/m2 for each ...,(start what are (measure (coreC object 0)) (co...,(start what is (measure (coreC conamount 0 era...,"/solar_rad, RasterCalculator, /solar_rad_kwh, ...",/solar_rad,0.822046,0.454545,are cells with slope lower than 45 degrees,"[RasterCalculator, Slope, Con]",True


In [14]:
# Save the final df to an Excel file; index=False leaves the row index out of the file
filtered_df_copy.to_excel("analysis_df_filtered.xlsx", index = False)