In [1]:
!pip install transformers sentencepiece



In [2]:
import requests
import json
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from numpy import dot
from numpy.linalg import norm
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
import re

THRESHOLD = 0.7

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

with open("/content/ultratool.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [4]:
tools = dict()

def fix_spacing(text: str) -> str:
    return re.sub(r'(?<=[a-z])([A-Z])', r' \1', text).lower()

for question in data:
    q_tools = question['tools']
    for q_tool in q_tools:
        if q_tool['name'] not in tools.keys():
            tools[q_tool['name']] = {
                'Parameters': {},
                'Outputs': {}
            }
            try:
                args = q_tool['arguments']['properties']
                results = q_tool['results']['properties']
            except KeyError:
                del tools[q_tool['name']]
                print(f"{q_tool['name']} has invalid structure, deleted")
                continue

            for arg_name, arg_data in args.items():
                try:
                    desc = f"{fix_spacing(arg_name).capitalize()}. {arg_data['type'].capitalize()}. {arg_data['description'].capitalize()}."
                    tools[q_tool['name']]['Parameters'][arg_name] = desc
                except KeyError:
                    try:
                        for script_name, script_data in arg_data['properties'].items():
                            desc = f"{fix_spacing(arg_name).capitalize()}. {fix_spacing(script_name).capitalize()}. {script_data['type'].capitalize()}. {script_data['description'].capitalize()}."
                            tools[q_tool['name']]['Parameters'][f"{arg_name}_{script_name}"] = desc
                    except KeyError:
                        del tools[q_tool['name']]
                        print(f"{q_tool['name']} has invalid params description, deleted")
                        break

            if q_tool['name'] in tools.keys():
                for result_name, result_data in results.items():
                    try:
                        desc = f"{fix_spacing(result_name).capitalize()}. {result_data['type'].capitalize()}. {result_data['description'].capitalize()}."
                        tools[q_tool['name']]['Outputs'][result_name] = desc
                    except:
                        try:
                            for script_name, script_data in result_data['properties'].items():
                                desc = f"{fix_spacing(result_name).capitalize()}. {fix_spacing(script_name).capitalize()}. {script_data['type'].capitalize()}. {script_data['description'].capitalize()}."
                                tools[q_tool['name']]['Parameters'][f"{result_name}_{script_name}"] = desc
                        except KeyError:
                            del tools[q_tool['name']]
                            print(f"{q_tool['name']} has invalid results description, deleted")
                            break

get_menu has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
get_menu has invalid results description, deleted
schedule_tool has invalid params description, deleted
update_schedule has invalid params description, deleted
update_schedule has invalid params description, deleted
schedule_tool has invalid params description, deleted
stock_trend_predictor has invalid results description, deleted
get_menu has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
get_menu has invalid results description, deleted
stock_trend_predictor has invalid results description, deleted
stock_trend_p

In [8]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_embeddings(params):
    '''
    Extract embedding from parameters/outputs descriptions.
    '''
    if len(params) == 0:
        return None

    encoded_input = tokenizer(params, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

def encode_funcs(funcs_preprocessed):
    '''
    Applies get_embeddings function to dictionary from get_funcs function.
    '''
    all_params_encoded, all_outputs_encoded = [], []
    for func, data in funcs_preprocessed.items():
        params = []
        outputs = []
        for _, text in data['Parameters'].items():
            params.append(text)
        for _, text in data['Outputs'].items():
            outputs.append(text)

        params_encoded, outputs_encoded = get_embeddings(params), get_embeddings(outputs)
        all_params_encoded.append(params_encoded)
        all_outputs_encoded.append(outputs_encoded)
    return all_params_encoded, all_outputs_encoded

def get_cosine_similarity(emb1, emb2):
    cos_sim = dot(emb1, emb2)/(norm(emb1)*norm(emb2))
    return cos_sim

def get_bonds(all_params_encoded, all_outputs_encoded):
    '''
    Calculates cosine proximity between all extracted embeddings.
    Generates a list of edges depending on THRESHOLD.
    This list contains only function and parameter IDs.
    '''
    bonds = [] # [[input_func_id, output_func_id, input_param_id, cos_sim], ...]
    for input_func_id in tqdm(range(len(all_params_encoded))):
        if all_params_encoded[input_func_id] != None:
            for param_id in range(len(all_params_encoded[input_func_id])):
                for output_func_id in range(len(all_outputs_encoded)):
                    if output_func_id != input_func_id and all_outputs_encoded[output_func_id] != None:
                        for output_elem_id in range(len(all_outputs_encoded[output_func_id])):
                            cosine_similarity = get_cosine_similarity(
                                all_params_encoded[input_func_id][param_id],
                                all_outputs_encoded[output_func_id][output_elem_id])

                            if cosine_similarity > THRESHOLD:
                                bonds.append([input_func_id, output_func_id, param_id, output_elem_id, cosine_similarity])
    return bonds

def decode_bonds(bonds, funcs_preprocessed):
    '''
    Decodes function and parameter IDs from get_bonds dunction into their names.
    '''
    func_names, params_names, output_names = [], [], []
    for func, data in funcs_preprocessed.items():
        params = [param for param in data['Parameters'].keys()]
        params_names.append(params)
        outputs = [output for output in data['Outputs'].keys()]
        output_names.append(outputs)
        func_names.append(func)
    bonds_encoded = []
    for bond in bonds:
        bonds_encoded.append([
            func_names[bond[0]],
            func_names[bond[1]],
            params_names[bond[0]][bond[2]],
            output_names[bond[1]][bond[3]],
            bond[4]
        ])
    return bonds_encoded

In [6]:
params, outputs = encode_funcs(tools)
bonds = get_bonds(params, outputs)
decoded_bonds = decode_bonds(bonds, tools)
df = pd.DataFrame(decoded_bonds, columns=['input_func', 'output_func', 'input_param', 'output_element', 'cos_sim'])
df.sort_values('cos_sim', ascending=False, ignore_index=True)

1


IndexError: list index out of range

# Теперь разметим ту часть функций, которые присутствуют в размеченном датасете

In [26]:
with open("/content/graph_desc.json", "r", encoding="utf-8") as file:
    content = file.read().strip()
    if not content:
        content = "{}"
    data = json.loads(content)

nodes = [node['id'] for node in data['nodes']]

In [32]:
tools_gt = dict()
for k, v in tools.items():
    if k in nodes:
        tools_gt[k] = v

print(len(tools_gt.keys()), len(nodes))

258 260


In [55]:
params_gt, outputs_gt = encode_funcs(tools_gt)
bonds_gt = get_bonds(params_gt, outputs_gt)
decoded_bonds_gt = decode_bonds(bonds_gt, tools_gt)
df_gt = pd.DataFrame(decoded_bonds_gt, columns=['input_func', 'output_func', 'input_param', 'output_element', 'cos_sim'])

df_gt_graph = df_gt[['output_func', 'input_func', 'cos_sim']]
df_gt_graph = df_gt_graph.rename(columns={'output_func':'source', 'input_func':'target', 'cos_sim':'cos_sim'})
df_gt_graph.sort_values('cos_sim', ascending=False, ignore_index=True, inplace=True)
df_gt_graph.drop_duplicates(subset=['source', 'target'], keep='first', inplace=True, ignore_index=True)

df_gt_graph

Unnamed: 0,source,target,cos_sim
0,check_room_availability,payment_processing,1.000000
1,hotel_booking,send_confirmation,1.000000
2,deposit_product_search,credit_card_debt,1.000000
3,payment_processing,hotel_booking_query,1.000000
4,hotel_booking,hotel_booking_query,1.000000
...,...,...,...
701,flight_info_query,create_schedule,0.700730
702,book_restaurant,checkout_api,0.700457
703,travel_plan_maker,visa_application,0.700400
704,check_room_booking_status,generate_meeting_alert,0.700332


In [61]:
df_gt_graph[df_gt_graph['cos_sim'] > 0.718].reset_index()

Unnamed: 0,index,source,target,cos_sim
0,0,check_room_availability,payment_processing,1.000000
1,1,hotel_booking,send_confirmation,1.000000
2,2,deposit_product_search,credit_card_debt,1.000000
3,3,payment_processing,hotel_booking_query,1.000000
4,4,hotel_booking,hotel_booking_query,1.000000
...,...,...,...,...
605,605,travel_group_schedule,advance_ticket_booking,0.718453
606,606,travel_plan_maker,travel_camera_info,0.718315
607,607,travel_plan_maker,travel_diary_generator,0.718315
608,608,weather_query,smart_home_control,0.718261


In [62]:
source, target = [], []
for link in data['links']:
    if link['source'] in tools_gt.keys() and link['target'] in tools_gt.keys():
        source.append(link['source'])
        target.append(link['target'])

df_target_graph = pd.DataFrame({'source': source, 'target': target})
df_target_graph

Unnamed: 0,source,target
0,create_reminder,clock_alarm_change
1,book_restaurant,get_dish_id
2,select_hotel,check_room_availability
3,file_write,weather_query
4,bank_balance_query,domestic_remittance
...,...,...
603,special_foreign_currency_purchase,transfer_money
604,create_meeting,business_communication
605,train_ticket_cancelling,identity_verification
606,website_design_tool,event_planning_tool


In [75]:
import pandas as pd
import networkx as nx
import numpy as np
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr


def encode_graph(df, mapping=None):
    """Кодирует вершины строками в числовые значения, используя общий mapping."""
    if mapping is None:
        unique_nodes = pd.unique(df[['source', 'target']].values.ravel('K'))
        mapping = {node: i for i, node in enumerate(unique_nodes)}

    df_encoded = df.replace(mapping)
    return df_encoded, mapping

def build_graph(df):
    """Строит ориентированный граф из DataFrame."""
    G = nx.DiGraph()
    G.add_edges_from(df.to_records(index=False))
    return G

def jaccard_similarity(G1, G2):
    """Метрика Жаккара: сравнивает множества рёбер двух графов."""
    edges1, edges2 = set(G1.edges()), set(G2.edges())
    return len(edges1 & edges2) / len(edges1 | edges2) if edges1 | edges2 else 1

def dice_similarity(G1, G2):
    """Метрика Дайса: сходство на основе доли общих рёбер."""
    edges1, edges2 = set(G1.edges()), set(G2.edges())
    return 2 * len(edges1 & edges2) / (len(edges1) + len(edges2)) if edges1 or edges2 else 1

def degree_correlation(G1, G2):
    """Корреляция степеней вершин (пирсоновская корреляция)."""
    degrees1 = dict(G1.degree())
    degrees2 = dict(G2.degree())
    common_nodes = set(degrees1) & set(degrees2)

    if not common_nodes:
        return 0

    deg1 = np.array([degrees1[node] for node in common_nodes])
    deg2 = np.array([degrees2[node] for node in common_nodes])

    return pearsonr(deg1, deg2)[0]

def cosine_degree_similarity(G1, G2):
    """Косинусное сходство степеней вершин."""
    degrees1 = dict(G1.degree())
    degrees2 = dict(G2.degree())
    common_nodes = set(degrees1) & set(degrees2)

    if not common_nodes:
        return 0

    deg1 = np.array([degrees1[node] for node in common_nodes])
    deg2 = np.array([degrees2[node] for node in common_nodes])

    norm1 = np.linalg.norm(deg1)
    norm2 = np.linalg.norm(deg2)

    return np.dot(deg1, deg2) / (norm1 * norm2) if norm1 and norm2 else 1

def rmse_degree_difference(G1, G2):
    """Среднеквадратичное отклонение степеней вершин."""
    degrees1 = dict(G1.degree())
    degrees2 = dict(G2.degree())
    common_nodes = set(degrees1) & set(degrees2)

    if not common_nodes:
        return float('inf')

    deg1 = np.array([degrees1[node] for node in common_nodes])
    deg2 = np.array([degrees2[node] for node in common_nodes])

    return np.sqrt(np.mean((deg1 - deg2) ** 2))

def mean_degree_difference(G1, G2):
    """Разница в средней степени вершин."""
    mean_deg1 = np.mean([d for _, d in G1.degree()])
    mean_deg2 = np.mean([d for _, d in G2.degree()])
    return abs(mean_deg1 - mean_deg2)

def directed_degree_correlation(G1, G2):
    """Корреляция отдельно входящих и отдельно исходящих степеней вершин."""
    in_deg1 = dict(G1.in_degree())
    in_deg2 = dict(G2.in_degree())
    out_deg1 = dict(G1.out_degree())
    out_deg2 = dict(G2.out_degree())

    common_nodes = set(in_deg1) & set(in_deg2) & set(out_deg1) & set(out_deg2)

    if not common_nodes:
        return 0

    in_deg1_vals = np.array([in_deg1[node] for node in common_nodes])
    in_deg2_vals = np.array([in_deg2[node] for node in common_nodes])
    out_deg1_vals = np.array([out_deg1[node] for node in common_nodes])
    out_deg2_vals = np.array([out_deg2[node] for node in common_nodes])

    in_corr = pearsonr(in_deg1_vals, in_deg2_vals)[0]
    out_corr = pearsonr(out_deg1_vals, out_deg2_vals)[0]

    return (in_corr + out_corr) / 2

df_gt_graph_filtered = df_gt_graph[df_gt_graph['cos_sim'] > 0.83].reset_index()
df_gt_graph_filtered = df_gt_graph_filtered[['source', 'target']]

all_nodes = pd.unique(pd.concat([df_target_graph, df_gt_graph_filtered])[['source', 'target']].values.ravel('K'))
mapping = {node: i for i, node in enumerate(all_nodes)}

df1_encoded, _ = encode_graph(df_target_graph, mapping)
df2_encoded, _ = encode_graph(df_gt_graph_filtered, mapping)

G1 = build_graph(df1_encoded)
G2 = build_graph(df2_encoded)

jac_sim = jaccard_similarity(G1, G2)
dice_sim = dice_similarity(G1, G2)
deg_corr = degree_correlation(G1, G2)
cos_sim = cosine_degree_similarity(G1, G2)
rmse_deg = rmse_degree_difference(G1, G2)
mean_deg_diff = mean_degree_difference(G1, G2)
directed_deg_corr = directed_degree_correlation(G1, G2)

print(f"Кол-во ребер в таргете: {df_target_graph.shape}, в предикте: {df_gt_graph_filtered.shape}")
print(f"Коэффициент Жаккара: {jac_sim:.3f}")
print(f"Коэффициент Дайса: {dice_sim:.3f}")
print(f"Корреляция степеней вершин: {deg_corr:.3f}")
print(f"Косинусное сходство степеней: {cos_sim:.3f}")
print(f"Среднеквадратичное отклонение степеней: {rmse_deg:.3f}")
print(f"Разница в средней степени вершин: {mean_deg_diff:.3f}")
print(f"Корреляция входящих/исходящих степеней: {directed_deg_corr:.3f}")

Кол-во ребер в таргете: (608, 2), в предикте: (192, 2)
Коэффициент Жаккара: 0.068
Коэффициент Дайса: 0.128
Корреляция степеней вершин: 0.572
Косинусное сходство степеней: 0.786
Среднеквадратичное отклонение степеней: 4.252
Разница в средней степени вершин: 1.635
Корреляция входящих/исходящих степеней: 0.483


  df_encoded = df.replace(mapping)


TODO
1. Пробовать другие модели для извлечения эмбеддингов
2. Менять промпты (формат и содержание строк, подаваемых на вход в эмбеддер)
3. Пробовать другие метрики близости эмбеддингов