In [12]:
import logging
from typing import List, Optional, Any, Dict, Tuple
from numpy import dot
from numpy.linalg import norm

import requests
import json
import warnings
import pandas as pd
from sentence_transformers import SentenceTransformer
from pydantic import BaseModel, create_model, validator, ValidationError
from langchain.chat_models.base import BaseChatModel
from langchain.schema import (
    BaseMessage,
    AIMessage,
    HumanMessage,
    SystemMessage,
    ChatResult,
    ChatGeneration,
)
from langchain.tools import StructuredTool
from langchain.agents import create_structured_chat_agent, AgentExecutor
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from pydantic import PrivateAttr

warnings.filterwarnings("ignore")

# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_colwidth", None)
# pd.set_option("display.expand_frame_repr", False)
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from scipy.spatial.distance import cosine

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/galyukshev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Read the xlam function-calling JSON file into a DataFrame and display it.
data = pd.read_json(
    path_or_buf="/Users/galyukshev/Desktop/smol-llm-research/xlam-function-calling-60k/xlam-function-calling-60k-used.json"
)
data

Unnamed: 0,id,query,answers,tools
0,47119,Find the latest 20 tweets from the user with t...,"[{'name': 'get_user_tweets', 'arguments': {'co...","[{'name': 'song_download', 'description': 'Dow..."
1,43159,Find a pastry recipe with 'chocolate' as an in...,"[{'name': 'pastry_ingredient', 'arguments': {'...","[{'name': 'pastry_ingredient', 'description': ..."
2,52763,1. Retrieve the availability status of a listi...,"[{'name': 'listing_status', 'arguments': {'is_...","[{'name': 'listing_status', 'description': 'Re..."
3,6431,"What's the missing integer in the list [0, 1, ...","[{'name': 'find_missing_number', 'arguments': ...","[{'name': 'find_missing_number', 'description'..."
4,428,Compute the 20th Fibonacci number and the fact...,"[{'name': 'fibonacci', 'arguments': {'n': 20}}...","[{'name': 'fibonacci', 'description': 'Calcula..."
5,40876,Calculate the standard deviation of the number...,"[{'name': 'std_deviation', 'arguments': {'numb...","[{'name': 'std_deviation', 'description': 'Cal..."
6,36526,1. Check the availability status of a listing ...,"[{'name': 'listing_status', 'arguments': {'is_...","[{'name': 'listing_status', 'description': 'Re..."
7,37040,"Give me a salad recipe with bell peppers, and ...","[{'name': 'salad_ingredient', 'arguments': {'i...","[{'name': 'salad_ingredient', 'description': '..."
8,46776,Download the song from this SoundCloud URL: 'h...,"[{'name': 'song_download', 'arguments': {'trac...","[{'name': 'song_download', 'description': 'Dow..."
9,48062,Display the crops grown in Spain with the subc...,"[{'name': 'crops_list', 'arguments': {'subcate...","[{'name': 'crops_list', 'description': 'Fetche..."


In [14]:
# Tool specifications stored in the "tools" column of the row labelled 0.
tools = data.loc[0, "tools"]
tools

[{'name': 'song_download',
  'description': 'Downloads a song from the provided SoundCloud URL using the specified RapidAPI key.',
  'parameters': {'track_url': {'description': 'The URL of the SoundCloud track to be downloaded.',
    'type': 'str',
    'default': 'https://soundcloud.com/user-977421934/the-phoenix'}}},
 {'name': 'get_user_info_by_user_id',
  'description': 'Fetch Instagram user information by user ID using the RapidAPI service.',
  'parameters': {'is_id': {'description': 'The Instagram user ID for which the information is to be fetched.',
    'type': 'int',
    'default': '18527'}}},
 {'name': 'get_user_tweets',
  'description': 'Fetches tweets for a specified user using the provided RapidAPI key.',
  'parameters': {'count': {'description': 'The number of tweets to retrieve.',
    'type': 'str',
    'default': '20'},
   'user': {'description': 'The ID of the user whose tweets are to be fetched.',
    'type': 'str',
    'default': '2455740283'},
   'cursor': {'descriptio

In [15]:
def preprocess_text(text, remove_stopwords=False):
    """Lowercase ``text``, tokenize it and keep only alphanumeric tokens.

    Args:
        text: Raw string to normalize.
        remove_stopwords: When True, tokens found in NLTK's English
            stop-word list are dropped as well. Defaults to False, which
            keeps every alphanumeric token (the previous behavior).

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Load the stop-word list only when it is actually used, rather than
    # reading it on every call and discarding it.
    excluded = set(stopwords.words("english")) if remove_stopwords else frozenset()
    return " ".join(
        token for token in tokens if token.isalnum() and token not in excluded
    )

In [16]:
# One record per tool: its name, its normalized description ("output") and
# the normalized description of each of its parameters ("args").
lst = []
for tool in tools:
    args = {
        param: preprocess_text(spec["description"])
        for param, spec in tool["parameters"].items()
    }
    lst.append(
        {
            "name": tool["name"],
            "output": preprocess_text(tool["description"]),
            "args": args,
        }
    )
lst

[{'name': 'song_download',
  'output': 'downloads a song from the provided soundcloud url using the specified rapidapi key',
  'args': {'track_url': 'the url of the soundcloud track to be downloaded'}},
 {'name': 'get_user_info_by_user_id',
  'output': 'fetch instagram user information by user id using the rapidapi service',
  'args': {'is_id': 'the instagram user id for which the information is to be fetched'}},
 {'name': 'get_user_tweets',
  'output': 'fetches tweets for a specified user using the provided rapidapi key',
  'args': {'count': 'the number of tweets to retrieve',
   'user': 'the id of the user whose tweets are to be fetched',
   'cursor': 'the cursor for pagination default is none'}},
 {'name': 'info',
  'output': 'fetches profile information for a given instagram username using the rapidapi service',
  'args': {'username': 'the instagram username for which to fetch profile information defaults to'}},
 {'name': 'get_highlight_by_id',
  'output': 'fetches all images and v

In [17]:
# TF-IDF corpus: each tool's description followed by that tool's parameter
# descriptions, in the order the tools appear in `lst`.
texts = [
    preprocess_text(description)
    for item in lst
    for description in (item["output"], *item["args"].values())
]
vectorizer = TfidfVectorizer()
embeddings_matrix = vectorizer.fit_transform(texts)
# Alternative (disabled): dense sentence embeddings via
# SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").encode(texts)
# Map each text to its dense TF-IDF row; a repeated text keeps its last row.
embeddings = dict(zip(texts, embeddings_matrix.toarray()))
embeddings

{'downloads a song from the provided soundcloud url using the specified rapidapi key': array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.34962271,
        0.        , 0.        , 0.        , 0.        , 0.34962271,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.30026004, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.30026004,
        0.23807044, 0.        , 0.        , 0.34962271, 0.30026004,
        0.        , 0.30026004, 0.26297595, 0.        , 0.        ,
        0.        , 0.30026004, 0.        , 0.        , 0.23807044,
        0.        , 0.        , 0.        ]),
 'the url of the soundcloud track to be downloaded': array([0.        , 0.        , 0.        , 0.3271053 , 0.        ,
        0.        , 0.        , 0.        , 0.43117507, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,


In [18]:
# Replace every preprocessed description in `lst` with its TF-IDF vector.
# The replacement happens in place, so only values that are still strings are
# converted: re-running this cell is then a no-op instead of failing when
# preprocess_text is handed an array.
for item in lst:
    if isinstance(item["output"], str):
        item["output"] = embeddings[preprocess_text(item["output"])]
    for key, value in item["args"].items():
        if isinstance(value, str):
            item["args"][key] = embeddings[preprocess_text(value)]
lst

[{'name': 'song_download',
  'output': array([0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.34962271,
         0.        , 0.        , 0.        , 0.        , 0.34962271,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.30026004, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.30026004,
         0.23807044, 0.        , 0.        , 0.34962271, 0.30026004,
         0.        , 0.30026004, 0.26297595, 0.        , 0.        ,
         0.        , 0.30026004, 0.        , 0.        , 0.23807044,
         0.        , 0.        , 0.        ]),
  'args': {'track_url': array([0.        , 0.        , 0.        , 0.3271053 , 0.        ,
          0.        , 0.        , 0.        , 0.43117507, 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.    

In [19]:
def get_cosine_similarity(emb1, emb2):
    """Return the cosine similarity between two vectors.

    Args:
        emb1: First vector (1-D array-like of numbers).
        emb2: Second vector, same length as ``emb1``.

    Returns:
        ``dot(emb1, emb2) / (norm(emb1) * norm(emb2))``, or ``0.0`` when
        either vector has zero norm (that case used to divide by zero and
        yield ``nan``).
    """
    denominator = norm(emb1) * norm(emb2)
    if denominator == 0:
        # An all-zero vector has no direction; report "no similarity".
        return 0.0
    return dot(emb1, emb2) / denominator

In [20]:
# Compare each tool's description vector with the parameter-description
# vectors of every *other* tool, then report the best score per tool pair.
#
# NOTE: the column is called "cosine_distance" (kept for compatibility), but
# the stored value comes from get_cosine_similarity, i.e. higher means closer.
results = []

for item in lst:
    output = item["output"]
    name = item["name"]

    for item1 in lst:
        # A tool is never paired with its own parameters; skip it before
        # doing any vector math.
        if item1["name"] == name:
            continue
        for arg_name, arg_values in item1["args"].items():
            # An all-zero vector has no direction. Score it 0.0 (no
            # similarity) so it cannot win the max() below; 1.0 would have
            # marked it as a perfect match.
            similarity = (
                get_cosine_similarity(output, arg_values)
                if np.any(output) and np.any(arg_values)
                else 0.0
            )
            results.append(
                {
                    "function_name": name,
                    "arg_from_function": item1["name"],
                    "arg": arg_name,
                    "cosine_distance": similarity,
                }
            )

# Explicit columns keep the groupby below valid even when `results` is empty.
cosine_distances_df = pd.DataFrame(
    results,
    columns=["function_name", "arg_from_function", "arg", "cosine_distance"],
)
cosine_distances_df.groupby(by=["function_name", "arg_from_function"])[
    "cosine_distance"
].max()

function_name             arg_from_function       
get_highlight_by_id       get_user_info_by_user_id    0.096539
                          get_user_tweets             0.102488
                          info                        0.040397
                          song_download               0.054400
get_user_info_by_user_id  get_highlight_by_id         0.253420
                          get_user_tweets             0.257552
                          info                        0.254105
                          song_download               0.048599
get_user_tweets           get_highlight_by_id         0.079757
                          get_user_info_by_user_id    0.204859
                          info                        0.087384
                          song_download               0.053094
info                      get_highlight_by_id         0.158585
                          get_user_info_by_user_id    0.245870
                          get_user_tweets             0.092885
    

In [21]:
tools

[{'name': 'song_download',
  'description': 'Downloads a song from the provided SoundCloud URL using the specified RapidAPI key.',
  'parameters': {'track_url': {'description': 'The URL of the SoundCloud track to be downloaded.',
    'type': 'str',
    'default': 'https://soundcloud.com/user-977421934/the-phoenix'}}},
 {'name': 'get_user_info_by_user_id',
  'description': 'Fetch Instagram user information by user ID using the RapidAPI service.',
  'parameters': {'is_id': {'description': 'The Instagram user ID for which the information is to be fetched.',
    'type': 'int',
    'default': '18527'}}},
 {'name': 'get_user_tweets',
  'description': 'Fetches tweets for a specified user using the provided RapidAPI key.',
  'parameters': {'count': {'description': 'The number of tweets to retrieve.',
    'type': 'str',
    'default': '20'},
   'user': {'description': 'The ID of the user whose tweets are to be fetched.',
    'type': 'str',
    'default': '2455740283'},
   'cursor': {'descriptio