In [45]:
!pip install transformers sentencepiece



In [46]:
import requests
import json
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from numpy import dot
from numpy.linalg import norm

#device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')#.to(device)
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')#.to(device)

In [47]:
with open("/content/ultratool.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

In [48]:
parsed_data = dict()
parsed_tools = dict()

for row in data[:100]:
    tools = row['tools']
    question = row['question']
    parsed_data[question] = []
    for tool in tools:
        name = tool['name']
        parsed_data[question].append(name)
        if name not in parsed_tools.keys():
            description = tool['description']
            try:
                results = tool['results']['properties']
            except KeyError:
                parsed_tools[name] = description
                continue
            results_desc = '. Returns'
            for k, v in results.items():
                try:
                    res_desc = v['description']
                except KeyError:
                    try:
                        for k, v_ in v['properties'].items():
                            res_desc = v_['description']
                            results_desc += f' {res_desc[0].lower()}{res_desc[1:]},'
                    except KeyError:
                        for k, v__ in v['items']['properties'].items():
                            res_desc = v__['description']
                            results_desc += f' {res_desc[0].lower()}{res_desc[1:]},'
                results_desc += f' {res_desc[0].lower()}{res_desc[1:]},'
            results_desc = results_desc[:-1] + '.'
            description += results_desc
            parsed_tools[name] = description

In [49]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_embeddings(sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    #sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

def get_cosine_similarity(emb1, emb2):
    cos_sim = dot(emb1, emb2)/(norm(emb1)*norm(emb2))
    return cos_sim

def list_metrics(predict, target):
    predict = set(predict)
    target = set(target)

    intersection = predict.intersection(target)
    union = predict.union(target)

    tn = len([item for item in predict if item not in target])
    fp = len([item for item in target if item not in predict])

    intersection_length = len(intersection)
    union_length = len(union)

    jaccard = intersection_length / union_length

    try:
        fp_tn = fp / tn
    except ZeroDivisionError:
        fp_tn = 3

    return fp_tn, jaccard

In [50]:
questions_emb = get_embeddings(list(parsed_data.keys()))
tools_emb = get_embeddings(list(parsed_tools.values()))

In [51]:
THRESHOLD = 0.4

jaccard_total = []
fp_tn_total = []

for i, (question, used_tools) in enumerate(parsed_data.items()):
    predicted_tools = []
    for j, (tool_name, tool_description) in enumerate(parsed_tools.items()):
        cos_sim = get_cosine_similarity(questions_emb[i], tools_emb[j])
        if cos_sim > THRESHOLD:
            predicted_tools.append(tool_name)
    fp_tn, jaccard = list_metrics(predicted_tools, used_tools)
    jaccard_total.append(jaccard)
    fp_tn_total.append(fp_tn)

print(f'jaccard: {sum(jaccard_total) / len(jaccard_total)}')
print(f'fp_tn: {sum(fp_tn_total) / len(fp_tn_total)}')

jaccard: 0.3551772116772117
fp_tn: 1.5610357142857145


In [51]:
def get_embeddings(question, tools_descriptions):
    sentences = tools_descriptions + [question]
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings[:-1], sentence_embeddings[-1]


jaccard_total = []
fp_tn_total = []
for question, used_tools in parsed_data.items():
    predicted_tools = []
    tools_emb, question_emb = get_embeddings(question, list(parsed_tools.values()))
    for i, (tool_name, tool_description) in enumerate(parsed_tools.items()):
        cos_sim = get_cosine_similarity(tools_emb[i], question_emb)
        if cos_sim > THRESHOLD:
            predicted_tools.append(tool_name)
    fp_tn, jaccard = list_metrics(predicted_tools, used_tools)
    jaccard_total.append(jaccard)
    fp_tn_total.append(fp_tn)

print(f'jaccard: {sum(jaccard_total) / len(jaccard_total)}')
print(f'fp_tn: {sum(fp_tn_total) / len(fp_tn_total)}')