In [1]:
# Install the third-party packages that the import cell below relies on
# (scikit-learn, numpy, pandas, sentence-transformers) into the kernel's environment.
%pip install scikit-learn
%pip install numpy
%pip install pandas
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Toy catalogue: every entry pairs a dataset name with a one-line description
datasets = [
    dict(name=label, description=blurb)
    for label, blurb in (
        ("cat", "the structure of a cat body"),
        ("dog", "the structure of a dog body"),
        ("bird", "the structure of a bird body"),
    )
]

# Query sentence that the dataset descriptions are ranked against
text = "dog's physique is comprised of 4 legs"

In [3]:
# Instantiate the pre-trained sentence encoder used for all embeddings in this notebook
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the query as a batch of one and keep its single row
query_batch = model.encode([text])
text_embedding = query_batch[0]

# Embed every dataset description in one call, in the same order as `datasets`
dataset_embeddings = model.encode([entry['description'] for entry in datasets])

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [4]:
# Cosine similarity of the query embedding against each dataset description embedding
similarities = cosine_similarity([text_embedding], dataset_embeddings)[0]

# Attach each score to its dataset (name, description, similarity) and rank
# from most to least similar; ties keep the order they have in `datasets`
result = sorted(
    (
        (entry['name'], entry['description'], score)
        for entry, score in zip(datasets, similarities)
    ),
    key=lambda row: row[2],
    reverse=True,
)

In [5]:
# Number of most relevant datasets to retrieve
k = 2
top_k_datasets = result[:k]

# Report name and score (two decimals) for each retained dataset
print("Most relevant datasets:")
for dataset_name, _description, similarity in top_k_datasets:
    print(f"Dataset: {dataset_name}, Similarity: {similarity:.2f}")

Most relevant datasets:
Dataset: dog, Similarity: 0.63
Dataset: bird, Similarity: 0.31


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import json

# Loaded models keyed by model name, so repeated calls do not re-load the same weights.
_MODEL_CACHE = {}


def _get_sentence_model(model_name: str):
    """Return the SentenceTransformer for ``model_name``, constructing it only on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def find_top_k_datasets(
    claim: str,
    k: int,
    description_path: str = './DataDescription/description.json',
    model_name: str = 'paraphrase-MiniLM-L6-v2',
    model=None,
):
    """Rank the described datasets by semantic similarity to ``claim``.

    The description file is read as JSON and iterated as a sequence of
    mappings that each provide a ``'name'`` and a ``'description'`` key.
    The claim and every description are embedded with a sentence encoder,
    scored with cosine similarity, and the ``k`` best matches are printed
    and returned.

    Args:
        claim: Free-text statement to match against the dataset descriptions.
        k: Maximum number of datasets to return. Must not be negative.
            ``None`` returns the full ranking (plain slice semantics).
        description_path: Path of the JSON description file.
        model_name: Name passed to ``SentenceTransformer`` when ``model`` is
            not supplied. The loaded model is cached per name.
        model: Optional already-loaded encoder exposing ``encode(list_of_str)``;
            when given, ``model_name`` is ignored and nothing is loaded.

    Returns:
        A list of ``(name, description, similarity)`` tuples sorted by
        descending similarity, at most ``k`` long. Empty when the description
        file contains no entries.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k is not None and k < 0:
        # A negative slice bound would silently drop entries from the END of
        # the ranking instead of limiting its length, so reject it up front.
        raise ValueError(f"k must be non-negative, got {k}")

    # Load the dataset descriptions; JSON is read as UTF-8 rather than the
    # platform-default encoding so non-ASCII text decodes the same everywhere.
    with open(description_path, 'r', encoding='utf-8') as openfile:
        datasets = json.load(openfile)

    if datasets:
        if model is None:
            model = _get_sentence_model(model_name)

        # Compute embeddings for the claim and the dataset descriptions
        text_embedding = model.encode([claim])[0]
        dataset_embeddings = model.encode([dataset['description'] for dataset in datasets])

        # Compute cosine similarity between the claim and dataset descriptions
        similarities = cosine_similarity([text_embedding], dataset_embeddings)[0]

        # Combine the dataset names, descriptions, and their corresponding similarities
        result = [
            (dataset['name'], dataset['description'], similarity)
            for dataset, similarity in zip(datasets, similarities)
        ]
        # Sort the result based on similarity in descending order
        result.sort(key=lambda x: x[2], reverse=True)
    else:
        # Nothing to embed or score: skip the model entirely, since
        # scikit-learn's input validation rejects an array with zero samples.
        result = []

    top_k_datasets = result[:k]

    print("Most relevant datasets:")
    for dataset_name, _, similarity in top_k_datasets:
        print(f"Dataset: {dataset_name}, Similarity: {similarity:.2f}")

    return top_k_datasets

# Example query: rank the described datasets against a sample claim, keep the best two
claim = "The energy consumption level of the US was super bad last year."
top_matches = find_top_k_datasets(claim, k=2)
print(top_matches)

: 