In [8]:
# !pip install chromadb --quiet


In [9]:
import logging
import sys

# Send every log record (DEBUG and above) to stdout through ONE root handler.
# basicConfig() already installs a stdout StreamHandler, so adding a second
# StreamHandler made each record print more than once.  force=True (Python
# 3.8+) drops handlers left behind by an earlier run of this cell, which keeps
# the cell idempotent when it is re-executed.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

In [10]:
from pathlib import Path

from git import Repo
from langchain.document_loaders import GitLoader, DirectoryLoader
import requests
from langchain.embeddings import OpenAIEmbeddings, FakeEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.callbacks import get_openai_callback
from joblib import Parallel, delayed



# Working directories, all relative to the notebook's current directory.
data_path, github_path = Path("data"), Path("github")
persist_directory = Path("persist")
# NOTE: the "perist" spelling is kept on purpose -- other cells use this name.
chroma_perist_directory = persist_directory.joinpath("chroma")


In [42]:
# Search criteria handed to get_repositories(): criterion kind -> values.
# Other kinds that can be switched on:
#   "topic": ["hubmap"]
#   "organization": ["hubmapconsortium"]
github_relevant = dict(repo=["hubmapconsortium/software-docs"])


# Function to retrieve repositories by topic
def get_repositories_by_topic(topic):
    """Return the repositories GitHub's search API lists for ``topic``.

    The search endpoint is paginated, so a single request only yields the
    first page.  This follows the ``Link: rel="next"`` response header until
    no further page is advertised.

    Args:
        topic: GitHub topic name, e.g. ``"hubmap"``.

    Returns:
        A list of repository dicts (the ``items`` of each page, concatenated),
        or ``[]`` after printing a message if any request does not return 200.

    Raises:
        requests.exceptions.RequestException: if a request cannot be completed
            or exceeds the timeout.
    """
    url = 'https://api.github.com/search/repositories'
    params = {'q': f'topic:{topic}', 'per_page': 100}
    repositories = []
    while url:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error occurred while accessing repositories for topic: {topic}")
            return []
        repositories.extend(response.json()['items'])
        # The "next" URL already carries the full query string.
        url = response.links.get('next', {}).get('url')
        params = None
    return repositories

# Function to retrieve repositories by organization
def get_repositories_by_organization(organization):
    """Return the repositories GitHub lists for ``organization``.

    The endpoint is paginated, so a single request only yields the first
    page.  This follows the ``Link: rel="next"`` response header until no
    further page is advertised.

    Args:
        organization: GitHub organization login, e.g. ``"hubmapconsortium"``.

    Returns:
        A list of repository dicts (all pages concatenated), or ``[]`` after
        printing a message if any request does not return 200.

    Raises:
        requests.exceptions.RequestException: if a request cannot be completed
            or exceeds the timeout.
    """
    url = f'https://api.github.com/orgs/{organization}/repos'
    params = {'per_page': 100}
    repositories = []
    while url:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error occurred while accessing repositories for organization: {organization}")
            return []
        repositories.extend(response.json())
        # The "next" URL already carries the full query string.
        url = response.links.get('next', {}).get('url')
        params = None
    return repositories

def get_default_branch(repository):
    """Return the name of the repository's default branch.

    If the given dict already has a non-empty ``default_branch`` entry it is
    returned directly, which avoids spending an extra (rate-limited) API
    request.  Otherwise the repository's API ``url`` is fetched.

    Args:
        repository: Repository dict; ``default_branch`` is used when present,
            otherwise ``url`` (and ``name`` for the error message) are read.

    Returns:
        The default branch name, or ``""`` after printing a message if the
        lookup request does not return 200.

    Raises:
        requests.exceptions.RequestException: if the lookup request cannot be
            completed or exceeds the timeout.
    """
    known_branch = repository.get('default_branch')
    if known_branch:
        return known_branch

    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(repository['url'], timeout=30)
    if response.status_code == 200:
        return response.json()['default_branch']
    print(f"Error occurred while accessing the default branch for repository: {repository['name']}")
    return ""
    
def get_repository(repo): # organization/repo
    """Fetch a single repository's metadata from the GitHub API.

    Args:
        repo: Repository identifier in ``"organization/repo"`` form.

    Returns:
        The repository dict returned by the API, or ``{}`` after printing a
        message if the request does not return 200.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed or exceeds the timeout.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(f'https://api.github.com/repos/{repo}', timeout=30)
    if response.status_code != 200:
        print(f"Error occurred while accessing repository: {repo}")
        return {}
    return response.json()


# Function to retrieve repository dict matching the search criteria
def get_repositories(search_criteria):
    """Collect the repository dicts matching ``search_criteria``.

    Args:
        search_criteria: Mapping of criterion kind to a list of values.
            Recognised kinds are ``"topic"``, ``"organization"`` and
            ``"repo"`` (``"organization/repo"`` names); other keys are
            ignored.

    Returns:
        A list of repository dicts in discovery order.  Empty results (the
        ``{}`` that ``get_repository`` returns on failure) are skipped, and a
        repository reached through more than one criterion is returned once.
    """
    repositories = []
    seen_full_names = set()

    for key, values in search_criteria.items():
        if key not in ("topic", "organization", "repo"):
            continue
        for value in values:
            if key == "topic":
                found = get_repositories_by_topic(value)
            elif key == "organization":
                found = get_repositories_by_organization(value)
            else:
                found = [get_repository(value)]

            for repository in found:
                # Skip failure placeholders; downstream code indexes
                # repo["full_name"] and would raise KeyError on {}.
                if not repository:
                    continue
                full_name = repository.get("full_name")
                if full_name is not None:
                    if full_name in seen_full_names:
                        continue
                    seen_full_names.add(full_name)
                repositories.append(repository)
    return repositories


def repo_to_loader(repo):
    """Clone a GitHub repository if needed and return a loader for its Markdown files.

    Args:
        repo: Repository dict; ``full_name`` and ``html_url`` are read here,
            and the dict is passed to ``get_default_branch`` when a clone is
            required.

    Returns:
        A ``DirectoryLoader`` over ``**/*.md`` below the local clone
        (``data_path / github_path / full_name``).  Nothing is read from disk
        until the caller invokes ``load()`` on it.
    """
    repo_path = data_path / github_path / repo["full_name"]

    if not repo_path.exists():
        # Resolve the branch only when cloning, so an existing clone costs no
        # GitHub API request.
        repo_branch = get_default_branch(repo)
        cloned_repo = Repo.clone_from(repo["html_url"], to_path=repo_path)
        # get_default_branch() returns "" on failure; `git checkout ""` would
        # raise, and a fresh clone is already on the remote's HEAD branch.
        if repo_branch:
            cloned_repo.git.checkout(repo_branch)

    # Each Markdown file is read as plain text via TextLoader.
    return DirectoryLoader(repo_path, glob="**/*.md", use_multithreading=True, loader_cls=TextLoader)



In [45]:

# documents = []
# for repo in get_repositories(github_relevant):
#     loader = repo_to_loader(repo)
#     documents.extend(loader.load())

# docs = Parallel(n_jobs=-1)(delayed(repo_to_loader)(repo) for repo in get_repositories(github_relevant))
# One loader per matching repository, then every loader's documents flattened
# into a single list.
docs = list(map(repo_to_loader, get_repositories(github_relevant)))
loaded_docs = [
    document
    for repo_loader in docs
    for document in repo_loader.load()
]


In [46]:
# Sanity check: how many loaders were built, and what the first one is.
print(len(docs))
docs[0]

1


<langchain.document_loaders.directory.DirectoryLoader at 0x118290d30>

In [47]:
# Keep only the document(s) whose source path is the software-docs FAQ page.
doc = list(
    filter(
        lambda candidate: candidate.metadata["source"] == "data/github/hubmapconsortium/software-docs/docs/faq.md",
        loaded_docs,
    )
)

In [48]:
# Show the raw text of the first matching FAQ document.
doc[0].page_content

'---\nlayout: page\n---\n# HuBMAP Data Release Frequently Asked Questions\n\n## General Questions\n\n<details>\n<summary>What is HuBMAP & what are its goals?</summary>\n\nBetter insights into the principles governing tissue organization-function relationship will potentially lead to better understanding of the significance of normal inter-individual variability and changes across the lifespan, and inform about the emergence of disease at the biomolecular level before the appearance of clinical symptoms. Despite vastly improved imaging and omics technologies and many important foundational discoveries, our understanding of how tissues are organized is restricted by remaining main challenges: 1) integrating high content, high resolution spatial and omics information to comprehensively profile biomolecular distribution and morphology of tissues in a high throughput manner and 2) placing this information into 3D tissue maps amenable to modelling.\n\nThe vision for the Human BioMolecular At

In [49]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
# NOTE: this rebinds `docs` to the list of chunks produced by the splitter.
docs = text_splitter.split_documents(documents=loaded_docs)
print(f"Number of documents: {len(docs)}")

# Embedding backend used for the vector store.
# Alternative: embeddings = FakeEmbeddings(size=10)
embeddings = OpenAIEmbeddings()

Number of documents: 659


In [52]:
software_docs_col = "hubmapconsortium_software-docs"

# Open (or create) the persisted collection before deciding whether to embed.
vectordb = Chroma(
    persist_directory=str(chroma_perist_directory),
    embedding_function=embeddings,
    collection_name=software_docs_col,
)

# Chroma.from_documents() adds to whatever the named collection already holds,
# so re-running this cell against an existing persist directory would store
# (and pay to embed) another full copy of `docs`.  Only embed when the
# collection is still empty; remove the persist directory to force a rebuild.
# NOTE(review): `_collection` is a private attribute of LangChain's Chroma
# wrapper -- confirm it is available in the installed version.
if vectordb._collection.count() == 0:
    vectordb.add_documents(docs)

vectordb.persist()
# Drop the reference so the store is reopened from disk by the next cell.
vectordb = None

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb.db.duckdb:loaded in 478 embeddings
loaded in 478 embeddings
loaded in 478 embeddings
INFO:chromadb.db.duckdb:loaded in 1 collections
loaded in 1 collections
loaded in 1 collections
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": [[2, 22546, 33, 18082, 4476, 45565, 271, 2028, 12827, 5727, 279, 9904, 21685, 4669, 

In [54]:
# Reopen the persisted collection from disk.
vectordb = Chroma(
    collection_name=software_docs_col,
    embedding_function=embeddings,
    persist_directory=str(chroma_perist_directory),
)


INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb.db.duckdb:loaded in 1137 embeddings
loaded in 1137 embeddings
loaded in 1137 embeddings
INFO:chromadb.db.duckdb:loaded in 2 collections
loaded in 2 collections
loaded in 2 collections
INFO:chromadb.db.duckdb:collection with name hubmapconsortium_software-docs already exists, returning existing collection
collection with name hubmapconsortium_software-docs already exists, returning existing collection
collection with name hubmapconsortium_software-docs already exists, returning existing collection


In [59]:
# Query the vector store; the result is a list of (Document, score) pairs.
vectordb.similarity_search_with_score("How do I create multiple samples using the hubmap SDK python lib?")
# Other example queries:
# vectordb.similarity_search_with_score("What is the meaning of the grouping_code field in the donor metadata?")
# vectordb.similarity_search_with_score("How can I query HuBMAP data?")

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": ["How do I create multiple samples using the hubmap SDK python lib?"], "encoding_format": "base64"}' message='Post details'
api_version=None data='{"input": ["How do I create multiple samples using the hubmap SDK python lib?"], "encoding_format": "base64"}' message='Post details'
api_version=None data='{"input": ["How do I create multiple samples using the hubmap SDK python lib?"], "encoding_format": "base64"}' message='Post details'
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=506 request_id=59e39a5

[(Document(page_content='* [Entity SDK](/sdk/entitysdk.html)\n* [Search SDK](/sdk/searchsdk.html)\n\n### Demo the HuBMAP SDK with Jupyter Notebook\n\nDownload an interactive [Jupyter Notebook Tutorial](/sdk/Hubmap-Sdk.ipynb)\n\nTo get started wtih Jupyter Notebooks, refer to the [Jupyter Documentation](https://docs.jupyter.org/en/latest/)', metadata={'source': 'data/github/hubmapconsortium/software-docs/docs/sdk/hubmapsdk.md'}),
  0.3905152976512909),
 (Document(page_content='---\nlayout: home\n---\n\n# HuBMAP Software Documentation\n\nHuBMAP offers several software and tool components for programatic access to HuBMAP data and metadata.  This site contains information about this software and tools, more information is available at the [HuBMAP Consortium Site](https://hubmapconsortium.org) or the [HuBMAP Data Portal](https://portal.hubmapconsortium.org)\n\nFor an overview of HuBMAP data visit the [Data Overview](/data-sankey/index.html) page.\n\n### APIs\nThere are several APIs which ca