<a href="https://colab.research.google.com/github/hansheng-hsu/AI-chatbot-for-PDC/blob/main/RAG_with_full_JSON_20240728.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load library

In [1]:
#connect to google drive
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install langchain weaviate_client wasabi langchain_community llama-cpp-python langchain_weaviate

Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting weaviate_client
  Downloading weaviate_client-4.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.85.tar.gz (49.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langchain_weaviate
  Downloading langchain_weaviate-0.0.2-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain)
  Downloading langchain_core-0.2.26-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3

In [4]:
import requests
import json
import weaviate
from weaviate.embedded import EmbeddedOptions
from wasabi import msg
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the Data

In [5]:
# Step 1: Fetch the study and protocol data
study_protocol_url = "https://pdc.cancer.gov/graphql"
study_protocol_query = """
query($acceptDUA: Boolean!) {
  study(acceptDUA: $acceptDUA) {
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    project_id
    study_name
    program_name
    project_name
    disease_type
    primary_site
    analytical_fraction
    experiment_type
    embargo_date
    cases_count
    aliquots_count
  }
  protocolPerStudy {
    protocol_id
    protocol_submitter_id
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    program_submitter_id
    protocol_name
    protocol_date
  }
}
"""
variables = {
    "acceptDUA": True
}
response = requests.post(study_protocol_url, json={'query': study_protocol_query, 'variables': variables})
data = response.json()

# Step 2: Merge study and protocol data
merged_data = {}

for study in data['data']['study']:
    pdc_study_id = study['pdc_study_id']
    if pdc_study_id not in merged_data:
        merged_data[pdc_study_id] = study
    else:
        merged_data[pdc_study_id].update(study)

for protocol in data['data']['protocolPerStudy']:
    pdc_study_id = protocol['pdc_study_id']
    if pdc_study_id not in merged_data:
        merged_data[pdc_study_id] = protocol
    else:
        for key, value in protocol.items():
            if key not in merged_data[pdc_study_id]:
                merged_data[pdc_study_id][key] = value

# Step 3: Fetch publication data
publication_url = "https://pdc.cancer.gov/graphql"
publication_query = """
{
  getPaginatedUIPublication(offset: 0, limit: 60) {
    uiPublication {
      publication_id
      pubmed_id
      doi
      title
      journal
      journal_url
      year
      abstract
      studies {
        pdc_study_id
        submitter_id_name
      }
    }
  }
}
"""
pub_response = requests.post(publication_url, json={'query': publication_query})
pub_data = pub_response.json()

# Step 4: Create a mapping of pdc_study_id to publication info
publication_mapping = {}

for publication in pub_data['data']['getPaginatedUIPublication']['uiPublication']:
    for study in publication['studies']:
        pdc_study_id = study['pdc_study_id']
        if pdc_study_id not in publication_mapping:
            publication_mapping[pdc_study_id] = []
        publication_mapping[pdc_study_id].append({
            'publication_id': publication['publication_id'],
            'pubmed_id': publication['pubmed_id'],
            'doi': publication['doi'],
            'title': publication['title'],
            'journal': publication['journal'],
            'journal_url': publication['journal_url'],
            'year': publication['year'],
            'abstract': publication['abstract'],
        })

# Step 5: Add publication info to merged data
for pdc_study_id, publications in publication_mapping.items():
    if pdc_study_id in merged_data:
        merged_data[pdc_study_id]['publications'] = publications
    else:
        merged_data[pdc_study_id] = {'publications': publications}

# Convert merged_data back to a list if needed
merged_list = list(merged_data.values())

# Step 6: Fetch the list of PubMed IDs
publication_query_pubmed_ids = """
{
  getPaginatedUIPublication(offset: 0, limit: 60) {
    total
    uiPublication {
      pubmed_id
    }
  }
}
"""
pub_response_pubmed_ids = requests.post(publication_url, json={'query': publication_query_pubmed_ids})
response_json_pubmed_ids = pub_response_pubmed_ids.json()

# Extract the list of publications
publications = response_json_pubmed_ids['data']['getPaginatedUIPublication']['uiPublication']

# Extract pubmed_id from each publication and create a list
pubmed_ids = [publication['pubmed_id'] for publication in publications]

# Step 7: Fetch article JSON from PubMed API and extract text with sentence splitting
def fetch_article_json(pubmed_id):
    url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pubmed_id}/unicode"
    response = requests.get(url)
    response.raise_for_status()  # Check for request errors
    return response.json()

def extract_article_text(article_json):
    article_text = []
    for document in article_json[0]['documents']:
        for passage in document['passages']:
            if passage['infons'].get('section_type') != 'REF':
                article_text.append(passage['text'])
    return " ".join(article_text)

def split_article_text(article_text, chunk_size=256, chunk_overlap=64):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(article_text)

all_articles = []
for pubmed_id in pubmed_ids:
    try:
        article_json = fetch_article_json(pubmed_id)
        article_text = extract_article_text(article_json)
        article_text_split = split_article_text(article_text)
        all_articles.append({ 'pubmed_id': pubmed_id, 'full_text': article_text_split })
    except requests.exceptions.RequestException as e:
        all_articles.append({ 'pubmed_id': pubmed_id, 'full_text': 'Full text is not available in this publication' })
        print(f"Failed to fetch data for PubMed ID {pubmed_id}: {e}")

# Step 8: Merge all_articles with merged_list based on pubmed_id
pubmed_to_full_text = {article['pubmed_id']: article['full_text'] for article in all_articles}

for study in merged_list:
    if 'publications' in study:
        valid_publications = []
        for publication in study['publications']:
            pubmed_id = publication['pubmed_id']
            if pubmed_id in pubmed_to_full_text:
                publication['full_text'] = pubmed_to_full_text[pubmed_id]
                valid_publications.append(publication)
        study['publications'] = valid_publications

# Output the merged data
full_data = {
    study['pdc_study_id']: study for study in merged_list
}
print(json.dumps(full_data, indent=2))


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
          "\u03bcl of 50 mM HEPES (pH 8.5). Plate 10 \u03bcl of the diluted sample in triplicate per sample in a clear, flat-bottom 96-well microplate. Plate a 10-\u03bcl HPLC water blank and a 50 mM HEPES, pH 8.5, blank, each in triplicate. Plate eight wells of 10 \u03bcl each of an",
          "each in triplicate. Plate eight wells of 10 \u03bcl each of an 8-point BSA curve, ranging from 50 \u03bcg/ml to 2 mg/ml. Prepare the BCA reagent by mixing reagent B and reagent A in a 1:49 (vol/vol) ratio. \u25b2 CRITICAL STEP This solution must be made fresh",
          "ratio. \u25b2 CRITICAL STEP This solution must be made fresh immediately before use. Add 200 \u03bcl of freshly prepared BCA reagent to each well containing 10 \u03bcl of sample. Incubate the microplate at 37 \u00b0C for 30 min. Read the plate at 562 nm. We usually recover",
          "37 \u00b0C for 30 min. Read the plate at 562 nm. We usually recover 40\u201350% of the starting material at

# Connecting Weaviate

In [6]:
client = weaviate.Client(
  embedded_options=EmbeddedOptions(),
  additional_headers={
    "X-HuggingFace-Api-Key": "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"
  }
)

INFO:weaviate-client:Binary /root/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.26.1/weaviate-v1.26.1-Linux-amd64.tar.gz
INFO:weaviate-client:Started /root/.cache/weaviate-embedded: process ID 2157


# Create the Schema

In [7]:
from wasabi import msg

# Define a schema for the data
class_obj ={
    "classes": [
        {
            "class": "Document",
            "vectorizer": "text2vec-huggingface",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "model": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
                    "vectorizeClassName": False
                }
            },
            "properties": [
                {
                    "name": "study_id",
                    "dataType": ["string"],
                    "description": "A unique identifier for a study."
                },
                {
                    "name": "pdc_study_id",
                    "dataType": ["string"],
                    "description": "The PDC-specific study ID (e.g., PDC000121)."
                },
                {
                    "name": "study_submitter_id",
                    "dataType": ["string"],
                    "description": "An identifier submitted by the study’s submitter."
                },
                {
                    "name": "program_id",
                    "dataType": ["string"],
                    "description": "Identifier for the program associated with the study."
                },
                {
                    "name": "project_id",
                    "dataType": ["string"],
                    "description": "Identifier for the project within the program."
                },
                {
                    "name": "study_name",
                    "dataType": ["string"],
                    "description": "Name of the study."
                },
                {
                    "name": "program_name",
                    "dataType": ["string"],
                    "description": "Name of the program."
                },
                {
                    "name": "project_name",
                    "dataType": ["string"],
                    "description": "Name of the project."
                },
                {
                    "name": "disease_type",
                    "dataType": ["string"],
                    "description": "Type of disease being studied."
                },
                {
                    "name": "primary_site",
                    "dataType": ["string"],
                    "description": "Anatomic site of the tumor or disease."
                },
                {
                    "name": "analytical_fraction",
                    "dataType": ["string"],
                    "description": "The type of analytical fraction used (e.g., proteome, phosphoproteome)."
                },
                {
                    "name": "experiment_type",
                    "dataType": ["string"],
                    "description": "Type of labeling reagent used. Indicate number of channels."
                },
                {
                    "name": "embargo_date",
                    "dataType": ["date"],
                    "description": "Date when study data becomes publicly accessible (null if not applicable)."
                },
                {
                    "name": "cases_count",
                    "dataType": ["int"],
                    "description": "Number of cases in the study (null if not available)."
                },
                {
                    "name": "aliquots_count",
                    "dataType": ["int"],
                    "description": "Number of aliquots (samples) in the study (null if not available)."
                },
                {
                    "name": "protocol_id",
                    "dataType": ["string"],
                    "description": "Identifier for the experimental protocol."
                },
                {
                    "name": "protocol_submitter_id",
                    "dataType": ["string"],
                    "description": "Identifier submitted by the protocol submitter."
                },
                {
                    "name": "program_submitter_id",
                    "dataType": ["string"],
                    "description": "Identifier submitted by the program submitter."
                },
                {
                    "name": "protocol_name",
                    "dataType": ["string"],
                    "description": "Name of the experimental protocol."
                },
                {
                    "name": "protocol_date",
                    "dataType": ["date"],
                    "description": "Date protocol was created or updated."
                },
                {
                    "name": "publication_id",
                    "dataType": ["string"],
                    "description": "Publication identifier."
                },
                {
                    "name": "pubmed_id",
                    "dataType": ["string"],
                    "description": "PubMed identifier."
                },
                {
                    "name": "doi",
                    "dataType": ["string"],
                    "description": "Digital Object Identifier (DOI) for the publication."
                },
                {
                    "name": "author",
                    "dataType": ["text"],
                    "description": "Author(s) of the publication."
                },
                {
                    "name": "title",
                    "dataType": ["string"],
                    "description": "Title of the publication."
                },
                {
                    "name": "journal",
                    "dataType": ["string"],
                    "description": "Journal where the publication appeared."
                },
                {
                    "name": "journal_url",
                    "dataType": ["string"],
                    "description": "URL to the journal."
                },
                {
                    "name": "year",
                    "dataType": ["string"],
                    "description": "Year of publication."
                },
                {
                    "name": "abstract",
                    "dataType": ["text"],
                    "description": "Abstract of the publication."
                },
                {
                    "name": "full_text",
                    "dataType": ["text"],
                    "description": "Full text of the publication."
                }
            ]
        }
    ]
}


if not client.schema.exists("Document"):
    client.schema.create(class_obj)
    msg.warn(f"Document class was created because it didn't exist.")
else:
    # WARNING THIS DELETES ALL Document AND CREATES A NEW Document CLASS
    client.schema.delete_class("Document")
    msg.info(f"Document class was removed because it already exists")
    client.schema.create(class_obj)

[38;5;3m⚠ Document class was created because it didn't exist.[0m


In [8]:
def jprint(json_in):
    import json
    print(json.dumps(json_in, indent=2))
jprint(client.schema.get())

{
  "classes": [
    {
      "class": "Document",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-huggingface": {
          "model": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
          "vectorizeClassName": false
        }
      },
      "multiTenancyConfig": {
        "autoTenantActivation": false,
        "autoTenantCreation": false,
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "A unique identifier for a study.",
          "indexFilterable": true,
          "indexRangeFilters": false,
          "indexSearchable": true,
          "moduleConfig": {
            "text2vec-huggingface": {
              "skip": fal

# Import Data to Weaviate

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import logging
import numpy as np
import time

# Embed Text Data using neuml/pubmedbert-base-embeddings
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
model = AutoModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()


# Configure logging
logging.basicConfig(level=logging.INFO)
msg = logging.getLogger()

with client.batch as batch:
    batch.batch_size = 100
    for i, (pdc_study_id, doc) in enumerate(full_data.items()):
        msg.info(f"({i+1}/{len(full_data)}) Importing Study {doc.get('pdc_study_id', 'Unknown ID')}")

        # Handle multiple publications if present
        documents = []
        study_info = (
            f"PDC Study ID: {doc['pdc_study_id']}, "
            f"Study Submitter ID: {doc['study_submitter_id']}, "
            f"Program ID: {doc['program_id']}, "
            f"Project ID: {doc['project_id']}, "
            f"Study Name: {doc['study_name']}, "
            f"Program Name: {doc['program_name']}, "
            f"Project Name: {doc['project_name']}, "
            f"Disease Type: {doc['disease_type']}, "
            f"Primary Site: {doc['primary_site']}, "
            f"Analytical Fraction: {doc['analytical_fraction']}, "
            f"Experiment Type: {doc['experiment_type']}, "
            f"Embargo Date: {doc['embargo_date']}, "
            f"Cases Count: {doc['cases_count']}, "
            f"Aliquots Count: {doc['aliquots_count']}, "
            f"Protocol ID: {doc['protocol_id']}, "
            f"Protocol Submitter ID: {doc['protocol_submitter_id']}, "
            f"Program Submitter ID: {doc['program_submitter_id']}, "
            f"Protocol Name: {doc['protocol_name']}, "
            f"Protocol Date: {doc['protocol_date']}"
        )

        # Adding publication information if available
        if 'publications' in doc and doc['publications']:
            for pub in doc['publications']:
                pub_info = (
                    f"Publication ID: {pub['publication_id']}, "
                    f"PubMed ID: {pub['pubmed_id']}, "
                    f"DOI: {pub['doi']}, "
                    f"Title: {pub['title']}, "
                    f"Journal: {pub['journal']}, "
                    f"Journal URL: {pub['journal_url']}, "
                    f"Year: {pub['year']}, "
                    f"Abstract: {pub['abstract']} "
                )
                study_info += f", Publication: ({pub_info})"

        documents.append(study_info)
        embedding = generate_embedding(' '.join(documents))

        properties = {
            "study_id": doc.get("study_id", ""),
            "pdc_study_id": doc.get("pdc_study_id", ""),
            "study_submitter_id": doc.get("study_submitter_id", ""),
            "program_id": doc.get("program_id", ""),
            "project_id": doc.get("project_id", ""),
            "study_name": doc.get("study_name", ""),
            "program_name": doc.get("program_name", ""),
            "project_name": doc.get("project_name", ""),
            "disease_type": doc.get("disease_type", ""),
            "primary_site": doc.get("primary_site", ""),
            "analytical_fraction": doc.get("analytical_fraction", ""),
            "experiment_type": doc.get("experiment_type", ""),
            "cases_count": doc.get("cases_count", None),
            "aliquots_count": doc.get("aliquots_count", None),
            "text_to_embed": ' '.join(documents)
        }

        if 'publications' in doc and doc['publications']:
            for pub in doc['publications']:
                properties.update({
                    "publication_id": pub.get("publication_id", ""),
                    "pubmed_id": pub.get("pubmed_id", ""),
                    "doi": pub.get("doi", ""),
                    "author": pub.get("author", ""),
                    "title": pub.get("title", ""),
                    "journal": pub.get("journal", ""),
                    "journal_url": pub.get("journal_url", ""),
                    "year": pub.get("year", ""),
                    "abstract": pub.get("abstract", "")
                })

        client.batch.add_data_object(properties, "Document", vector=embedding)
        msg.info("Data imported")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

# Check data import

In [10]:
where_filter= {
    "path":["text_to_embed"],
    "operator":"Like",
    "valueText":"*leukemia*"
}
result1 = (
    client.query
    .get("Document",["pdc_study_id", "study_name", "primary_site"])
    .with_limit(10)
    .with_where(where_filter)
    .do()
)
print(json.dumps(result1, indent=4))


{
    "data": {
        "Get": {
            "Document": [
                {
                    "pdc_study_id": "PDC000403",
                    "primary_site": "Hematopoietic and reticuloendothelial systems;Not Reported",
                    "study_name": "AML Ex Vivo Drug Response - Combination Treatment - Phosphoproteome"
                },
                {
                    "pdc_study_id": "PDC000402",
                    "primary_site": "Hematopoietic and reticuloendothelial systems;Not Reported",
                    "study_name": "AML Ex Vivo Drug Response - Combination Treatment - Proteome"
                },
                {
                    "pdc_study_id": "PDC000399",
                    "primary_site": "Hematopoietic and reticuloendothelial systems;Not Reported",
                    "study_name": "AML Ex Vivo Drug Response - Primary Cohort - Phosphoproteome"
                },
                {
                    "pdc_study_id": "PDC000398",
                    "pri

In [None]:
def retrieve_documents(client, query_text, top_k=5):
    query_embedding = generate_embedding(query_text)

    # Define the properties to retrieve
    properties = [
        "study_id", "pdc_study_id", "study_submitter_id", "study_name",
        "program_name", "project_name", "disease_type", "primary_site",
        "analytical_fraction", "experiment_type", "cases_count",
        "aliquots_count", "pubmed_id", "doi", "author", "title",
        "journal", "journal_url", "year", "abstract", "text_to_embed"
    ]

    try:
        results = client.query.get(
            "Document",
            properties=properties
        ).with_near_vector({
            "vector": query_embedding,
            "certainty": 0.7
        }).with_limit(top_k).do()

        response = results['data']['Get']['Document']

        # Create a list of dictionaries with only the desired fields
        return [{'pdc_study_id': doc.get('pdc_study_id'), 'study_name': doc.get('study_name'), 'primary_site': doc.get('primary_site')} for doc in response]

    except Exception as e:
        logging.error(f"An error occurred while retrieving documents: {e}")
        return []

# Example usage
question = "Provide studies related to breast cancer."
retrieve_documents(client, query_text=question)


[{'pdc_study_id': 'PDC000526',
  'study_name': 'CPTAC PDAC Proteins in Serum - Proteome',
  'primary_site': 'Not Applicable;Pancreas'},
 {'pdc_study_id': 'PDC000325',
  'study_name': 'Microscaled Proteogenomic Methods for Precision Oncology DP1 Clinical Trial - Proteome',
  'primary_site': 'Breast;Not Reported'},
 {'pdc_study_id': 'PDC000329',
  'study_name': 'Microscaled Proteogenomic Methods for Precision Oncology PDX cores - Proteome',
  'primary_site': 'Not Reported'},
 {'pdc_study_id': 'PDC000327',
  'study_name': 'Microscaled Proteogenomic Methods for Precision Oncology PDX bulk - Proteome',
  'primary_site': 'Not Reported'},
 {'pdc_study_id': 'PDC000303',
  'study_name': 'Therapeutic Targets in Breast Cancer Xenografts -  Proteome',
  'primary_site': 'Not Reported'}]

# Load llm

In [11]:
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA, LLMChain

In [12]:
llm = LlamaCpp(
    model_path= "/content/drive/MyDrive/HIDS-7950/llm/BioMistral-7B.Q4_K_M.gguf",
    temperature=0.3,
    max_tokens=2048,
    n_ctx=16384,
    top_p=1)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /content/drive/MyDrive/HIDS-7950/llm/BioMistral-7B.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = hub
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.

# Set Prompt

In [13]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<<SYS>>
    You are a helpful assistant eager to assist with providing better search results.
    <</SYS>>

    [INST] Provide an answer to the following question. Ensure that the answer is informative, \
            relevant, and concise:
            {question}
    [/INST]""",
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are a helpful assistant eager to assist with providing better search results. \
        Provide an answer to the following question. Ensure that the answer is informative, \
        relevant, and concise: \
        {question}""",
)

QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)],
)

prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>>\n    You are a helpful assistant eager to assist with providing better search results.\n    <</SYS>>\n\n    [INST] Provide an answer to the following question. Ensure that the answer is informative,             relevant, and concise:\n            {question}\n    [/INST]')

# Test LLM

In [None]:
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "What is Acute Myeloid Leukemia?"
llm_chain.invoke({"question": question})

  warn_deprecated(

llama_print_timings:        load time =    3013.80 ms
llama_print_timings:      sample time =      59.24 ms /    93 runs   (    0.64 ms per token,  1569.99 tokens per second)
llama_print_timings: prompt eval time =   44381.35 ms /    83 tokens (  534.72 ms per token,     1.87 tokens per second)
llama_print_timings:        eval time =   57974.08 ms /    92 runs   (  630.15 ms per token,     1.59 tokens per second)
llama_print_timings:       total time =  102521.96 ms /   175 tokens


{'question': 'What is Acute Myeloid Leukemia?',
 'text': ' Acute myeloid leukemia (AML) is a type of cancer that affects the blood cells in the bone marrow . It is characterized by an overproduction of immature white blood cells, which accumulate in the bone marrow and interfere with the production of normal blood cells. The exact cause of AML is unknown, but it is thought to be related to genetic mutations that affect the development of the blood cells.'}

# Retriever

In [14]:
from langchain.retrievers import WeaviateHybridSearchRetriever

# Configuration of the retriever
retriever = WeaviateHybridSearchRetriever(
    client=client,
    index_name="Document",
    text_key="text_to_embed",
    attributes=[
        "study_id", "pdc_study_id", "study_submitter_id", "study_name",
        "program_name", "project_name", "disease_type", "primary_site",
        "analytical_fraction", "experiment_type", "cases_count",
        "aliquots_count", "pubmed_id", "doi", "author", "title",
        "journal", "journal_url", "year", "abstract", "text_to_embed"
    ],
    create_schema_if_missing=True,
)


# RAG chain

In [15]:
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

In [None]:
query = "What is the disease_type of PDC000220?"
qa.invoke(query)
#take 14 mins and correct



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    3013.80 ms
llama_print_timings:      sample time =      18.43 ms /    35 runs   (    0.53 ms per token,  1899.59 tokens per second)
llama_print_timings: prompt eval time =  823218.81 ms /  1872 tokens (  439.75 ms per token,     2.27 tokens per second)
llama_print_timings:        eval time =   24499.30 ms /    34 runs   (  720.57 ms per token,     1.39 tokens per second)
llama_print_timings:       total time =  847858.43 ms /  1906 tokens



[1m> Finished chain.[0m


{'query': 'What is the disease_type of PDC000220 ?',
 'result': ' The disease type for PDC000220 is Lung Adenocarcinoma;Lung Squamous Cell Carcinoma;Other.'}

In [None]:
query1 = "Do you have any kidney cancer studies in the PDC database?"
qa.invoke(query1)
#take 25 mins



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    3013.80 ms
llama_print_timings:      sample time =      85.04 ms /   146 runs   (    0.58 ms per token,  1716.80 tokens per second)
llama_print_timings: prompt eval time = 1395695.52 ms /  3099 tokens (  450.37 ms per token,     2.22 tokens per second)
llama_print_timings:        eval time =  112205.57 ms /   145 runs   (  773.83 ms per token,     1.29 tokens per second)
llama_print_timings:       total time = 1508340.73 ms /  3244 tokens



[1m> Finished chain.[0m


{'query': 'Do you have any kidney cancer studies in the PDC database?',
 'result': ' The PDC does not currently have any kidney cancer studies in the database. However, there are several ongoing studies that will be added to the database as soon as the data is made public. In the meantime, there are a number of other resources available for researchers interested in studying kidney cancer using proteomics data. One example is The Cancer Proteome Atlas (TCPA), which provides large-scale proteomic datasets from human cancers including kidney cancer. Another resource is the Clinical Proteomic Tumor Analysis Consortium (CPTAC), which has published several papers on kidney cancer using proteomics data. These resources may be helpful in designing future studies or analyzing existing data.'}

In [None]:
query2 = "Name studies in the PDC that focus on Metabolomics."
qa.invoke(query2)
#take 25 mins



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    3013.80 ms
llama_print_timings:      sample time =      12.08 ms /    20 runs   (    0.60 ms per token,  1656.31 tokens per second)
llama_print_timings: prompt eval time = 1434714.44 ms /  3040 tokens (  471.95 ms per token,     2.12 tokens per second)
llama_print_timings:        eval time =   15787.77 ms /    20 runs   (  789.39 ms per token,     1.27 tokens per second)
llama_print_timings:       total time = 1450595.12 ms /  3060 tokens



[1m> Finished chain.[0m


{'query': 'Name studies in the PDC that focus on Metabolomics.',
 'result': ' The following are some examples of studies in the PDC that focus on metabolomics:'}

In [None]:
query3 = "What is the experiment_type of PDC000403?"
qa.invoke(query3)
#take 27 mins and correct



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    3013.80 ms
llama_print_timings:      sample time =      10.46 ms /    19 runs   (    0.55 ms per token,  1817.31 tokens per second)
llama_print_timings: prompt eval time = 1630997.24 ms /  3408 tokens (  478.58 ms per token,     2.09 tokens per second)
llama_print_timings:        eval time =   15647.07 ms /    19 runs   (  823.53 ms per token,     1.21 tokens per second)
llama_print_timings:       total time = 1646741.76 ms /  3427 tokens



[1m> Finished chain.[0m


{'query': 'What is the experiment_type of PDC000403 ?',
 'result': ' The experiment_type of PDC000403 is TMT11'}

In [None]:
query4 = "How many studies are related to leukemia?"
qa.invoke(query4)
#take 27 mins



[1m> Entering new RetrievalQA chain...[0m



llama_print_timings:        load time =    3516.18 ms
llama_print_timings:      sample time =       2.70 ms /     4 runs   (    0.68 ms per token,  1481.48 tokens per second)
llama_print_timings: prompt eval time = 2035386.64 ms /  3835 tokens (  530.74 ms per token,     1.88 tokens per second)
llama_print_timings:        eval time =    2541.64 ms /     3 runs   (  847.21 ms per token,     1.18 tokens per second)
llama_print_timings:       total time = 2038411.88 ms /  3838 tokens



[1m> Finished chain.[0m


{'query': 'How many studies are related to leukemia?', 'result': ' 21'}

In [None]:
query5 = "How many cases and aliquots are there in study PDC000303?"
qa.invoke(query5)
# correct



[1m> Entering new RetrievalQA chain...[0m



llama_print_timings:        load time =   37129.41 ms
llama_print_timings:      sample time =      20.73 ms /    25 runs   (    0.83 ms per token,  1206.16 tokens per second)
llama_print_timings: prompt eval time = 1875669.23 ms /  3126 tokens (  600.02 ms per token,     1.67 tokens per second)
llama_print_timings:        eval time =   23405.78 ms /    24 runs   (  975.24 ms per token,     1.03 tokens per second)
llama_print_timings:       total time = 1899574.14 ms /  3150 tokens



[1m> Finished chain.[0m


{'query': 'How many cases and aliquots are there in study PDC000303?',
 'result': ' There are 27 cases and 27 aliquots in study PDC000303.'}