Install Weaviate

In [None]:
!pip install weaviate-client



In [None]:
import weaviate
from weaviate.embedded import EmbeddedOptions

In [None]:
# Read the Hugging Face token from the environment (or prompt for it) so the
# secret is never stored in the notebook. The token that used to be hard-coded
# here has been published and should be revoked.
import getpass
import os

huggingface_api_key = os.environ.get("HUGGINGFACE_API_KEY") or getpass.getpass(
    "Hugging Face API key: "
)

# Start an embedded Weaviate instance; the header is forwarded to the
# text2vec-huggingface vectorizer module.
client = weaviate.Client(
  embedded_options=EmbeddedOptions(),
  additional_headers={
    "X-HuggingFace-Api-Key": huggingface_api_key
  }
)

INFO:weaviate-client:Binary /root/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.26.1/weaviate-v1.26.1-Linux-amd64.tar.gz
INFO:weaviate-client:Started /root/.cache/weaviate-embedded: process ID 731


In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain)
  Downloading langchain_core-0.2.23-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.23->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m978.0 kB/s[0m eta [36m0:00:00[0m
Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-cor

Fetch data

In [None]:
import requests
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Fetch the study and protocol data
# One GraphQL request returns both the study list and the per-study protocol
# records; they are merged by pdc_study_id in Step 2.
study_protocol_url = "https://pdc.cancer.gov/graphql"  # The URL to fetch study and protocol data
study_protocol_query = """
query($acceptDUA: Boolean!) {
  study(acceptDUA: $acceptDUA) {
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    project_id
    study_name
    program_name
    project_name
    disease_type
    primary_site
    analytical_fraction
    experiment_type
    embargo_date
    cases_count
    aliquots_count
  }
  protocolPerStudy {
    protocol_id
    protocol_submitter_id
    study_id
    pdc_study_id
    study_submitter_id
    program_id
    program_submitter_id
    protocol_name
    protocol_date
    document_name
    quantitation_strategy
    experiment_type
    label_free_quantitation
    labeled_quantitation
    isobaric_labeling_reagent
    reporter_ion_ms_level
    starting_amount
    starting_amount_uom
    digestion_reagent
    alkylation_reagent
    enrichment_strategy
    enrichment
    chromatography_dimensions_count
    one_d_chromatography_type
    two_d_chromatography_type
    fractions_analyzed_count
    column_type
    amount_on_column
    amount_on_column_uom
    column_length
    column_length_uom
    column_inner_diameter
    column_inner_diameter_uom
    particle_size
    particle_size_uom
    particle_type
    gradient_length
    gradient_length_uom
    instrument_make
    instrument_model
    dissociation_type
    ms1_resolution
    ms2_resolution
    dda_topn
    normalized_collision_energy
    acquistion_type
    dia_multiplexing
    dia_ims
    analytical_technique
    chromatography_instrument_make
    chromatography_instrument_model
    polarity
    reconstitution_solvent
    reconstitution_volume
    reconstitution_volume_uom
    internal_standards
    extraction_method
    ionization_mode
  }
}
"""
variables = {
    "acceptDUA": True  # Accept DUA (Data Use Agreement) as a variable
}
response = requests.post(
    study_protocol_url,
    json={'query': study_protocol_query, 'variables': variables},
    timeout=120,  # never hang the notebook forever on a stalled connection
)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
data = response.json()
if not data.get('data'):
    # A GraphQL failure comes back without a usable "data" payload; report it
    # here rather than as a KeyError/TypeError in the merge step.
    raise RuntimeError(f"PDC study/protocol query returned no data: {data.get('errors')}")

# Step 2: Merge study and protocol data
# Records are keyed by pdc_study_id. Study fields overwrite an existing record;
# protocol fields only fill in keys that the record does not have yet.
merged_data = {}

for study in data['data']['study']:
    record = merged_data.setdefault(study['pdc_study_id'], study)
    if record is not study:
        record.update(study)

for protocol in data['data']['protocolPerStudy']:
    record = merged_data.setdefault(protocol['pdc_study_id'], protocol)
    for key, value in protocol.items():
        record.setdefault(key, value)

# Step 3: Fetch publication data
# NOTE(review): only the first page (offset 0, limit 50) is requested; if the
# service holds more publications they are not fetched here - confirm.
publication_url = "https://pdc.cancer.gov/graphql"  # The URL to fetch publication data
publication_query = """
{
  getPaginatedUIPublication(offset: 0, limit: 50) {
    uiPublication {
      publication_id
      pubmed_id
      doi
      author
      title
      journal
      journal_url
      year
      abstract
      studies {
        pdc_study_id
        submitter_id_name
      }
    }
  }
}
"""
pub_response = requests.post(
    publication_url,
    json={'query': publication_query},
    timeout=120,  # never hang the notebook forever on a stalled connection
)
pub_response.raise_for_status()  # same error handling as fetch_article_json
pub_data = pub_response.json()

# Step 4: Create a mapping of pdc_study_id to publication info
# Each study gets its own copy of the publication summary so that later
# per-study edits (e.g. attaching full_text) stay independent.
publication_fields = (
    'publication_id',
    'pubmed_id',
    'doi',
    'author',
    'title',
    'journal',
    'journal_url',
    'year',
    'abstract',
)
publication_mapping = {}

for publication in pub_data['data']['getPaginatedUIPublication']['uiPublication']:
    for study in publication['studies']:
        summary = {field: publication[field] for field in publication_fields}
        publication_mapping.setdefault(study['pdc_study_id'], []).append(summary)

# Step 5: Add publication info to merged data
for pdc_study_id, publications in publication_mapping.items():
    if pdc_study_id in merged_data:
        merged_data[pdc_study_id]['publications'] = publications
    else:
        # A publication can reference a study that the study/protocol query did
        # not return. Keep pdc_study_id on such records: the final step indexes
        # every record by study['pdc_study_id'] and would otherwise raise KeyError.
        merged_data[pdc_study_id] = {
            'pdc_study_id': pdc_study_id,
            'publications': publications,
        }

# Convert merged_data back to a list if needed
merged_list = list(merged_data.values())

# Step 6: Fetch the list of PubMed IDs
publication_query_pubmed_ids = """
{
  getPaginatedUIPublication(offset: 0, limit: 50) {
    total
    uiPublication {
      pubmed_id
    }
  }
}
"""
pub_response_pubmed_ids = requests.post(
    publication_url,
    json={'query': publication_query_pubmed_ids},
    timeout=120,  # never hang the notebook forever on a stalled connection
)
pub_response_pubmed_ids.raise_for_status()  # same error handling as fetch_article_json
response_json_pubmed_ids = pub_response_pubmed_ids.json()

# Extract the list of publications
publications = response_json_pubmed_ids['data']['getPaginatedUIPublication']['uiPublication']

# Extract pubmed_id from each publication and create a list
pubmed_ids = [publication['pubmed_id'] for publication in publications]

# Step 7: Fetch article JSON from PubMed API and extract text
def fetch_article_json(pubmed_id, timeout=30):
    """Download the BioC JSON document for one article.

    Args:
        pubmed_id: Identifier inserted into the BioC_json URL.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts
            or a non-success HTTP status.
    """
    url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pubmed_id}/unicode"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Check for request errors
    return response.json()

def extract_article_text(article_json):
    """Join the non-reference passages of a BioC JSON payload into one string.

    Args:
        article_json: Either a list of BioC collections or a single collection
            dict. Each collection holds ``documents``, each document holds
            ``passages`` with ``infons`` and ``text``.

    Returns:
        The passage texts separated by blank lines. Passages whose
        ``section_type`` is ``'REF'`` are skipped. An empty or ``None`` payload
        yields ``""``.
    """
    if not article_json:
        return ""

    # Accept a bare collection as well as the list form that was indexed with [0].
    collections = article_json if isinstance(article_json, list) else [article_json]

    article_text = []
    for collection in collections:
        for document in collection.get('documents') or []:
            for passage in document.get('passages') or []:
                # Missing metadata must not abort the whole article.
                infons = passage.get('infons') or {}
                if infons.get('section_type') == 'REF':
                    continue
                text = passage.get('text')
                if text is not None:
                    article_text.append(text)
    return "\n\n".join(article_text)

def split_article_text(article_text, chunk_size=256, chunk_overlap=64):
    """Split article text into overlapping chunks.

    Delegates to RecursiveCharacterTextSplitter configured with the given
    chunk_size and chunk_overlap and returns whatever split_text produces.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(article_text)

# Download, flatten and chunk every article. A failure for one PubMed ID must
# not stop the run, so failed articles get a placeholder text instead.
all_articles = []

for pubmed_id in pubmed_ids:
    try:
        article_json = fetch_article_json(pubmed_id)
        article_text = extract_article_text(article_json)
        article_text_split = split_article_text(article_text)
    except (
        requests.exceptions.RequestException,  # network / HTTP failures
        ValueError,                            # response body is not valid JSON
        KeyError, IndexError, TypeError,       # payload does not have the expected shape
    ) as e:
        all_articles.append({ 'pubmed_id': pubmed_id, 'full_text': 'Full text is not available in this publication' })
        print(f"Failed to fetch data for PubMed ID {pubmed_id}: {e}")
    else:
        all_articles.append({
            'pubmed_id': pubmed_id,
            'full_text': article_text_split
        })

# Step 8: Merge all_articles with merged_list based on pubmed_id
pubmed_to_full_text = {}
for article in all_articles:
    pubmed_to_full_text[article['pubmed_id']] = article['full_text']

for study in merged_list:
    if 'publications' not in study:
        continue

    # Keep only publications for which an article entry exists, and attach
    # that entry's text to them.
    valid_publications = []
    for publication in study['publications']:
        pubmed_id = publication['pubmed_id']
        if pubmed_id not in pubmed_to_full_text:
            continue
        publication['full_text'] = pubmed_to_full_text[pubmed_id]
        valid_publications.append(publication)
    study['publications'] = valid_publications

# Output the merged data
# Index every merged record by its pdc_study_id, then dump the result as JSON.
full_data = {}
for study_record in merged_list:
    full_data[study_record['pdc_study_id']] = study_record

print(json.dumps(full_data, indent=2))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          "sampling techniques on the proteomic findings. In the genomic analysis, effective enrichment of neoplastic cellularity revealed by high KRAS VAF scores was found in LMD as well as cored tumor tissues obtained from certain bulk tumor tissues containing",
          "tissues obtained from certain bulk tumor tissues containing different amount of cellularity. Knowledge of the effects and differences of sampling techniques warrant further investigation.",
          "Supplementary Information\n\nAbbreviations\n\nACN\n\nAcetonitrile\n\nAGC\n\nAutomatic gain control\n\nbRPLC\n\nBasic reversed-phase liquid chromatography\n\nCHMP\n\nChromatin-modifying protein/charged multivesicular body protein\n\nCPTAC",
          "CPTAC\n\nClinical proteomic tumor analysis consortium\n\nDDA\n\nData-dependent acquisition\n\nDTT\n\nDithiothreitol; FA: formic acid\n\nFDR\n\nFalse discovery rate\n\nH&E\n\nHematoxylin and eosin\n\nHCD\n\nC

Define Schema

In [None]:
from weaviate.classes.config import Configure, Property, DataType, Tokenization
from wasabi import msg

# Define a schema
# Each row is (property name, Weaviate data type, description); the rows are
# expanded into the property dictionaries of the "Document" class below.
document_property_specs = [
    ("study_id", "string", "A unique identifier for a study."),
    ("pdc_study_id", "string", "The PDC-specific study ID (e.g., PDC000121)."),
    ("study_submitter_id", "string", "An identifier submitted by the study’s submitter."),
    ("program_id", "string", "Identifier for the program associated with the study."),
    ("project_id", "string", "Identifier for the project within the program."),
    ("study_name", "string", "Name of the study."),
    ("program_name", "string", "Name of the program."),
    ("project_name", "string", "Name of the project."),
    ("disease_type", "string", "Type of disease being studied."),
    ("primary_site", "string", "Anatomic site of the tumor or disease."),
    ("analytical_fraction", "string", "The type of analytical fraction used (e.g., proteome, phosphoproteome)."),
    ("experiment_type", "string", "Type of labeling reagent used. Indicate number of channels."),
    ("cases_count", "int", "Number of cases in the study (null if not available)."),
    ("aliquots_count", "int", "Number of aliquots (samples) in the study (null if not available)."),
    ("publication_id", "string", "Publication identifier."),
    ("pubmed_id", "string", "PubMed identifier."),
    ("doi", "string", "Digital Object Identifier (DOI) for the publication."),
    ("author", "text", "Author(s) of the publication."),
    ("title", "string", "Title of the publication."),
    ("journal", "string", "Journal where the publication appeared."),
    ("journal_url", "string", "URL to the journal."),
    ("year", "number", "Year of publication."),
    ("abstract", "text", "Abstract of the publication."),
    ("full_text", "text", "Full text of the publication."),
    ("embedding", "blob", "Text embedding of the document."),
]

class_definition = {
    "classes": [
        {
            "class": "Document",
            "description": "A class called document",
            "vectorizer": "text2vec-huggingface",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "model": "sentence-transformers/all-MiniLM-L6-v2",
                    "vectorizeClassName": False,
                },
            },
            "properties": [
                {"name": name, "dataType": [data_type], "description": description}
                for name, data_type, description in document_property_specs
            ],
        },
    ],
}
# (Re)create the Document class: an existing class is dropped first so the
# stored schema always matches class_definition.
document_class_exists = client.schema.exists("Document")
if document_class_exists:
    client.schema.delete_class("Document")
    msg.warn("Document class existed and was removed.")

client.schema.create(class_definition)

if document_class_exists:
    msg.good("Document class was re-created successfully.")
else:
    msg.good("Document class was created successfully.")

[38;5;3m⚠ Document class existed and was removed.[0m
[38;5;2m✔ Document class was re-created successfully.[0m


In [None]:
def jprint(json_in):
    """Pretty-print a JSON-serialisable object with two-space indentation."""
    import json

    rendered = json.dumps(json_in, indent=2)
    print(rendered)
jprint(client.schema.get())

{
  "classes": [
    {
      "class": "Document",
      "description": "A class called document",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-huggingface": {
          "model": "sentence-transformers/all-MiniLM-L6-v2",
          "vectorizeClassName": false
        }
      },
      "multiTenancyConfig": {
        "autoTenantActivation": false,
        "autoTenantCreation": false,
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "A unique identifier for a study.",
          "indexFilterable": true,
          "indexRangeFilters": false,
          "indexSearchable": true,
          "moduleConfig": {
            "text2vec-huggingface": {


Import Data to Weaviate

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import logging
import numpy as np
import base64

# Load the tokenizer and model used by generate_embedding. The checkpoint name
# is the same one configured for text2vec-huggingface in the Document schema.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Configure logging at INFO level so the import progress messages are shown.
logging.basicConfig(level=logging.INFO)
# NOTE: from here on `msg` is the root logger; the schema cell bound the same
# name to the object imported from wasabi.
msg = logging.getLogger()

def generate_embedding(text):
    """Embed text with the module-level tokenizer/model using mean pooling.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array holding the mean of the token embeddings. Padding tokens
        are excluded from the mean via the attention mask, so texts embedded
        in a batch are not skewed by the padding added to shorter ones. For a
        single string (no padding) the result matches a plain mean over tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1.0 for real tokens, 0.0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    return (summed / token_counts).squeeze().numpy()

def encode_embedding(embedding):
    """Return the raw bytes of ``embedding`` as a base64-encoded UTF-8 string."""
    raw_bytes = embedding.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Study-level and publication-level fields copied verbatim into each object;
# a missing key is stored as an empty string.
study_text_fields = (
    "study_id",
    "pdc_study_id",
    "study_submitter_id",
    "program_id",
    "project_id",
    "study_name",
    "program_name",
    "project_name",
    "disease_type",
    "primary_site",
    "analytical_fraction",
    "experiment_type",
)
publication_text_fields = (
    "publication_id",
    "pubmed_id",
    "doi",
    "author",
    "title",
    "journal",
    "journal_url",
)

# One Document object is created per (study, publication) pair.
with client.batch as batch:
    batch.batch_size = 100
    study_total = len(full_data)
    for i, (pdc_study_id, doc) in enumerate(full_data.items()):
        msg.info(f"({i+1}/{study_total}) Importing Study {doc.get('pdc_study_id', 'Unknown ID')}")

        # The study part is identical for every publication of this study.
        study_properties = {field: doc.get(field, "") for field in study_text_fields}
        study_properties["cases_count"] = doc.get("cases_count", None)
        study_properties["aliquots_count"] = doc.get("aliquots_count", None)

        publications = doc.get("publications", [])
        for publication in publications:
            # full_text is a list of chunks when the article was fetched,
            # otherwise a plain string.
            full_text = publication.get("full_text", "")
            if isinstance(full_text, list):
                full_text = " ".join(full_text)

            embedding = generate_embedding(full_text)
            encoded_embedding = encode_embedding(embedding)

            # The schema declares year as a number; unparsable values become None.
            raw_year = publication.get("year")
            if raw_year is None:
                year = None
            else:
                try:
                    year = float(raw_year)
                except ValueError:
                    year = None
                    msg.warning(f"Invalid year value '{publication.get('year')}' encountered and set to None")

            properties = {
                **study_properties,
                **{field: publication.get(field, "") for field in publication_text_fields},
                "year": year,
                "abstract": publication.get("abstract", ""),
                "full_text": full_text,
                "embedding": encoded_embedding,
            }
            client.batch.add_data_object(properties, "Document")
            msg.info("Data imported")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
# Example: embed the full text of each publication of `doc` and print the
# embedding computed last.
# NOTE(review): `doc` is not defined in this cell - presumably the last study
# left over from the import loop; confirm before running in isolation.
publications = doc.get("publications", [])
for publication in publications:
    # full_text is passed to generate_embedding as stored (no joining of chunks here).
    full_text = publication.get("full_text", "")
    embedding = generate_embedding(full_text)
# Printed once, after the loop: only the final embedding is shown.
print(embedding)

[-9.41009149e-02 -1.89930767e-01 -1.43516019e-01 -8.77505690e-02
  2.77019113e-01 -1.12023115e-01 -4.57238853e-02  8.36083293e-02
  7.25025982e-02 -1.04528606e-01 -7.34966993e-02 -2.59311087e-02
 -6.23768494e-02  1.20297089e-01  6.30323496e-03  1.85521960e-01
 -1.71304256e-01  1.86843783e-01 -4.39400785e-02  8.39416534e-02
  2.59517640e-01 -1.62528008e-02  6.68800771e-02  2.25069433e-01
 -1.03203878e-02 -1.27455845e-01  1.36206210e-01 -8.90714582e-03
 -2.80195743e-01  6.10978380e-02  4.65099625e-02  7.65417814e-02
  2.51891837e-02  1.21539995e-01 -2.70181578e-02  7.54228532e-02
 -5.31649068e-02 -8.51387531e-02 -1.58204600e-01 -1.75579101e-01
  1.77027836e-01 -2.66491976e-02 -2.48716921e-02  1.17250718e-04
  9.92754400e-02 -1.86344951e-01 -4.58728559e-02  7.70096108e-02
  1.03084659e-02  1.86921030e-01 -6.19113520e-02 -6.67831302e-02
 -1.40255317e-01  1.02814391e-01  3.21376920e-02 -6.19568955e-03
  1.05215780e-01 -8.85296799e-03  1.77134201e-02 -6.21088073e-02
 -2.25320105e-02 -1.73517

Querying vector database

In [42]:
# Semantic search: the five Document objects closest to the concept
# "glioblastoma", returning only study id and publication title.
glioblastoma_query = (
    client.query
    .get("Document", ["pdc_study_id", "title"])
    .with_near_text({"concepts": ["glioblastoma"]})
    .with_limit(5)
)
result = glioblastoma_query.do()
print(json.dumps(result, indent=4))

{
    "data": {
        "Get": {
            "Document": [
                {
                    "pdc_study_id": "PDC000514",
                    "title": "Integrated proteogenomic characterization of glioblastoma evolution"
                },
                {
                    "pdc_study_id": "PDC000515",
                    "title": "Integrated proteogenomic characterization of glioblastoma evolution"
                },
                {
                    "pdc_study_id": "PDC000245",
                    "title": "Proteogenomic and metabolomic characterization of human glioblastoma"
                },
                {
                    "pdc_study_id": "PDC000205",
                    "title": "Proteogenomic and metabolomic characterization of human glioblastoma"
                },
                {
                    "pdc_study_id": "PDC000204",
                    "title": "Proteogenomic and metabolomic characterization of human glioblastoma"
                }
            ]


In [None]:
# Keyword search: up to five Document objects whose full_text matches the
# wildcard pattern "*pancreatic cancer*".
where_filter = dict(
    path=["full_text"],
    operator="Like",
    valueText="*pancreatic cancer*",
)
pancreatic_query = (
    client.query
    .get("Document", ["pdc_study_id", "title", "full_text"])
    .with_limit(5)
    .with_where(where_filter)
)
result1 = pancreatic_query.do()
print(json.dumps(result1, indent=4))

{
    "data": {
        "Get": {
            "Document": [
                {
                    "full_text": "A proteogenomic portrait of lung squamous cell carcinoma\n\nSummary Lung squamous cell carcinoma (LSCC) remains a leading cause of cancer death with few therapeutic options. We characterized the proteogenomic landscape of LSCC, providing a deeper exposition of LSCC biology with potential therapeutic implications. We of LSCC biology with potential therapeutic implications. We identify NSD3 as an alternative driver in FGFR1-amplified tumors and low-p63 tumors overexpressing the therapeutic target survivin. SOX2 is considered undruggable, but our analyses provide SOX2 is considered undruggable, but our analyses provide rationale for exploring chromatin modifiers such as LSD1 and EZH2 to target SOX2-overexpressing tumors. Our data support complex regulation of metabolic pathways by crosstalk between complex regulation of metabolic pathways by crosstalk between post-translational m

Load LLM

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install langchain_community
from langchain_community.llms import LlamaCpp

Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.10-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [None]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.83.tar.gz (49.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.83-cp310-cp310-linux_x86_64.whl size=2843660 sha256=cdd46bf28c9315d41a

In [None]:
model_path="/content/drive/MyDrive/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"

In [None]:
from langchain_community.llms import LlamaCpp

# Load the local GGUF model. The path comes from `model_path` (defined in the
# previous cell) so the model location is configured in exactly one place.
llm = LlamaCpp(
    model_path=model_path,
    n_ctx=8192,
    temperature=0.6,
    max_tokens=2048,
    top_p=1
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /content/drive/MyDrive/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = ..
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   7:                 llama.ro

Prompt setup

In [None]:
system_prompt = """
You are a knowledgeable AI assistant specialized in providing detailed information from the Proteomics Data Commons (PDC). Your goal is to help researchers and users find specific studies related to cancer, understand the details of these studies, and answer questions based on the PDC's extensive data repositories. Always provide precise, relevant, and up-to-date information.
"""
# Baseline without retrieval: the model only sees the system prompt and the
# user's question.
def generate_response(question):
    """Ask the LLM the question directly and return its stripped answer."""
    answer = llm.invoke(f"{system_prompt}\nUser: {question}\nAssistant:")
    return answer.strip()

# Example questions
questions = ["What is Study PDC000251?", "Provide studies linked with gastric cancer."]

# Generating responses for the given questions
for question in questions:
    print(f"Question: {question}")
    answer = generate_response(question)
    print("Response:", answer)
    print("---")


Question: What is Study PDC000251?


Llama.generate: prefix-match hit

llama_print_timings:        load time =   10954.98 ms
llama_print_timings:      sample time =     410.56 ms /   140 runs   (    2.93 ms per token,   341.00 tokens per second)
llama_print_timings: prompt eval time =    7137.24 ms /     2 tokens ( 3568.62 ms per token,     0.28 tokens per second)
llama_print_timings:        eval time =  140220.63 ms /   139 runs   ( 1008.78 ms per token,     0.99 tokens per second)
llama_print_timings:       total time =  143769.31 ms /   141 tokens
Llama.generate: prefix-match hit


Response: According to the Proteomics Data Commons (PDC), Study PDC000251 is a mass spectrometry-based study focused on understanding the protein expression profiles of breast cancer tissues.

This study was conducted by researchers at the University of California, San Francisco (UCSF) and published in the journal Nature Communications in 2019.

The study used high-resolution mass spectrometry to analyze the proteomes of breast cancer tissues. The researchers identified over 10,000 proteins with high confidence and found that many of these proteins were differentially expressed between healthy and tumor tissues.

The study's findings have important implications for our understanding of breast cancer biology and the development of new therapeutic strategies for this disease.
---
Question: Provide studies linked with gastric cancer.



llama_print_timings:        load time =   10954.98 ms
llama_print_timings:      sample time =     673.80 ms /   245 runs   (    2.75 ms per token,   363.61 tokens per second)
llama_print_timings: prompt eval time =    7040.26 ms /    10 tokens (  704.03 ms per token,     1.42 tokens per second)
llama_print_timings:        eval time =  239415.73 ms /   244 runs   (  981.21 ms per token,     1.02 tokens per second)
llama_print_timings:       total time =  247756.10 ms /   254 tokens


Response: I'd be happy to help! According to the Proteomics Data Commons (PDC), there are several studies related to gastric cancer.

Here are a few examples:

1. "Proteomic analysis of gastric cancer" - This study published in the journal Cancer Research used mass spectrometry to identify proteins that were differentially expressed between normal and cancerous gastric tissues.
2. "Gastric cancer biomarkers identified by proteomics" - This study published in the Journal of Proteome Research used a combination of mass spectrometry and bioinformatics tools to identify potential biomarkers for gastric cancer.
3. "Proteomic analysis of the tumor microenvironment in gastric cancer" - This study published in the journal Cancer & Metabolism used mass spectrometry to analyze the proteome of the tumor microenvironment in gastric cancer, with a focus on identifying potential therapeutic targets.

These studies demonstrate the power of proteomics in understanding the biology and behavior of gastr

RAG approach

In [None]:
def retrieve_documents(client, query_text, top_k=5, certainty=0.7):
    """Return the stored studies most similar to ``query_text``.

    The query text is embedded with ``generate_embedding`` and used for a
    near-vector search over the ``Document`` class.

    Args:
        client: Weaviate client exposing the ``query.get(...)`` builder API.
        query_text: Free-text question to embed and search with.
        top_k: Maximum number of documents to return.
        certainty: Minimum certainty passed to the near-vector filter.

    Returns:
        A list of dicts, one per hit, each holding the keys
        ``pdc_study_id`` and ``abstract`` (``None`` when a hit lacks the
        property). The list is empty when nothing matches.

    Raises:
        RuntimeError: If the response carries a non-empty ``errors`` entry.
    """
    query_embedding = generate_embedding(query_text)
    # Request only the properties that end up in the return value; pulling
    # every property (full text, stored embedding, ...) for each hit just
    # to discard it makes the response needlessly large.
    results = (
        client.query.get(
            "Document",
            properties=["pdc_study_id", "abstract"],
        )
        .with_near_vector({
            "vector": query_embedding,
            "certainty": certainty,
        })
        .with_limit(top_k)
        .do()
    )
    # Surface query errors explicitly instead of failing later with an
    # opaque KeyError/TypeError while indexing into the response.
    if results.get("errors"):
        raise RuntimeError(f"Weaviate query failed: {results['errors']}")
    data = results.get("data") or {}
    documents = (data.get("Get") or {}).get("Document") or []
    # Create a list of dictionaries with only the desired fields
    return [
        {"pdc_study_id": doc.get("pdc_study_id"), "abstract": doc.get("abstract")}
        for doc in documents
    ]

def format_prompt(question, documents):
    """Assemble the LLM prompt from the system prompt, question and context.

    Each entry of ``documents`` must provide the keys ``pdc_study_id`` and
    ``abstract``; it contributes one context line, and the lines are joined
    with newlines. The module-level ``system_prompt`` is placed first,
    followed by the user question, the context and the ``Assistant:`` cue.
    """
    context_lines = []
    for doc in documents:
        context_lines.append(
            f"PDC Study ID: {doc['pdc_study_id']}, Abstract: {doc['abstract']}"
        )
    context = "\n".join(context_lines)
    return f"{system_prompt}\nUser: {question}\nContext: {context}\nAssistant:"

def generate_response(question):
    """Answer ``question`` with retrieved study context.

    Retrieves up to three documents for the question through the
    module-level ``client``, builds the prompt with ``format_prompt``,
    passes it to ``llm.invoke`` and returns the result with surrounding
    whitespace stripped.
    """
    context_docs = retrieve_documents(client, question, top_k=3)
    answer = llm.invoke(format_prompt(question, context_docs))
    return answer.strip()

# Demo run: send a couple of sample questions through the RAG pipeline and
# print each question with its answer, separated by a divider line.
questions = [
    "What is Study PDC000251?",
    "Provide studies linked with gastric cancer.",
]
for question in questions:
    print(f"Question: {question}")
    answer = generate_response(question)
    print("Response:", answer)
    print("---")

INFO:weaviate-client:Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
INFO:weaviate-client:Started /root/.cache/weaviate-embedded: process ID 62413


Question: What is Study PDC000251?


Llama.generate: prefix-match hit

llama_print_timings:        load time =   10954.98 ms
llama_print_timings:      sample time =     523.33 ms /   135 runs   (    3.88 ms per token,   257.96 tokens per second)
llama_print_timings: prompt eval time =  324589.29 ms /    23 tokens (14112.58 ms per token,     0.07 tokens per second)
llama_print_timings:        eval time =  185580.27 ms /   134 runs   ( 1384.93 ms per token,     0.72 tokens per second)
llama_print_timings:       total time =  205194.71 ms /   157 tokens
Llama.generate: prefix-match hit


Response: Study PDC000251 is a proteomics study published in the Proteomics Data Commons (PDC). The study was conducted by researchers at the University of California, San Francisco (UCSF) and aimed to identify protein biomarkers for cancer diagnosis.

The study used a combination of mass spectrometry-based proteomics and bioinformatics tools to analyze the protein profiles of cancer tissues and matched normal tissues. The results showed that several proteins were differentially expressed between cancer and normal tissues, suggesting potential biomarkers for cancer diagnosis.

Overall, Study PDC000251 provides valuable insights into the proteomic changes associated with cancer development and progression. (Source: Proteomics Data Commons)
---
Question: Provide studies linked with gastric cancer.



llama_print_timings:        load time =   10954.98 ms
llama_print_timings:      sample time =     388.55 ms /   142 runs   (    2.74 ms per token,   365.46 tokens per second)
llama_print_timings: prompt eval time =  339215.95 ms /   603 tokens (  562.55 ms per token,     1.78 tokens per second)
llama_print_timings:        eval time =  149002.77 ms /   141 runs   ( 1056.76 ms per token,     0.95 tokens per second)
llama_print_timings:       total time =  489043.66 ms /   744 tokens


Response: I'm happy to help! Based on your request, it seems like you're looking for studies related to gastric cancer.

Here are some relevant studies that might be of interest:

1. Study ID: PDC000214, Title: Proteogenomic Analysis of Diffuse Gastric Cancers.
2. Study ID: PDC000216, Title: Proteogenomic Analysis of Gastric Cancer.
3. Study ID: PDC000215, Title: Proteogenomic Analysis of Gastric Cancer.

These studies provide insights into the molecular mechanisms underlying gastric cancer development and progression.

Please let me know if you have any further questions or if there's anything else I can assist you with!
---
