In [1]:
import json
import os
from src.data_processing.indexing.embeddings_handler import (
    get_embeddings,
    get_model_info,
    save_embeddings_details_to_json,
    load_embeddings_details_from_json,
    find_most_similar,
    save_to_csv
)
import numpy as np
from typing import List, Union, Dict, Any

In [2]:
def generate_embedding_details(
        chunk_dir: str,
        annotations_models: List[str],
        embedding_models: List[str],
        output_file_path: str,
        article_summary: Union[str, None] = None
) -> List[Dict[str, Any]]:
    """
    Create embeddings for the chunk files in ``chunk_dir`` and save them.

    For every annotation model, each ``*.json`` file in ``chunk_dir`` whose
    name contains the annotation model name is loaded as a list of chunk
    dicts. The ``"merged_text"`` of every chunk is embedded with each
    embedding model, once prefixed with the article summary and once as-is.
    One record is produced per (file, embedding model, with/without summary)
    combination; all records are passed to ``save_embeddings_details_to_json``
    together with ``output_file_path`` and are also returned.

    Args:
        chunk_dir: Directory that contains the chunk JSON files.
        annotations_models: Annotation model names; a file is processed for a
            model when the model name is a substring of the file name.
        embedding_models: Names handed to ``get_model_info``; element 0 of its
            result is used as the model name and element 1 as the token limit.
        output_file_path: Destination passed to
            ``save_embeddings_details_to_json``.
        article_summary: Text prepended to each chunk for the "with summary"
            variant. When omitted, a global named ``article_summary`` is used
            if one exists; if none is found, only the "without summary"
            variant is generated.

    Returns:
        The list of embedding-detail records that was saved.

    Raises:
        OSError: If ``chunk_dir`` or one of its chunk files cannot be read.
        json.JSONDecodeError: If a chunk file is not valid JSON.
        KeyError: If a chunk has no ``"merged_text"`` entry.
    """
    if article_summary is None:
        # Earlier revisions read ``article_summary`` straight from the
        # notebook's global namespace; keep honouring that implicit input so
        # existing cells continue to work.
        article_summary = globals().get("article_summary")
    if article_summary is None:
        print("No article summary available; skipping embeddings with article summary")

    all_embedding_details: List[Dict[str, Any]] = []

    # Read the chunks and create embeddings:
    for annotation_model in annotations_models:
        print("Processing for Annotation Model: ", annotation_model)
        # os.listdir returns entries in arbitrary order; sort so that the
        # order of the generated records is reproducible between runs.
        for cur_file in sorted(os.listdir(chunk_dir)):
            if not (cur_file.endswith(".json") and annotation_model in cur_file):
                continue

            input_file_path = os.path.join(chunk_dir, cur_file)
            print(f"Processing {input_file_path}")
            # Only keep the file open while parsing, not while embedding.
            with open(input_file_path, "r", encoding="utf-8") as f:
                chunks = json.load(f)

            merged_texts_without_sum = [chunk["merged_text"] for chunk in chunks]

            # (contains_summary flag, wording for the log line, texts to embed)
            variants = []
            if article_summary is not None:
                merged_texts_with_sum = [
                    f"Summary:\n{article_summary}\nText:\n{merged_text}"
                    for merged_text in merged_texts_without_sum
                ]
                variants.append((True, "with", merged_texts_with_sum))
            variants.append((False, "without", merged_texts_without_sum))

            for contains_summary, wording, texts in variants:
                for embedding_model in embedding_models:
                    print(f"Processing for Embedding Model: {embedding_model} {wording} article summary")
                    model_info = get_model_info(embedding_model)
                    embeddings = get_embeddings(
                        model_name=model_info[0],
                        token_limit=model_info[1],
                        texts=texts
                    )

                    all_embedding_details.append({
                        "file": cur_file,
                        "chunks_count": len(texts),
                        "annotation_model": annotation_model,
                        "embeddings_model": embedding_model,
                        "embeddings_model_token_limit": model_info[1],
                        "contains_summary": contains_summary,
                        "embeddings": embeddings
                    })

    # Write the Embeddings to a file:
    save_embeddings_details_to_json(all_embedding_details, output_file_path)
    return all_embedding_details


In [None]:
{
        "chunk_sequence": "35",
        "merged_text": " than EGFR mutant clones in healthy lungs of ever-smokers. In summary, 54 out of 295 (18%) of samples of non-cancerous lung tissue harboured an EGFR driver mutation, and 43 out of 81 (53%) samples of non-cancerous lung tissue harboured a KRAS driver mutation. No associations between EGFR or KRAS mutation in non-cancerous tissue and smoking status or cancer diagnosis were observed (Supplementary Table 9). To address whether oncogenic mutations accumulate with the natural ageing process, we examined the driver mutation frequency in all 31 genes (including EGFR and KRAS) present in the Duplex-seq panel. We limited this analysis to 17 never-smoker individuals from the PEACE study to control for any effect of smoking. Consistent with previous work37,38, there was a significant correlation between age and mutation count (Fig. 4e).    \n\nAnnotations:\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 7448\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 7586\nText Length - 4\n\nText - KRAS\nType - Gene\nNCBI Gene - 3845\nText Offset - 7680\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 7726\nText Length - 4\n\nText - KRAS\nType - Gene\nNCBI Gene - 3845\nText Offset - 7734\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 8002\nText Length - 4\n\nText - KRAS\nType - Gene\nNCBI Gene - 3845\nText Offset - 8011\nText Length - 4\n\n",
        "chunk_text": " than EGFR mutant clones in healthy lungs of ever-smokers. In summary, 54 out of 295 (18%) of samples of non-cancerous lung tissue harboured an EGFR driver mutation, and 43 out of 81 (53%) samples of non-cancerous lung tissue harboured a KRAS driver mutation. No associations between EGFR or KRAS mutation in non-cancerous tissue and smoking status or cancer diagnosis were observed (Supplementary Table 9). To address whether oncogenic mutations accumulate with the natural ageing process, we examined the driver mutation frequency in all 31 genes (including EGFR and KRAS) present in the Duplex-seq panel. We limited this analysis to 17 never-smoker individuals from the PEACE study to control for any effect of smoking. Consistent with previous work37,38, there was a significant correlation between age and mutation count (Fig. 4e).    ",
        "chunk_annotations": [
            {
                "id": "228",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 7448,
                "length": 4
            },
            {
                "id": "229",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 7586,
                "length": 4
            },
            {
                "id": "230",
                "text": "KRAS",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "3845",
                "offset": 7680,
                "length": 4
            },
            {
                "id": "231",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 7726,
                "length": 4
            },
            {
                "id": "232",
                "text": "KRAS",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "3845",
                "offset": 7734,
                "length": 4
            },
            {
                "id": "233",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 8002,
                "length": 4
            },
            {
                "id": "234",
                "text": "KRAS",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "3845",
                "offset": 8011,
                "length": 4
            }
        ],
        "payload": {
            "chunk_id": "791e634b-6f5e-4b77-b85d-b0479cde3e74",
            "chunk_name": "PMC_7614604_chunk_35",
            "chunk_length": 840,
            "token_count": 129,
            "chunk_annotations_count": 7,
            "chunk_annotations_ids": [
                "228",
                "229",
                "230",
                "231",
                "232",
                "233",
                "234"
            ],
            "chunk_annotations_types": [
                "Gene"
            ],
            "chunk_offset": 7442,
            "chunk_infons": {
                "type": "Oncogenic mutations in healthy lung"
            },
            "chunker_type": "sliding_window",
            "merger_type": "append",
            "aioner_model": "bioformer",
            "gnorm2_model": "bioformer",
            "article_id": "PMC_7614604"
        }
    },
    {
        "chunk_sequence": "36",
        "merged_text": " 4e).    ",
        "chunk_text": " 4e).    ",
        "chunk_annotations": [],
        "payload": {
            "chunk_id": "0f3d732e-f960-4d76-8f90-ceb978d54c76",
            "chunk_name": "PMC_7614604_chunk_36",
            "chunk_length": 9,
            "token_count": 1,
            "chunk_annotations_count": 0,
            "chunk_annotations_ids": [],
            "chunk_annotations_types": [],
            "chunk_offset": 8273,
            "chunk_infons": {
                "type": "Oncogenic mutations in healthy lung"
            },
            "chunker_type": "sliding_window",
            "merger_type": "append",
            "aioner_model": "bioformer",
            "gnorm2_model": "bioformer",
            "article_id": "PMC_7614604"
        }
    },
    {
        "chunk_sequence": "37",
        "merged_text": "      Discussion      In this study, we explored the paradigm of tumour promotion driven by the air pollutant PM in the development of lung cancer. We build on previous studies proposing that engine exhaust39 and air pollution40 induce lung tumours through genotoxicity, induction of oxidative stress and inflammation. We propose that PM can trigger the expansion of pre-existing mutant lung cells through an inflammatory axis that may be amenable to therapy to limit the risk of tumour promotion. Extending previous findings that established associations between air pollution and lung cancer18,41, including in LCINS6, we found an association between the frequency of EGFR mutant lung cancer incidence and increasing PM2.5 levels. Temporal analysis suggested that 3 years of PM2.5 exposure may be sufficient to increase the risk of developing EGFR-driven lung cancer. A limitation of our epidemiological analysis is its ecological nature: using aggregate data instead of participant-level data. We also acknowledge that variables such as female sex, Asian ancestry and adenocarcinoma histology, which are associated with EGFR mutation status, may confound our conclusions. We balanced our study cohorts with respect to sex and covered geographically and ethnically distinct populations, and when restricting the analysis to LUAD in the English cohort, the positive association remained significant. This study suggests that PM exposure contributes to the observed geographical disparities of EGFR-driven lung cancer, in addition to other established intrinsic (for example, germline genetics5) and extrinsic (for example, occupational exposure3) factors, and it will be important to understand how these factors interact to increase risk. 
We observed that PM induces an\n\nAnnotations:\nText - participant\nType - Species\nNCBI Taxonomy - 9606\nText Offset - 973\nText Length - 11\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 670\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 845\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 1956\nText Offset - 1123\nText Length - 4\n\nText - EGFR\nType - Gene\nNCBI Gene - 13649\nText Offset - 1494\nText Length - 4\n\n",
        "chunk_text": "      Discussion      In this study, we explored the paradigm of tumour promotion driven by the air pollutant PM in the development of lung cancer. We build on previous studies proposing that engine exhaust39 and air pollution40 induce lung tumours through genotoxicity, induction of oxidative stress and inflammation. We propose that PM can trigger the expansion of pre-existing mutant lung cells through an inflammatory axis that may be amenable to therapy to limit the risk of tumour promotion. Extending previous findings that established associations between air pollution and lung cancer18,41, including in LCINS6, we found an association between the frequency of EGFR mutant lung cancer incidence and increasing PM2.5 levels. Temporal analysis suggested that 3 years of PM2.5 exposure may be sufficient to increase the risk of developing EGFR-driven lung cancer. A limitation of our epidemiological analysis is its ecological nature: using aggregate data instead of participant-level data. We also acknowledge that variables such as female sex, Asian ancestry and adenocarcinoma histology, which are associated with EGFR mutation status, may confound our conclusions. We balanced our study cohorts with respect to sex and covered geographically and ethnically distinct populations, and when restricting the analysis to LUAD in the English cohort, the positive association remained significant. This study suggests that PM exposure contributes to the observed geographical disparities of EGFR-driven lung cancer, in addition to other established intrinsic (for example, germline genetics5) and extrinsic (for example, occupational exposure3) factors, and it will be important to understand how these factors interact to increase risk. We observed that PM induces an",
        "chunk_annotations": [
            {
                "id": "235",
                "text": "participant",
                "type": "Species",
                "ncbi_label": "NCBI Taxonomy",
                "ncbi_id": "9606",
                "offset": 973,
                "length": 11
            },
            {
                "id": "240",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 670,
                "length": 4
            },
            {
                "id": "241",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 845,
                "length": 4
            },
            {
                "id": "242",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "1956",
                "offset": 1123,
                "length": 4
            },
            {
                "id": "243",
                "text": "EGFR",
                "type": "Gene",
                "ncbi_label": "NCBI Gene",
                "ncbi_id": "13649",
                "offset": 1494,
                "length": 4
            }
        ],
        "payload": {
            "chunk_id": "7a78f15d-c452-4920-97e6-c5e7e84f8681",
            "chunk_name": "PMC_7614604_chunk_37",
            "chunk_length": 1771,
            "token_count": 256,
            "chunk_annotations_count": 5,
            "chunk_annotations_ids": [
                "235",
                "240",
                "241",
                "242",
                "243"
            ],
            "chunk_annotations_types": [
                "Gene",
                "Species"
            ],
            "chunk_offset": 0,
            "chunk_infons": {
                "type": "Discussion"
            },
            "chunker_type": "sliding_window",
            "merger_type": "append",
            "aioner_model": "bioformer",
            "gnorm2_model": "bioformer",
            "article_id": "PMC_7614604"
        }
    }