## Clone the Private repo:

Please check the README file before executing this

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!mkdir -p /root/.ssh/

In [5]:
!cp /content/drive/MyDrive/deploy_keys/id_ed25519* /root/.ssh/

In [6]:
!ssh-keyscan github.com >> /root/.ssh/known_hosts

# github.com:22 SSH-2.0-babeld-7ce31352
# github.com:22 SSH-2.0-babeld-7ce31352
# github.com:22 SSH-2.0-babeld-7ce31352
# github.com:22 SSH-2.0-babeld-7ce31352
# github.com:22 SSH-2.0-babeld-7ce31352


In [7]:
!ssh -T git@github.com

Hi helmi0695/instadeep-llm-technical-test! You've successfully authenticated, but GitHub does not provide shell access.


In [8]:
!git clone git@github.com:helmi0695/instadeep-llm-technical-test.git

Cloning into 'instadeep-llm-technical-test'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 55 (delta 12), reused 50 (delta 7), pack-reused 0[K
Receiving objects: 100% (55/55), 252.36 KiB | 884.00 KiB/s, done.
Resolving deltas: 100% (12/12), done.


In [9]:
!ls

drive  instadeep-llm-technical-test  sample_data


In [10]:
%cd /content/instadeep-llm-technical-test

/content/instadeep-llm-technical-test


In [11]:
!ls

__init__.py  notebooks	README.md  ressources  src


In [12]:
!git pull

Already up to date.


# LLaMa 7B Chatbot in Hugging Face and LangChain - RAG

In this notebook we'll explore how we can use the open source **Llama-7b-chat** model using Hugging Face and LangChain.
To access Llama 2 models, one must first request access via [this form](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) (access is typically granted within a few hours).

We start by doing a `pip install` of all required libraries.

Note: given the fact that in Google Colab, logs created using logging module are not directly displayed in the output cell, I will use prints for the important info.

In [13]:
!pip install -qU \
    transformers==4.31.0 \
    sentence-transformers==2.2.2 \
    pinecone-client==2.2.2 \
    datasets==2.14.0 \
    accelerate==0.21.0 \
    einops==0.6.1 \
    langchain==0.0.240 \
    xformers==0.0.20 \
    bitsandbytes==0.41.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.1/179.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m8.1 MB

## Creating the Summarization pipeline

### Initializing the Hugging Face Pipeline for summarization

The first thing we need to do is initialize a `text-generation` pipeline with Hugging Face transformers. The Pipeline requires three things that we must initialize first, those are:

* A LLM, in this case it will be `meta-llama/Llama-2-7b-chat-hf`.

* The respective tokenizer for the model.

We initialize the model and move it to our CUDA-enabled GPU. Using Colab this can take 5-10 minutes to download and initialize the model.

In [15]:
# language_model_wrapper.py
# TO DO: IMPORT SETTINGS

import transformers
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline


class WrapLlm():
    def __init__(self,
                 hf_auth,
                 model_name):
        self.hf_auth = hf_auth
        self.model_name = model_name

    def initialize_llama_model_tokenizer(self):
        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        # set quantization configuration to load large model with less GPU memory
        # this requires the `bitsandbytes` library
        bnb_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

        # begin initializing HF items, need auth token for these
        hf_auth = self.hf_auth
        model_config = transformers.AutoConfig.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_auth
        )

        model = transformers.AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            config=model_config,
            quantization_config=bnb_config,
            device_map='auto',
            use_auth_token=self.hf_auth
        )
        model.eval()
        print(f"Model loaded on {device}")

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_name,
            use_auth_token=self.hf_auth
        )
        return model, tokenizer


    #def create_llama_2_7b_wrapper(settings: Settings):
    def create_llama_wrapper(self):

        model, tokenizer = self.initialize_llama_model_tokenizer()

        generate_text = transformers.pipeline(
            model=model, tokenizer=tokenizer,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # we pass model parameters here too
            temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=1048,  # mex number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=generate_text)
        return llm


In [16]:
# TO DO : INCLUDE THIS INTO THE MAIN CODE NOT TO ONE OF THE CLASSES BECAUSE THIS NEED TO BE EXECUTED ONLY ONCE
# TO DO: ADD from src.languae_model_wrapper import WrapLlm
import json

# Specify the path to settings.local.json file
settings_file_path = '/content/instadeep-llm-technical-test/settings.local.json'

# Read JSON data from the file
with open(settings_file_path, 'r') as file:
    settings = json.load(file)

llama_2_7b_model_name = settings['llama2_7b_settings']['model_name']
hf_auth_token = settings['huggingface_settings']['hf_auth_token']

wrap_llm = WrapLlm(hf_auth=hf_auth_token, model_name=llama_2_7b_model_name)
llm = wrap_llm.create_llama_wrapper()

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

'\n nobody knows.\n\nThe COVID-19 pandemic has highlighted the importance of vaccination in preventing the spread of infectious diseases, but there is still much to be learned about the most effective ways to protect against COVID-19. While several vaccines have been developed and are being distributed around the world, it is important to recognize that no single vaccine will provide complete protection against COVID-19.\n\nOne of the biggest challenges in developing an effective COVID-19 vaccine is the incredible diversity of the virus itself. COVID-19 is caused by a coronavirus, which means that it can mutate quickly and easily, leading to new strains of the virus that may not be well-suited to existing vaccines. As a result, researchers are working on multiple fronts to develop vaccines that can provide broad protection against COVID-19, including:\n\n1. mRNA vaccines: These vaccines use a piece of genetic material called messenger RNA (mRNA) to instruct cells in the body to produce

In [17]:
# TO DO _ create summarize.py
import transformers
from torch import cuda, bfloat16
from langchain import PromptTemplate,  LLMChain


class Summarize():
    def __init__(self):
        pass


    def generate_summary(self, text, llm, how="chunk"):
        """
        Used mainly to summarize text.
        the text can be under 3 diffrent formats:
            - chunk: a single paragraph
            - list : a list of paragraphs
            - full : a full document - This is not recommended if we have large document that do not fit into memory
        Input: text_chunk, llm, how:("chunk","list", "full")
        Output: summary of text_chunk
        """
        # Defining the template to generate summary
        template = """
        Write a concise summary of the text, return your responses with 1-2 sentences that cover the key points of the text without generating any extra content.
        ```{text}```
        SUMMARY:
        """
        if how == "list":
            template = """
            Write a concise summary based the list of texts provided, return a coherent summary that covers the key points of the text without generating any extra content.
            ```{text}```
            SUMMARY:
            """
        elif how == "full":
            template = """
            Write a concise summary of the text, return your responses with 5 paragraphs that cover the key points of the text without generating any extra content.
            ```{text}```
            SUMMARY:
            """
        prompt = PromptTemplate(template=template, input_variables=["text"])
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        summary = llm_chain.run(text)
        return summary


In [32]:
# TO DO : Create embedding.py
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


class Embedding():
    def __init__(self):
        pass


    def initialize_hf_embeddings(self, embedding_model_name):

        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        embed_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={'device': device},
            encode_kwargs={'device': device, 'batch_size': 32}
        )
        return embed_model

In [33]:
# TO DO: Add this line: from src.embedding import Embedding
hf_embedding_model_name = settings['huggingface_settings']['model_name']

embedding = Embedding()
embed_model = embedding.initialize_hf_embeddings(hf_embedding_model_name)

In [21]:
# TO DO: create vectorstore.py
import os
import pinecone
import time
import logging
import re
import glob
import pandas as pd


class VectorStore():
    def __init__(self):
        # Set up logging
        # logging.basicConfig(level=logging.INFO) # In Google Colab, logs created using logging module are not directly displayed in the output cell
        pass

    def read_documents(self, folder_path):
        # Initialize an empty list to store data
        data_content = []

        # Get a list of all .txt files in the folder
        txt_files = glob.glob(os.path.join(folder_path, '*.txt'))

        # Loop through each file, read its content, and append to the list
        for doc_id, txt_file in enumerate(txt_files):
            try:
                file_path = os.path.join(folder_path, txt_file)
                print(f'Importing {file_path}')
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                    # Split content into documents based on "----"
                    documents = re.split(r'----', content)
                    file_name = os.path.basename(txt_file)

                    # Process each document
                    for chunk_id, document in enumerate(documents):
                        # Extract chunks based on "TITLE PARAGRAPH:"
                        chunks = re.split(r'TITLE PARAGRAPH:', document)

                        # Process each chunk
                        for sub_chunk_id, chunk in enumerate(chunks):
                            # Skip empty chunks
                            if not chunk.strip():
                                continue

                            # Extract chunk title
                            title_match = re.search(r'(.*?)\n', chunk)
                            chunk_title = title_match.group(1).strip() if title_match else None

                            data_content.append({
                                'file_name': file_name,
                                'chunk_id': f'{doc_id}-{chunk_id}-{sub_chunk_id}',
                                'doc_id': doc_id,
                                'chunk_title': chunk_title,
                                'chunk': chunk.strip(),
                                'chunk_length': len(chunk),
                                'doc':content,
                                'doc_length': len(content)
                            })
            except Exception as e:
                print(f"Error reading {txt_file}: {e}")

        # Create a Pandas DataFrame from the list
        data = pd.DataFrame(data_content)
        return data

    def initialize_pinecone_index(self, pinecone_api_key, pinecone_environment, index_name):
        # get API key from app.pinecone.io and environment from console
        pinecone.init(
            api_key=os.environ.get('PINECONE_API_KEY') or pinecone_api_key,
            environment=os.environ.get('PINECONE_ENVIRONMENT') or pinecone_environment
        )

        # Index initialisation
        if index_name not in pinecone.list_indexes():
            pinecone.create_index(
                index_name,
                dimension=384,
                metric='cosine'
            )
            # wait for index to finish initialization
            while not pinecone.describe_index(index_name).status['ready']:
                time.sleep(1)

        # connect to the index:
        index = pinecone.Index(index_name)

        # Log the index stats
        # logging.info("Pinecone index stats: %s", index.describe_index_stats())
        print(("Pinecone index stats: %s", index.describe_index_stats()))
        return index

    def create_pinecone_vectorstore(self, data, index, embed_model):
        # Embed and index the documents - This must only be done once we have new data to inject into the index
        # Note this method can be modified according to the use case: i.e. we can use the 'update' method to update an existing index
        batch_size = 32

        for i in range(0, len(data), batch_size):
            i_end = min(len(data), i+batch_size)
            batch = data.iloc[i:i_end]
            ids = [f"{x['chunk_id']}" for i, x in batch.iterrows()]
            texts = [x['chunk'] for i, x in batch.iterrows()]
            embeds = embed_model.embed_documents(texts)
            # get metadata to store in Pinecone
            metadata = [
                {'text': x['chunk'],
                'chunk_title': x['chunk_title'],
                'file_name': x['file_name'],
                'doc_id':x['doc_id']
                } for i, x in batch.iterrows()
            ]
            # add to Pinecone
            index.upsert(vectors=zip(ids, embeds, metadata))
        print(("Pinecone index stats: %s", index.describe_index_stats()))
        return index


In [49]:
# TO DO: Add this line: from src.vectorstore import VectorStore

pinecone_api_key = settings['pinecone_settings']['api_key']
pinecone_environment = settings['pinecone_settings']['environment']
pinecone_index_name = settings['pinecone_settings']['index_name']

input_documents_data_path = settings['data_paths']['inputs']['documents_folder_path']

vectorstore = VectorStore()
pinecone_index = vectorstore.initialize_pinecone_index(pinecone_api_key, pinecone_environment, pinecone_index_name)

data = vectorstore.read_documents(input_documents_data_path)

pinecone_index = vectorstore.create_pinecone_vectorstore(data, pinecone_index, embed_model)

('Pinecone index stats: %s', {'dimension': 384,
 'index_fullness': 0.00286,
 'namespaces': {'': {'vector_count': 286}},
 'total_vector_count': 286})
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/nanomaterials-10-00364-v2.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/s41392-022-01007-w.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/PMC8198544.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/82_2020_217.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/mRNA vaccines — a new era.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/pharmaceutics-12-00102-v2.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/Efficacy and Safety of the mRNA-1273 SARS-CoV-2 Vaccine.txt
Importing /content/instadeep-llm-technical-test/ressources/data/inputs/raw_text/s41591-

In [50]:
# TO DO : create rag.py

from langchain.vectorstores import Pinecone


class RAG():
    def __init__(self,
                 index,
                 embed_model):
        self.index = index
        self.embed_model = embed_model

    def get_top_k_documents(self, query, k=3):
        text_field = 'text'  # field in metadata that contains text content

        vectorstore = Pinecone(
            self.index, self.embed_model.embed_query, text_field
        )

        top_k_docs = vectorstore.similarity_search_with_score(
            query,  # the search query
            k=k  # returns top 3 most relevant chunks of text
        )
        return top_k_docs


    def doc_search(self, query, top_k = 3):
        search_results = list()
        metadata = dict()

        documents = self.get_top_k_documents(query, k=top_k)
        # Loop through the documents and get the metadata_cotent and the score
        for doc in documents:
          score = doc[-1]
          metadata = doc[0].metadata
          metadata['similarity_score'] = score
          search_results.append(metadata)

        # Create a result DataFrame
        res_df = pd.DataFrame(search_results)
        return res_df

In [51]:
# This is main
# TO DO: from src.rag import RAG
from datetime import datetime

def export_data(data, output_file_name, output_folder_path):
    '''helper function to export data'''
    # Get today's date with the hour
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save to_summarise_df to a CSV file with the current timestamp
    csv_filename = f'/{output_file_name}_{current_time}.csv'
    csv_data_path = output_folder_path + csv_filename
    data.to_csv(csv_data_path)


rag = RAG(pinecone_index, embed_model)

# TO DO: Read tis from the API / Input
paragraph = 'mRNA vaccines have become a versatile technology for the prevention of infectious diseases and the treatment of cancers.'
#top_k_docs = rag.get_top_k_documents(paragraph, k=3)
doc_search_result = rag.doc_search(paragraph, top_k = 3)

Unnamed: 0,chunk_title,doc_id,file_name,similarity_score
0,Conclusions and future directions,4.0,mRNA vaccines — a new era.txt,0.837178
1,Safety,4.0,mRNA vaccines — a new era.txt,0.809183
2,mRNA Vaccines Against Infectious Diseases,0.0,nanomaterials-10-00364-v2.txt,0.806486


In [52]:
# prepare the data to be summarized
to_summarise_df = (pd.merge(doc_search_result, data, on=['file_name', 'chunk_title'])
             .groupby(['file_name', 'chunk_title'])
             .first()
             .reset_index()[['file_name', 'chunk_title', 'doc', 'similarity_score']]
             .sort_values(by='similarity_score', ascending=False))

# Get all the rows to be summarized for the extracted documents:
# Add a dummy 'similarity_score' column to the data dataframe
data['similarity_score'] = None

# Merge the two dataframes based on the "file_name" column
merged_df = pd.merge(data, to_summarise_df[['file_name']], on='file_name')

# Filter the merged dataframe to keep only relevant columns
final_df = merged_df[['file_name', 'chunk_title', 'doc', 'similarity_score', 'chunk']]

Unnamed: 0,file_name,chunk_title,doc,similarity_score,chunk
0,nanomaterials-10-00364-v2.txt,ABSTRACT,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,ABSTRACT\n The use of messenger RNA (mRNA) in ...
1,nanomaterials-10-00364-v2.txt,Introduction,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Introduction\nAccording to the European Medici...
2,nanomaterials-10-00364-v2.txt,Structure of Synthetic IVT mRNA and Chemical M...,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Structure of Synthetic IVT mRNA and Chemical M...
3,nanomaterials-10-00364-v2.txt,Figure 2.,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Figure 2.\nRepresentative scheme of the IVT mR...
4,nanomaterials-10-00364-v2.txt,5' Cap,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,5' Cap\nEukaryotic native mRNA possesses a 5' ...
...,...,...,...,...,...
97,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: cont.) |
98,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: \nNone||None||Targets||Tria...
99,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: \nNone||None||Targets||Tria...
100,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,"DESCRIPTION TABLE: , Biomedical Advanced Resea..."


In [53]:
# TO DO: ADD from src.summarize import Summarize

# Apply the summarize_text_chunk method to each row
print('Summarizing the extracted chunks')
final_df['summarized_chunk'] = final_df['chunk'].apply(lambda x: summarize.generate_summary(x, llm, how="chunk"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['summarized_chunk'] = final_df['chunk'].apply(lambda x: summarize.generate_summary(x, llm, how="chunk"))


Unnamed: 0,file_name,chunk_title,doc,similarity_score,chunk,summarized_chunk
0,nanomaterials-10-00364-v2.txt,ABSTRACT,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,ABSTRACT\n The use of messenger RNA (mRNA) in ...,The use of mRNA in gene therapy has gained po...
1,nanomaterials-10-00364-v2.txt,Introduction,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Introduction\nAccording to the European Medici...,Gene therapy involves using genetic material ...
2,nanomaterials-10-00364-v2.txt,Structure of Synthetic IVT mRNA and Chemical M...,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Structure of Synthetic IVT mRNA and Chemical M...,The production of IVT mRNA is typically done ...
3,nanomaterials-10-00364-v2.txt,Figure 2.,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,Figure 2.\nRepresentative scheme of the IVT mR...,The figure depicts an illustration of the IVT...
4,nanomaterials-10-00364-v2.txt,5' Cap,ABSTRACT\n The use of messenger RNA (mRNA) in ...,,5' Cap\nEukaryotic native mRNA possesses a 5' ...,The 5' cap of eukaryotic mRNA is formed by th...
...,...,...,...,...,...,...
97,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: cont.) |,This text describes the benefits of using a s...
98,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: \nNone||None||Targets||Tria...,This table summarizes clinical trials for var...
99,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,DESCRIPTION TABLE: \nNone||None||Targets||Tria...,This table summarizes clinical trials for var...
100,mRNA vaccines — a new era.txt,,ABSTRACT\n Vaccines prevent many millions of i...,,"DESCRIPTION TABLE: , Biomedical Advanced Resea...",This table lists various biotechnology compan...


In [54]:
# Group by 'file_name' and aggregate the 'summarized_chunk' into a list
grouped_df = final_df.groupby('file_name')['summarized_chunk'].agg(list).reset_index()

# Merge the grouped dataframe back to to_summarise_df
to_summarise_df = pd.merge(to_summarise_df, grouped_df, on='file_name', how='left')

Unnamed: 0,file_name,chunk_title,doc,similarity_score,summarized_chunk
0,mRNA vaccines — a new era.txt,Conclusions and future directions,ABSTRACT\n Vaccines prevent many millions of i...,0.837178,[ * Vaccines prevent millions of illnesses and...
1,mRNA vaccines — a new era.txt,Safety,ABSTRACT\n Vaccines prevent many millions of i...,0.809183,[ * Vaccines prevent millions of illnesses and...
2,nanomaterials-10-00364-v2.txt,mRNA Vaccines Against Infectious Diseases,ABSTRACT\n The use of messenger RNA (mRNA) in ...,0.806486,[ The use of mRNA in gene therapy has gained p...


In [61]:
# We use this exception handling in case we encounter a out of memory issue
# In this case, we get the full summary by joining the summaries of all chunks
print('Summarizing the extracted papers')
try:
    to_summarise_df['doc_summary'] = to_summarise_df['summarized_chunk'].apply(lambda text_list: rag.generate_summary(text_list, llm, how="list"))
    print('Documents were summarized using an LLM')
except Exception as e:
    print(f"Exception during summarization: {e}")
    to_summarise_df['doc_summary'] = to_summarise_df['summarized_chunk'].apply(lambda text_list: '\n'.join(text_list))
    print('Documents were summarized using joining of summarized chunks')
summarized_retrieved_data = to_summarise_df

Summarizing the extracted papers
Exception during summarization: 'RAG' object has no attribute 'generate_summary'
Documents were summarized using joining of summarized chunks


In [62]:
# Export the summarized data

output_file_name = 'summarized_retrieved_data'
output_folder_path = settings['data_paths']['outputs']['summarized_retrieved_data_path']

summarized_documemts = summarized_retrieved_data[['file_name', 'chunk_title', 'similarity_score', 'doc_summary']]
# summarised_documemts = to_summarise_df[['file_name', 'chunk_title', 'similarity_score', 'summarized_chunk']]

export_data(summarized_documemts, output_file_name, output_folder_path)

#TO DO Wrap the lines after paragraph into a method and add all of it into a class AND add if __main__: run the class

In [71]:
# TO DO : create validate.py

import json
import pandas as pd
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, f1_score
# from src.rag import RAG

#TO DO read settings


class Validate():
    def __init__(self):
        pass

    def predict(self, rag, val_df, query):
        # Get the most similar document
        val_df['top_3_doc'] = val_df['chunk'].apply(lambda query: rag.get_top_k_documents(query, k=3))

        # Update the similarity score to be 0 or 1:
        # All scores >= to 0.5 are considered 1
        # Note: I set the threshhold to 0.5 based on my experiments, but it can be updated upon further inspection, new data or other factors
        val_df['is_similar_pred'] = val_df['top_3_doc'].apply(lambda d: 0 if d[0][-1] < 0.5 else 1)
        return val_df



    def get_performance_metrics(self, val_df, output_metrics_path):
        val_df = self.predict(rag, val_df, query)

        # Get today's date with the hour
        current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Evaluate precision, recall, and F1 score
        precision = precision_score(val_df['is_similar'], val_df['is_similar_pred'])
        recall = recall_score(val_df['is_similar'], val_df['is_similar_pred'])
        f1 = f1_score(val_df['is_similar'], val_df['is_similar_pred'])

        metrics = {
            'precision' : precision,
            'recall' : recall,
            'f1_score' : f1
        }

        # Export the metrics as JSON
        with open(output_metrics_path + f'/validation_metrics_{current_time}.json', 'w') as file:
            json.dump(metrics, file, indent=4)
        return metrics

# TO DO COMPLETE THIS:
#if __main__:


In [72]:
# Read the validation data
validation_data_path = settings['data_paths']['inputs']['validation_data_path']

val_df = pd.read_excel(validation_data_path)

# Run the validation
validate = Validate() #TO DO : Del this line later:

rag = RAG(pinecone_index, embed_model)
output_metrics_path = settings['data_paths']['outputs']['metrics_path']
output_file_name = 'val_data'
predicted_validation_data_path = settings['data_paths']['outputs']['predicted_validation_data_path']

metrics = validate.get_performance_metrics(val_df, output_metrics_path)
# Export the validation data with predictions
export_data(data=val_df, output_file_name=output_file_name, output_folder_path=predicted_validation_data_path)

In [73]:
metrics

{'precision': 1.0,
 'recall': 0.9166666666666666,
 'f1_score': 0.9565217391304348}

In [None]:
import json
import pandas as pd
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, f1_score
# from src.rag import RAG

# TO DO: import necessary modules and classes for the missing parts
# from src.rag import RAG
# from your_module import export_data

# TO DO: Read settings
# settings = ...

class Validate():
    def __init__(self):
        pass

    def predict(self, rag, val_df, query):
        # Get the most similar document
        val_df['top_3_doc'] = val_df['chunk'].apply(lambda query: rag.get_top_k_documents(query, k=3))

        # Update the similarity score to be 0 or 1:
        # All scores >= to 0.5 are considered 1
        # Note: I set the threshold to 0.5 based on my experiments, but it can be updated upon further inspection, new data or other factors
        val_df['is_similar_pred'] = val_df['top_3_doc'].apply(lambda d: 0 if d[0][-1] < 0.5 else 1)
        return val_df

    def get_performance_metrics(self, val_df, output_metrics_path):
        val_df = self.predict(rag, val_df, query)

        # Get today's date with the hour
        current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Evaluate precision, recall, and F1 score
        precision = precision_score(val_df['is_similar'], val_df['is_similar_pred'])
        recall = recall_score(val_df['is_similar'], val_df['is_similar_pred'])
        f1 = f1_score(val_df['is_similar'], val_df['is_similar_pred'])

        metrics = {
            'precision' : precision,
            'recall' : recall,
            'f1_score' : f1
        }

        # Export the metrics as JSON
        with open(output_metrics_path + f'/validation_metrics_{current_time}.json', 'w') as file:
            json.dump(metrics, file, indent=4)
        return metrics

if __name__ == "__main__":
    # Read the validation data
    validation_data_path = settings['data_paths']['inputs']['validation_data_path']
    val_df = pd.read_excel(validation_data_path)

    # TO DO: Initialize other necessary variables (e.g., rag, output_metrics_path, output_file_name, predicted_validation_data_path)

    # Run the validation
    validate = Validate()
    metrics = validate.get_performance_metrics(val_df, output_metrics_path)

    # Export the validation data with predictions
    export_data(data=val_df, output_file_name=output_file_name, output_folder_path=predicted_validation_data_path)
