In [3]:
!pip install -q -U langchain_experimental langchain_openai
!pip install -q -U sentence-transformers
!pip install -q -U pinecone-text
!pip install -q -U langchain-pinecone
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U accelerate
!pip install -q -U langchain_community

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.8.2 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.

In [4]:
import getpass
import os
import logging

import pandas as pd
import torch
import transformers
import bitsandbytes
import accelerate

from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders.csv_loader import CSVLoader

In [5]:
# Default configuration values for the ARS class.
# The `_ARS__` prefix is deliberate: it is exactly the name that Python's private-name
# mangling produces for a `__NAME` identifier written inside `class ARS`, which is how
# the class signature picks these values up as parameter defaults. Renaming any of
# these constants therefore requires a matching change in the ARS class.
_ARS__MAX_NEW_TOKENS = 512  # generation cap: maximum number of new tokens per answer
_ARS__TEMPERATURE = 0.1  # sampling temperature
_ARS__TOP_P = 0.1  # nucleus-sampling threshold
_ARS__K = 3  # number of documents retrieved from the index as context
_ARS__BREAKPOINT_THRESHOLD_TYPE = "percentile"  # passed to SemanticChunker as breakpoint_threshold_type
_ARS__BREAKPOINT_THRESHOLD_AMOUNT = 95  # passed to SemanticChunker as breakpoint_threshold_amount

class ARSDocumentLoader():
    r"""
    A class for loading documents from various file formats.

    Currently supported file extensions: csv.

    Loading is best-effort per file: a missing file or an unsupported extension is
    logged, recorded in the returned error list, and skipped; the remaining paths
    are still processed.

    Example:
        documents, errors = ARSDocumentLoader.get_documents_from_files("file1.csv", "file2.csv")
    """

    @classmethod
    def get_documents_from_files(cls, *paths: str):
        r"""
        Retrieve documents from multiple files.

        This method takes file paths as input, checks the existence of the files,
        and extracts documents from supported file formats.

        Args:
            *paths (str): Variable-length list of file paths from which to load documents.

        Returns:
            List, List: A tuple containing a list of documents loaded from the files
                and a list of errors encountered during the loading process.
                Each error is a dict of the form {"path": <path>, "error": <message>}.

        Example:
            documents, errors = ARSDocumentLoader.get_documents_from_files("file1.csv", "file2.csv")
        """
        # Extension -> loader dispatch table. Supporting a new format only requires
        # adding a private loader method and registering it here.
        loaders = {
            "csv": cls.__get_documents_from_csv,
        }

        documents = []
        errors = []

        for path in paths:
            if not os.path.exists(path):
                message = f"File does not exist for path: {path}"
                logging.error(message)
                errors.append({"path": path, "error": message})
                continue

            # Derive the extension from the file name only, so that a dot in a
            # directory name (e.g. "data.v1/articles") is not mistaken for one.
            file_name = os.path.basename(path)
            file_extension = file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else ""

            loader = loaders.get(file_extension)
            if loader is None:
                message = f"File extension is not supported: {file_extension}"
                logging.error(message)
                errors.append({"path": path, "error": message})
                continue

            documents.extend(loader(path))

        return documents, errors

    @classmethod
    def __get_documents_from_csv(cls, path: str):
        r"""
        Wrapper for langchain CSVLoader
        https://python.langchain.com/docs/integrations/document_loaders/csv/

        Args:
            path (str): Path to the CSV file.

        Returns:
            The result of CSVLoader(path).load().
        """
        return CSVLoader(path).load()

class ARS():
    r"""
    Article Retrieval System (ARS).

    The RAG (Retrieval-Augmented Generation) model combines retrieval-based and generation-based approaches
    to generate responses to user queries or prompts.

    Args:
        llama_model_path (str): Path to the llama-3-chat-hf language model.
        embedding_model (str): Name of the embedding model on huggingface.
        index_name (str): Name of the index on Pinecone.
        max_new_tokens (int, optional): The maximum number of tokens to generate. Default is 512.
        temperature (float, optional): The sampling temperature. Default is 0.1.
        top_p (float, optional): The nucleus sampling top-p threshold. Default is 0.1.
        k (int, optional): The number of documents to retrieve for context. Default is 3.
        breakpoint_threshold_type (str, optional): The type of breakpoint threshold for
            semantic chunking. Default is "percentile".
        breakpoint_threshold_amount (int, optional): The amount of breakpoint threshold for
            semantic chunking. Default is 95.

    Notes:
        This class relies on several external dependencies.
        For a list of dependencies and installation instructions, please refer to the project's README.md file.
    """

    # The defaults below are module-level constants. They are spelled out with their
    # full `_ARS__` names (which is what `__NAME` would be mangled to inside this class)
    # so the dependency on those globals is visible at a glance.
    def __init__(self, llama_model_path: str, embedding_model: str, index_name: str,
                 max_new_tokens: int = _ARS__MAX_NEW_TOKENS, temperature: float = _ARS__TEMPERATURE,
                 top_p: float = _ARS__TOP_P, k: int = _ARS__K,
                 breakpoint_threshold_type: str = _ARS__BREAKPOINT_THRESHOLD_TYPE,
                 breakpoint_threshold_amount: int = _ARS__BREAKPOINT_THRESHOLD_AMOUNT):
        self.document_loader = ARSDocumentLoader

        self.pipeline = transformers.pipeline(
            "text-generation",
            model=llama_model_path,
            model_kwargs={
                "torch_dtype": torch.float16,
                "quantization_config": {"load_in_8bit": True},
                "low_cpu_mem_usage": True,
            }
        )

        self.embedding_model = HuggingFaceEmbeddings(model_name=embedding_model)

        # Keep the chunking settings on the instance: change_config() reads them as
        # its fallback values and needs them to rebuild the splitter.
        self.breakpoint_threshold_type = breakpoint_threshold_type
        self.breakpoint_threshold_amount = breakpoint_threshold_amount
        self.text_splitter = self.__build_text_splitter()

        self.index = PineconeVectorStore(embedding=self.embedding_model,
                                         index_name=index_name)

        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.k = k

    def __build_text_splitter(self):
        r"""
        Build a SemanticChunker from the instance's embedding model and its current
        breakpoint threshold settings.
        """
        return SemanticChunker(
            self.embedding_model,
            breakpoint_threshold_type=self.breakpoint_threshold_type,
            breakpoint_threshold_amount=self.breakpoint_threshold_amount
        )

    def change_config(self, **kwargs):
        """
        Change the configuration parameters of the Article Retrieval System (ARS).

        Parameters that are not passed keep their current value; keyword arguments
        other than the ones listed below are ignored.

        Args:
            **kwargs: Keyword arguments representing the configuration parameters to be changed.
                Supported parameters:
                - k (int, optional): The number of documents to retrieve for context.
                - max_new_tokens (int, optional): The maximum number of tokens to generate.
                - temperature (float, optional): The sampling temperature.
                - top_p (float, optional): The nucleus sampling top-p threshold.
                - breakpoint_threshold_type (str, optional): The type of breakpoint threshold for semantic chunking.
                - breakpoint_threshold_amount (float, optional): The amount of breakpoint threshold for semantic chunking.

        Returns:
            None
        """
        self.k = kwargs.get("k", self.k)
        self.max_new_tokens = kwargs.get("max_new_tokens", self.max_new_tokens)
        self.temperature = kwargs.get("temperature", self.temperature)
        self.top_p = kwargs.get("top_p", self.top_p)

        threshold_type = kwargs.get("breakpoint_threshold_type", self.breakpoint_threshold_type)
        threshold_amount = kwargs.get("breakpoint_threshold_amount", self.breakpoint_threshold_amount)

        # The splitter captures its thresholds at construction time, so it has to be
        # rebuilt for a change to affect documents added afterwards.
        if (threshold_type, threshold_amount) != (self.breakpoint_threshold_type,
                                                  self.breakpoint_threshold_amount):
            self.breakpoint_threshold_type = threshold_type
            self.breakpoint_threshold_amount = threshold_amount
            self.text_splitter = self.__build_text_splitter()

    def add_context_from_files(self, *paths: str):
        r"""
        Adds context from files to the vector database.

        The files are loaded, split into chunks by the semantic text splitter, and
        the chunks are added to the index.

        Args:
            *paths (str): Paths to the files to be processed.

        Returns:
            tuple: A tuple containing processed documents and errors.
                - list: Documents obtained from files (before splitting).
                - list: Errors encountered during processing files.
        """

        documents, errors = self.document_loader.get_documents_from_files(*paths)

        splitted_documents = self.text_splitter.split_documents(documents)

        self.index.add_documents(splitted_documents)

        return documents, errors

    def query(self, query: str, **kwargs):
        r"""
        Query the model for an answer to the given query.

        The per-call keyword arguments override the instance configuration for this
        call only; they are not stored.

        Args:
            query (str): The query to be answered.
            **kwargs:
                k (int, optional): The number of documents to retrieve for context.
                max_new_tokens (int, optional): The maximum number of tokens to generate.
                temperature (float, optional): The sampling temperature.
                top_p (float, optional): The nucleus sampling top-p threshold.

        Returns:
            str: The generated answer.

        Raises:
            RuntimeError: If the language model pipeline is not initialized.
        """
        k = kwargs.get("k", self.k)
        max_new_tokens = kwargs.get("max_new_tokens", self.max_new_tokens)
        temperature = kwargs.get("temperature", self.temperature)
        top_p = kwargs.get("top_p", self.top_p)

        if not self.pipeline:
            logging.error("Language model pipeline is not initialized")
            raise RuntimeError("Language model pipeline is not initialized")

        user_context = self.__get_context_from_index(query, k)

        prompt = self.__create_prompt(query, user_context)

        # Stop on either the tokenizer's EOS token or the chat end-of-turn token.
        terminators = [
            self.pipeline.tokenizer.eos_token_id,
            self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # Sampling is enabled, so repeated calls may produce different answers.
        outputs = self.pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

        # The generated text starts with the prompt; return only what follows it.
        return outputs[0]["generated_text"][len(prompt):]

    def __get_context_from_index(self, query: str, k: int):
        r"""
        Get user context from the search index based on the query.

        Args:
            query (str): The query for similarity search.
            k (int): The number of documents to retrieve.

        Returns:
            str: The page contents of the retrieved documents, each preceded by a
                newline. Empty string if nothing is retrieved.
        """

        docs = self.index.similarity_search(query, k=k)

        return "".join("\n" + doc.page_content for doc in docs)

    def __create_prompt(self, query: str, context: str):
        r"""
        Creates prompt using query and context.

        Args:
            query (str): The question/query to be included in the prompt.
            context (str): The context to be included in the prompt.

        Returns:
            str: The generated prompt for model input, rendered (not tokenized)
                with the tokenizer's chat template.
        """

        user_message = f"Context: {context} \n Question:{query}"

        messages = [
            {"role": "system", "content": "Using provided context answer the question without saying that it is based on the context!"},
            {"role": "user", "content": user_message},
        ]

        return self.pipeline.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
        )

In [6]:
# Prompt for the key interactively so it is never written into the notebook source.
# NOTE(review): no key is passed to PineconeVectorStore explicitly in this notebook,
# so it presumably reads PINECONE_API_KEY from the environment — confirm.
os.environ["PINECONE_API_KEY"] = getpass.getpass("Pinecone API Key:")

Pinecone API Key: ····································


In [7]:
# Run configuration. The absolute /kaggle/input paths are environment-specific;
# adjust them when running elsewhere.
# Dataset directory. Keep the trailing slash: the file name is appended to it directly.
INPUT_PATH = "/kaggle/input/1300-towards-datascience-medium-articles-dataset/"
# Name of the Pinecone index used as the vector store.
INDEX_NAME = "rag"
# Local path of the text-generation model handed to ARS.
MODEL_PATH = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
# Hugging Face model name used for embeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [8]:
# Build the pipeline (loads the language model and the embedding model), then ingest the dataset.
rag = ARS(MODEL_PATH, EMBEDDING_MODEL, INDEX_NAME)
# The returned (documents, errors) pair is discarded here; loading problems are
# still reported through logging.error by the document loader.
# NOTE(review): every run of this cell calls add_documents on the index again —
# check whether re-running duplicates entries in the Pinecone index.
_, _ = rag.add_context_from_files(INPUT_PATH + 'medium.csv')

2024-04-21 22:48:21.576214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 22:48:21.576312: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 22:48:21.706870: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Open-ended question. Generation uses sampling (do_sample=True) and no seed is set,
# so the exact wording of the answer can differ between runs.
result = rag.query("Will computers take over the world?")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The question of whether computers will take over the world is a topic of ongoing debate and speculation. While advancements in AI have made significant progress in recent years, experts are still unclear how to develop actual intelligence. Even if deep learning matures to a point where neural networks are equivalent to the human brain, AI experts are still unsure how to develop actual intelligence.


In [11]:
# Definition-style question, answered with the instance's default retrieval and
# generation settings (no per-call overrides are passed).
result = rag.query("What is SQL?")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SQL is a standardised programming language designed for data storage and management. It allows one to create, parse, and manipulate data fast and easy.


In [12]:
# Ambiguous term: "bootstrap" has several meanings, so the answer depends on which
# passages the similarity search retrieves from the index.
result = rag.query("What is bootstrap?")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Bootstrap is a powerful, computer-based method for statistical inference without relying on too many assumptions. It's a resampling method that involves independently sampling with replacement from an existing sample data with the same sample size, and performing inference among these resampled data.
