In [1]:
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import OnlinePDFLoader
from langchain.vectorstores import Chroma

import os
import torch

  from .autonotebook import tqdm as notebook_tqdm
2023-04-08 07:02:27.874360: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Checkpoint to load. The base LLaMA-7B checkpoint is kept as a commented-out
# alternative.
# used_model = "decapoda-research/llama-7b-hf"
used_model = "chavinlo/gpt4-x-alpaca"

# Single-entry device map keyed by "" whose value is the LOCAL_RANK environment
# variable as an int; falls back to 0 when the variable is unset or empty.
device_map = {"": int(os.getenv("LOCAL_RANK") or 0)}

# True  -> build the vector store from the loader and persist it.
# False -> reopen the store already persisted in `persist_directory`.
create_db = True
persist_directory = "/app/VectorStore"

In [3]:
# Load the tokenizer and the causal-LM weights for `used_model`.
tokenizer = LlamaTokenizer.from_pretrained(used_model)
base_model = LlamaForCausalLM.from_pretrained(
    used_model,
    load_in_8bit=True,
    device_map=device_map,
    offload_folder="/app/models_gpt/",
    # low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# Text-generation pipeline wrapping the loaded model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    # NOTE(review): max_length bounds prompt + generated tokens; 4000 may be
    # larger than the model's context window -- confirm against the checkpoint.
    max_length=4000,
    # temperature / top_p only take effect when sampling is enabled. Without
    # do_sample=True generation is greedy and both settings are ignored.
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2,
)

# Expose the pipeline through LangChain's LLM interface so the chains in the
# later cells can call it.
llm = HuggingFacePipeline(pipeline=pipe)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
  else: return ct.c_void_p(A.data.storage().data_ptr())
Loading checkpoint shards: 100%|██████████| 33/33 [01:13<00:00,  2.24s/it]


In [4]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Embedding model handed to the vector store further down. `query_instruction`
# is the instruction text supplied for query embeddings.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    query_instruction="Represent the query for retrieval: ",
)

# Exploration only: swap in the default HuggingFaceEmbeddings model instead.
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings()

load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
# Loader for the AXA PDF hosted at the URL below; it is consumed when the
# vector store is built.
loader = OnlinePDFLoader(
    "https://www-axa-com.cdn.axa-contento-118412.eu/www-axa-com/85ec0278-bf2f-4392-94b9-c086717fa8f6_axa_urd2022_accessible_va.pdf"
)

In [6]:
if not create_db:
    # Reopen the store that an earlier run persisted to `persist_directory`.
    vectorstore = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

else:
    # Build the index from scratch: load the PDF, split it into chunks, embed
    # the chunks and store them in Chroma.
    vectorstore_type = Chroma
    text_splitter = CharacterTextSplitter(chunk_overlap=25, chunk_size=450)

    index_creator = VectorstoreIndexCreator(
        vectorstore_cls=vectorstore_type,
        embedding=embeddings,
        text_splitter=text_splitter,
        vectorstore_kwargs={"persist_directory": persist_directory},
    )
    index = index_creator.from_loaders([loader])

    # Keep a handle on the underlying store and write it to disk so a later
    # run can take the load-only branch.
    vectorstore = index.vectorstore
    vectorstore.persist()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
Using embedded DuckDB with persistence: data will be stored in: /app/VectorStore


: 

: 

In [None]:
# Retrieval smoke test: fetch the documents the store considers relevant to a
# sample question. The query text is kept verbatim (including its spelling).
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents("what are the enviroment goals of AXA?")

In [None]:
# Bare last expression: shows the documents retrieved in the previous cell as
# the cell output.
docs

In [None]:
from langchain.chains import RetrievalQA

# Retrieval QA chain: documents come from the vector store's retriever and the
# answer is generated by `llm`. The chain expects its input under "question".
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    input_key="question",
)

In [None]:
# Question posed to the retrieval QA chain; `query` is reused by the cells
# that follow.
query = "What are the AXA Goals for the future?"
# Last expression of the cell, so the value returned by the chain is shown as
# the cell output.
chain.run(query)

In [None]:
# Inspect which documents the retriever returns for `query`.
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents(query)
print(docs)

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# QA-with-sources chain driven by the same `llm`.
chain_w_sources = load_qa_with_sources_chain(llm, chain_type="stuff")

# Retrieve the documents for `query` explicitly and hand them to the chain
# together with the question; only the chain's outputs are returned.
relevant_docs = vectorstore.as_retriever().get_relevant_documents(query)
chain_w_sources(
    {"input_documents": relevant_docs, "question": query},
    return_only_outputs=True,
)
