In [1]:
from torch import cuda, bfloat16
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig, AutoConfig
import transformers

from langchain_huggingface import HuggingFaceEmbeddings

from pinecone import Pinecone
from pinecone import ServerlessSpec

from torch import cuda
from dotenv import load_dotenv, find_dotenv

import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

# Loading Embedding Model

In [3]:

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)




# Connect to Pincone

In [5]:
index_name = 'llama-2-fin-rag-proto'

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
# configure client
pc = Pinecone(api_key=pinecone_api_key)

spec = ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 

# connect to index
index = pc.Index(index_name)
# view index stats
print(index.describe_index_stats())

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17268}},
 'total_vector_count': 17268}


## Load in LLama Model

In [6]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

hf_auth = os.environ.get("HUGGING_FACE_AUTH")

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

quanto_config = QuantoConfig(
    weights='int8'

)

# begin initializing HF items, need auth token for these
hf_auth = os.environ.get("HUGGING_FACE_AUTH")


model_config = AutoConfig.from_pretrained(
    model_id,
    token=hf_auth
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=quanto_config,
    token=hf_auth,
    device_map=device
)
model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [01:03<00:00, 31.99s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (v_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (o_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): QLinear(in_features=4096, out_features=11008, bias=False)
          (up_proj): QLinear(in_features=4096, out_features=11008, bias=False)
          (down_proj): QLinear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [12]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1
      # without this output begins repeating
)

In [13]:
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore


llm = HuggingFacePipeline(pipeline=generate_text)
vectorstore = PineconeVectorStore(index, embed_model)
filter = {"Company": {"$eq":"TSLA"}}
retriever = vectorstore.as_retriever(search_kwargs={"filter":filter, "k":20})



custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are an expert in finance who is ready for question answering tasks. Use the context below to answer the question. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use five sentences maximum and keep the answer concise.
    Context: {context}
    Question: {question}
    Answer:
    """
)

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type='map_reduce',
    retriever=retriever,
    
    )

In [14]:
query = "What is Tesla's total revenue for 2020,2021,2022"

response = rag_pipeline.invoke(query)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Token indices sequence length is longer than the specified maximum sequence length for this model (5819 > 1024). Running this sequence through the model will result in indexing errors


ValueError: A single document was longer than the context length, we cannot handle this.