# Load Libraries

In [None]:
from torch import cuda
from pinecone import Pinecone
from pinecone import ServerlessSpec

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader


from dotenv import load_dotenv, find_dotenv
import os

from tqdm import tqdm


  from tqdm.autonotebook import tqdm


## Load Env Varibale from .env

In [3]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

# Load Embedding Model from Hugging Face

In [4]:

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)





  warn_deprecated(


In [5]:
device

'cuda:0'

# Connect to pinecone

In [6]:

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
# configure client
pc = Pinecone(api_key=pinecone_api_key)

spec = ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 

## Create and/or Connect to index

In [7]:

index_name = 'llama-2-fin-rag-proto'

import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)


In [8]:
# connect to index
index = pc.Index(index_name)
# view index stats
print(index.describe_index_stats())

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17268}},
 'total_vector_count': 17268}


# Load in HTML files

In [9]:
def get_all_file_paths(directory):
    file_paths = []
    
    # Walk the directory tree
    for root, _, files in os.walk(directory):
        for file in files:
            # Construct the full file path and add it to the list
            file_paths.append(os.path.join(root, file))
    
    return file_paths
# Example usage
directory_path = "..\data\\raw\sec-edgar-filings"
file_paths = get_all_file_paths(directory_path)

## Metadata Dictionaries

In [10]:
meta_dics = []

for path in file_paths:
    
    path_split = path.split('\\')
    com = path_split[4]
    report_type = path_split[5]
    year = '20' + path_split[6].split('-')[1]

    
    meta_data = {
                'Metadata':
                 {'Company': com,
                 'Report Type':report_type,
                 'Year' : int(year)},
                 'path' : path
                 }
    
    meta_dics.append(meta_data)



    


In [12]:
def load_html_as_text(file_path):

    loader = UnstructuredHTMLLoader(file_path)

    return loader.load()

In [13]:
def chunk_data(docs,chunk_size=600,chunk_overlap=100):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc

In [14]:
text_path = file_paths[0]

doc = load_html_as_text(text_path)
split_docs = chunk_data(doc)

print(f"File Loaded and Chunked:{text_path}")

File Loaded and Chunked:..\data\raw\sec-edgar-filings\AAPL\10-K\0000320193-19-000119\full-submission.txt


In [15]:
# 
print(F"Document was split into {len(split_docs)} different chunks ")

Document was split into 581 different chunks 


In [16]:
len(split_docs[0].page_content)

464

In [17]:
split_docs[0].page_content

'UNITED STATES\n\nSECURITIES AND EXCHANGE COMMISSION\n\nWashington, D.C. 20549\n\nFORM\n\n10-K\n\n(Mark One)\n\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the fiscal year ended\n\nSeptember 28, 2019\n\nor\n\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the transition period from\n\nto\n\nCommission File Number:\n\n001-36743\n\nApple Inc.\n\n(Exact name of Registrant as specified in its charter)'

In [18]:
embeds = embed_model.embed_documents([chunk.page_content for chunk in split_docs])

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [19]:
len(embeds), len(embeds[0])


print(f"Each of the {len(embeds)} chunks is represented by a {len(embeds[0])} vector")

Each of the 581 chunks is represented by a 384 vector


# Vectorize and Store in PineCone Vectore DB

In [20]:
load_into_pinecone = False 

if load_into_pinecone:

    for i, filling in tqdm(enumerate(meta_dics), total=len(meta_dics)):

        
        
        path = filling['path']

        doc = load_html_as_text(path)
        split_docs = chunk_data(doc)

        embeds = embed_model.embed_documents([chunk.page_content for chunk in split_docs])
        metadata = [filling['Metadata'] for i in range(len(embeds))]
        metadata = [{**metadata[i], 'text': chunk.page_content}  for i,chunk in enumerate(split_docs)]

        ids = [f"{filling['Metadata']['Company']}-{filling['Metadata']['Report Type']}-{filling['Metadata']['Year']}-{i}" for i in range(len(embeds))]

        if len(ids)<1000:
            index.upsert(vectors=zip(ids, embeds, metadata))
        else:
            index.upsert(vectors=zip(ids[:1000], embeds[:1000], metadata[:1000]))
            index.upsert(vectors=zip(ids[1000:], embeds[1000:], metadata[1000:]))

        
            






# Query Pinecone

In [21]:
query = "What is Tesla Revenue Growth in 2022"

vector_query= embed_model.embed_query(query)

index.query(vector=vector_query, top_k=5, filter={
        "Company": {"$eq":"TSLA"}
    } )

{'matches': [{'id': 'TSLA-10-K-2023-432', 'score': 0.771090925, 'values': []},
             {'id': 'TSLA-10-K-2023-447', 'score': 0.701928496, 'values': []},
             {'id': 'TSLA-10-K-2020-569', 'score': 0.692607939, 'values': []},
             {'id': 'TSLA-10-K-2023-368', 'score': 0.6250965, 'values': []},
             {'id': 'TSLA-10-K-2023-434', 'score': 0.618884504, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [22]:
from langchain_pinecone import PineconeVectorStore



vectorstore = PineconeVectorStore(index, embed_model)

vectorstore.similarity_search(
    query,  # the search query
    k=3 , # returns top 3 most relevant chunks of text
    filter={
        "Company": {"$eq":"TSLA"}}
)
     

[Document(metadata={'Company': 'TSLA', 'Report Type': '10-K', 'Year': 2023.0}, page_content='Services and other revenue increased $2.29 billion, or 60%, in the year ended December 31, 2022 as compared to the year ended December 31, 2021. The change is primarily due to increase in used vehicle revenue driven by increases in volume and average selling prices of used Tesla and non-Tesla vehicles, non-warranty maintenance services revenue as our fleet continues to grow, paid Supercharging revenue, insurance services revenue and retail merchandise revenue.\n\n37\n\nEnergy Generation and Storage Segment'),
 Document(metadata={'Company': 'TSLA', 'Report Type': '10-K', 'Year': 2023.0}, page_content='Cost of services and other revenue increased $1.97 billion, or 51%, in the year ended December 31, 2022 as compared to the year ended December 31, 2021. The change is primarily due to an increase in used vehicle cost of revenue driven by increases in volume and costs of used Tesla and non-Tesla veh

# Load in Hugging Face Model


In [12]:

from torch import cuda, bfloat16
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig, BitsAndBytesConfig, AutoConfig
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

hf_auth = os.environ.get("HUGGING_FACE_AUTH")

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

quanto_config = QuantoConfig(
    weights='int8'

)

# begin initializing HF items, need auth token for these
hf_auth = os.environ.get("HUGGING_FACE_AUTH")


model_config = AutoConfig.from_pretrained(
    model_id,
    token=hf_auth
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    # quantization_config=quanto_config,
    token=hf_auth,
    device_map=device
)
model.eval()
print(f"Model loaded on {device}")

Loading checkpoint shards:  50%|█████     | 1/2 [01:12<01:12, 72.02s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 

In [25]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [26]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.00,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1
      # without this output begins repeating
)

In [32]:
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

llm = HuggingFacePipeline(pipeline=generate_text)

vectorstore = PineconeVectorStore(index, embed_model)

filter = {"Company": {"$eq":"TSLA"}}
retriever = vectorstore.as_retriever(search_kwargs={"filter":filter, "k":20})



custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are an expert in finance who is ready to question answering tasks. Use the context below to answer the question. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use five sentences maximum and keep the answer concise.
    Context: {context}
    Question: {question}
    Answer:
    """
)

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type='map_reduce',
    retriever=retriever,
    
    )

In [33]:
query = "What is Tesla's total revenue for 2020,2021,2022,2023"

response = rag_pipeline.invoke(query)

In [34]:
print(response['result'])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Services and other revenue increased $2.29 billion, or 60%, in the year ended December 31, 2022 as compared to the year ended December 31, 2021. The change is primarily due to increase in used vehicle revenue driven by increases in volume and average selling prices of used Tesla and non-Tesla vehicles, non-warranty maintenance services revenue as our fleet continues to grow, paid Supercharging revenue, insurance services revenue and retail merchandise revenue.

37

Energy Generation and Storage Segment

Cost of services and other revenue increased $1.97 billion, or 51%, in the year ended December 31, 2022 as compared to the year ended December 31, 2021. The change is primarily due to an increase in used vehicle cost of revenue driven by increases in volume and costs of used Tesla and non-Tesla vehicle sales, an increase in n