In [1]:
from huggingface_hub import login, notebook_login
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline, BitsAndBytesConfig, AutoConfig
import torch
from textwrap import fill
from langchain.prompts import PromptTemplate
import locale
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

locale.getpreferredencoding = lambda: "UTF-8"

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" 

quantization_config = BitsAndBytesConfig()

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="cuda",
                                             quantization_config=quantization_config,
                                             trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

gen_cfg = GenerationConfig.from_pretrained(model_name)
gen_cfg.max_new_tokens=512
gen_cfg.temperature=0.0000001 
gen_cfg.return_full_text=True
gen_cfg.do_sample=True
gen_cfg.repetition_penalty=1.11

pipe=pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=gen_cfg
)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
prompt_template_llama3 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner, focusing on the most important figures and datapoints.
Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers.

{context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt_template = prompt_template_llama3

prompt = PromptTemplate(
    input_variables=["context"],
    template=prompt_template,
)


In [4]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os

# get all .md files in the statements/ directory
md_files = [os.path.join('statements', fn) for fn in os.listdir('statements') if fn.endswith('.md')]

# create loaders for each .md file
loaders = [UnstructuredMarkdownLoader(fn) for fn in md_files]

chunked_md_doc = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    md_doc = loader.load()
    updated_md_doc = filter_complex_metadata(md_doc)
    print("Splitting text...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
    documents = text_splitter.split_documents(updated_md_doc)
    chunked_md_doc.extend(documents)

len(chunked_md_doc)

Loading raw document...statements/REGN_14-A.md
Splitting text...
Loading raw document...statements/SCHW_8-K.md
Splitting text...
Loading raw document...statements/AMZN_S-1.md
Splitting text...
Loading raw document...statements/V_14-A.md
Splitting text...
Loading raw document...statements/DUK_14-A.md
Splitting text...
Loading raw document...statements/JNJ_10-K.md
Splitting text...
Loading raw document...statements/CSCO_8-K.md
Splitting text...
Loading raw document...statements/MDLZ_10-Q.md
Splitting text...
Loading raw document...statements/ABBV_S-1.md
Splitting text...
Loading raw document...statements/TMUS_8-K.md
Splitting text...
Loading raw document...statements/MSFT_10-Q.md
Splitting text...
Loading raw document...statements/AMGN_14-A.md
Splitting text...
Loading raw document...statements/NEE_10-K.md
Splitting text...
Loading raw document...statements/VZ_10-K.md
Splitting text...
Loading raw document...statements/LLY_8-K.md
Splitting text...
Loading raw document...statements/NVDA_1

4528

In [5]:
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs
)

vector_store = FAISS.from_documents(chunked_md_doc, embeddings)

  warn_deprecated(


In [13]:
from langchain.document_loaders import UnstructuredMarkdownLoader
import os

# Assume vector_store and llm are already defined and initialized

# Load and process each markdown file
test_md_files = [os.path.join('testing', f'{i}.md') for i in range(5)]

for fn in test_md_files:
    print(f"Loading raw document... {fn}")
    loader = UnstructuredMarkdownLoader(fn)
    test_md_doc = loader.load()
    
    # Use the entire document content as the query
    query = test_md_doc[0].page_content  # Assuming the loader returns a list of documents
    print(f"Performing similarity search for document: {fn}")
    similar_docs = vector_store.similarity_search(query, k=5)
    
    context = " ".join([similar_doc.page_content for similar_doc in similar_docs])
    response = llm(prompt.format(context=context))
    print(f"Response for document {fn}:\n{response}\n")

Loading raw document... testing/0.md
Performing similarity search for document: testing/0.md


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response for document testing/0.md:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner, focusing on the most important figures and datapoints.
Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers.

0 1. Name and Address of Reporting Person*QUAN NANCY (Last) (First) (Middle) THE COCA-COLA COMPANY ONE COCA-COLA PLAZA (Street) ATLANTA GA 30313 (City) (State) (Zip) QUAN NANCY (Last) (First) (Middle) THE COCA-COLA COMPANY ONE COCA-COLA PLAZA ATLANTA GA 30313 (City) (State) (Zip) 2. Issuer Name and Ticker or Trading Symbol COCA COLA CO [ KO ] 5. Relationship of Reporting Person(s) to Issuer (Check all applicable) Directo

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response for document testing/2.md:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner, focusing on the most important figures and datapoints.
Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers.

| 0 | · | Underlying supplement no. 1-I dated April 13, 2023: http://www.sec.gov/Archives/edgar/data/19617/000121390023029543/ea151873_424b2.pdf | | | | | |---:|:---|:--------------------------------------------------------------------------------------------------------------------------------------------------------| | 0 | · | Prospectus supplement and prospectus, each dated April 13, 2023: http://www.sec.gov/Archives/

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response for document testing/3.md:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to report key figures and datapoints that would be of use for other applications. Do this in a concise manner, focusing on the most important figures and datapoints.
Then answer these three questions in a concise manner:

1) What are the key takeaways from the context?
2) What key figures should focused on and what could be potentially drawn from them?
3) What are steps for further analysis?

Make sure to include specific details in your answers.

1 PART I. FINANCIAL INFORMATION 2 2 Item 1. Financial Statements 2 3 Consolidated Statements of Income for the Three and Six Months Ended June 30, 2024 and July 2, 2023 4 Consolidated Statements of Comprehensive Income for the Three and Six Months Ended June 30, 2024 and July 2, 2023 5 Consolidated Balance Sheets as of June 30, 2024 and December 31, 2023 6 Consolidated Statements of Cash Flows for the Six Months Ended Ju