In [1]:
from huggingface_hub import login, notebook_login
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline, BitsAndBytesConfig, AutoConfig
import torch
from textwrap import fill
from langchain.prompts import PromptTemplate
import locale
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

locale.getpreferredencoding = lambda: "UTF-8"

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" 

# Configuration for loading the model with CPU offloading
quantization_config = BitsAndBytesConfig(load_in_8bit_fp32_cpu_offload=True)

# Device mapping for model
device_map = {
    "model.embed_tokens": 0,
    "model.embed_positions": 0,
    "model.layers": 0,
    "model.norm": 0,
    "lm_head": 0
}

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map=device_map,
                                             quantization_config=quantization_config,
                                             trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

gen_cfg = GenerationConfig.from_pretrained(model_name)
gen_cfg.max_new_tokens=512
gen_cfg.temperature=0.0000001 
gen_cfg.return_full_text=True
gen_cfg.do_sample=True
gen_cfg.repetition_penalty=1.11

pipe=pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=gen_cfg
)

llm = HuggingFacePipeline(pipeline=pipe)

Unused kwargs: ['load_in_8bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
prompt_template_llama3 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the following context to generate a concise summary. Do not include any information that is not present in the context. If the context is insufficient to generate a summary, just say you don't have enough information to create a summary. Don't try to make up any information.

{context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt_template = prompt_template_llama3

prompt = PromptTemplate(
    input_variables=["context"],
    template=prompt_template,
)


In [6]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os

# get all .md files in the statements/ directory
md_files = [os.path.join('statements', fn) for fn in os.listdir('statements') if fn.endswith('.md')]

# create loaders for each .md file
loaders = [UnstructuredMarkdownLoader(fn) for fn in md_files]

chunked_md_doc = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    md_doc = loader.load()
    updated_md_doc = filter_complex_metadata(md_doc)
    print("Splitting text...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
    documents = text_splitter.split_documents(updated_md_doc)
    chunked_md_doc.extend(documents)

len(chunked_md_doc)

Loading raw document...statements/REGN_14-A.md
Splitting text...
Loading raw document...statements/SCHW_8-K.md
Splitting text...
Loading raw document...statements/AMZN_S-1.md
Splitting text...
Loading raw document...statements/V_14-A.md
Splitting text...
Loading raw document...statements/DUK_14-A.md
Splitting text...
Loading raw document...statements/JNJ_10-K.md
Splitting text...
Loading raw document...statements/CSCO_8-K.md
Splitting text...
Loading raw document...statements/MDLZ_10-Q.md
Splitting text...
Loading raw document...statements/ABBV_S-1.md
Splitting text...
Loading raw document...statements/TMUS_8-K.md
Splitting text...
Loading raw document...statements/MSFT_10-Q.md
Splitting text...
Loading raw document...statements/AMGN_14-A.md
Splitting text...
Loading raw document...statements/NEE_10-K.md
Splitting text...
Loading raw document...statements/VZ_10-K.md
Splitting text...
Loading raw document...statements/LLY_8-K.md
Splitting text...
Loading raw document...statements/NVDA_1

4528

In [7]:
%%time
embeddings = HuggingFaceEmbeddings()
db_pdf = FAISS.from_documents(chunked_md_doc, embeddings)

  warn_deprecated(


CPU times: user 1min 22s, sys: 2 s, total: 1min 24s
Wall time: 1min 6s


In [11]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load testing markdown files
test_md_files = [os.path.join('testing', f'{i}.md') for i in range(5)]

# Create loaders for each .md file
test_loaders = [UnstructuredMarkdownLoader(fn) for fn in test_md_files]

# Process and split the text in each testing markdown file
test_documents = []
for loader in test_loaders:
    print(f"Loading raw document... {loader.file_path}")
    test_md_doc = loader.load()
    updated_test_md_doc = filter_complex_metadata(test_md_doc)
    print("Splitting text...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
    test_documents.extend(text_splitter.split_documents(updated_test_md_doc))

# Print the total number of chunks
print(f"Total chunks from testing documents: {len(test_documents)}")

# Perform similarity search and generate responses for each chunk in the testing documents
responses = []
for doc in test_documents:
    query = doc.page_content
    # Perform similarity search using FAISS vector store
    similar_docs = db_pdf.similarity_search(query, k=5)
    # Combine the content of the similar documents
    context = " ".join([similar_doc.page_content for similar_doc in similar_docs])
    # Generate response using the fine-tuned LLM
    response = llm(prompt.format(context=context))
    responses.append(response)

# Print the responses
for i, response in enumerate(responses):
    print(f"Response for chunk {i+1}:\n{fill(response, width=80)}\n")


Loading raw document... testing/0.md
Splitting text...
Loading raw document... testing/1.md
Splitting text...
Loading raw document... testing/2.md
Splitting text...
Loading raw document... testing/3.md
Splitting text...
Loading raw document... testing/4.md
Splitting text...
Total chunks from testing documents: 90


  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)