### download data

In [None]:
# from sec_edgar_downloader import Downloader

# # Provide your email address (as required by SEC)
# dl = Downloader(email_address="your_email@example.com", download_folder="./sec_data", company_name="school")

# dl.get(
#     "10-K",
#     ticker_or_cik = "AAPL",
#     after = "2015-01-01"
# )

# dl.get(
#     "10-K",
#     ticker_or_cik = "MSFT",
#     after = "2015-01-01"
# )


10

In [2]:
import requests
import os
import time

CIK = "0000320193"  # Apple
EMAIL = "your_email@example.com"  # Use your real email
SAVE_FOLDER = "apple_10k_txts"
os.makedirs(SAVE_FOLDER, exist_ok=True)

HEADERS = {
    "User-Agent": f"MyApp/1.0 ({EMAIL})"
}

def get_10k_filing_info(cik):
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    res = requests.get(url, headers=HEADERS)
    data = res.json()
    
    info = []
    for i, form in enumerate(data["filings"]["recent"]["form"]):
        if form == "10-K":
            acc_num = data["filings"]["recent"]["accessionNumber"][i]  # e.g., 0000320193-23-000106
            acc_num_clean = acc_num.replace("-", "")  # e.g., 000032019323000106
            info.append((acc_num_clean, acc_num))  # (folder, file)
    return info

def download_txt_file(folder_acc, file_acc):
    url = f"https://www.sec.gov/Archives/edgar/data/{CIK}/{folder_acc}/{file_acc}.txt"
    res = requests.get(url, headers=HEADERS)
    if res.status_code == 200:
        file_path = os.path.join(SAVE_FOLDER, f"{file_acc}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(res.text)
        print(f"Downloaded: {file_path}")
    else:
        print(f"Failed to download: {file_acc} (status code: {res.status_code})")

# === MAIN ===
filings = get_10k_filing_info(CIK)

for folder_acc, file_acc in filings:
    try:
        download_txt_file(folder_acc, file_acc)
        time.sleep(0.5)
    except Exception as e:
        print(f"Error downloading {file_acc}: {e}")


Downloaded: apple_10k_txts/0000320193-24-000123.txt
Downloaded: apple_10k_txts/0000320193-23-000106.txt
Downloaded: apple_10k_txts/0000320193-22-000108.txt
Downloaded: apple_10k_txts/0000320193-21-000105.txt
Downloaded: apple_10k_txts/0000320193-20-000096.txt
Downloaded: apple_10k_txts/0000320193-19-000119.txt
Downloaded: apple_10k_txts/0000320193-18-000145.txt
Downloaded: apple_10k_txts/0000320193-17-000070.txt
Downloaded: apple_10k_txts/0001628280-16-020309.txt
Downloaded: apple_10k_txts/0001193125-15-356351.txt
Downloaded: apple_10k_txts/0001193125-14-383437.txt


In [3]:
import os

# Define directories
txt_folder = 'apple_10k_txts'
html_folder = 'apple_10k_html'

# Create output folder if it doesn't exist
os.makedirs(html_folder, exist_ok=True)

# Process all .txt files
for filename in os.listdir(txt_folder):
    if filename.endswith('.txt'):
        txt_path = os.path.join(txt_folder, filename)
        html_filename = filename.replace('.txt', '.html')
        html_path = os.path.join(html_folder, html_filename)

        # Read .txt (HTML content)
        with open(txt_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Write as .html
        with open(html_path, 'w', encoding='utf-8') as html_file:
            html_file.write(html_content)

        print(f"Converted: {filename} → {html_filename}")

print("All conversions complete.")


Converted: 0000320193-19-000119.txt → 0000320193-19-000119.html
Converted: 0001628280-16-020309.txt → 0001628280-16-020309.html
Converted: 0000320193-17-000070.txt → 0000320193-17-000070.html
Converted: 0001193125-14-383437.txt → 0001193125-14-383437.html
Converted: 0000320193-24-000123.txt → 0000320193-24-000123.html
Converted: 0000320193-21-000105.txt → 0000320193-21-000105.html
Converted: 0000320193-22-000108.txt → 0000320193-22-000108.html
Converted: 0000320193-23-000106.txt → 0000320193-23-000106.html
Converted: 0001193125-15-356351.txt → 0001193125-15-356351.html
Converted: 0000320193-20-000096.txt → 0000320193-20-000096.html
Converted: 0000320193-18-000145.txt → 0000320193-18-000145.html
All conversions complete.


### sample to process first document

In [66]:
import re
from bs4 import BeautifulSoup

import unicodedata

def normalize_text(s):
    return unicodedata.normalize("NFKC", s).replace('\xa0', ' ').strip()

def extract_clean_text(file_path):
    # Step 1: Load and clean up the HTML
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Step 2: Convert to plain text
    text = soup.get_text(separator='\n')

    # Step 3: Skip everything until the real content begins (after TOC)
    # We'll skip everything until the second "Item " occurrence
    item_matches = list(re.finditer(r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\.", text, flags=re.IGNORECASE))
    # item_pattern = r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\."

    # Use only the real content (skip the TOC)
    content_start = item_matches[1].start() if len(item_matches) > 1 else 0
    main_text = text[content_start:]

    # Step 4: Extract items and their contents
    pattern = re.compile(r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\.", re.IGNORECASE)
    matches = list(pattern.finditer(main_text))

    extracted_clean = {}

    for i in range(len(matches)):
        item_title = normalize_text(matches[i].group(1))
        if item_title.lower() == "item 16":
            continue  # Skip Item 16
        start = matches[i].end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(main_text)
        content = main_text[start:end].strip()
        
        # Clean content: normalize spaces and remove excessive line breaks
        cleaned_content = re.sub(r'\s+', ' ', content)
        
        extracted_clean[item_title] = cleaned_content

    return extracted_clean


extracted_clean = extract_clean_text('apple_10k_html/0000320193-17-000070.html')

for item, content in list(extracted_clean.items())[:3]:
    print(f"\n{item}")
    print("=" * len(item))
    print(content[:2000]) 


Item 1A
Risk Factors The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K. The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results. Any of these factors, in whole or in part, could materially and adversely a

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline

# Load summarizer
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY", device=-1)
# knkarthick/MEETING_SUMMARY
# google/pegasus-cnn_dailymail
# sshleifer/distilbart-cnn-12-6

def summarize_text(text, max_len=100, min_len=40):
    text = text[:4000]  # Prevent exceeding token limit
    return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']

def recursive_summarize(text, max_chunk_chars=3500):
    """Break long text into chunks and summarize, then summarize summaries."""
    import textwrap
    if len(text) <= max_chunk_chars:
        return summarize_text(text)
    
    # Split into smaller chunks
    chunks = textwrap.wrap(text, max_chunk_chars)
    chunk_summaries = [summarize_text(chunk) for chunk in chunks]
    
    # Combine and summarize again
    final_input = " ".join(chunk_summaries)
    return summarize_text(final_input)

def chunk_10k_sections(parsed_sections: dict, company: str, year: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    all_chunks = []

    for section_title, section_text in parsed_sections.items():
        normalized_title = section_title.upper().replace(".", "").replace("ITEM", "Item").strip()
        section_docs = splitter.create_documents([section_text])

        for i, doc in enumerate(section_docs):
            doc.metadata = {
                "company": company,
                "year": year,
                "section": normalized_title,
                "chunk_index": i
            }
            all_chunks.append(doc)

    return all_chunks

def summarize_10k_chunks(all_chunks: list, item = []):
    section_summaries = {}

    # Group chunks by section
    sections = {}
    for doc in all_chunks:
        section = doc.metadata["section"]
        if section not in sections:
            sections[section] = []
        sections[section].append(doc)

    for section, docs in sections.items():
        chunk_summaries = []

        for doc in docs:
            i = doc.metadata["chunk_index"]
            if i % 3 == 1:
                chunk_summary = summarize_text(doc.page_content)
                doc.metadata["chunk_summary"] = chunk_summary
                chunk_summaries.append(chunk_summary)
                print(chunk_summary)

        combined_summary_text = " ".join(chunk_summaries)
        section_summary = recursive_summarize(combined_summary_text)

        if "Maybe you might find this helpful." in section_summary:
            for doc in docs:
                if doc.metadata['chunk_index'] == 0:
                    section_summary = doc.page_content[:30]
                    break

        section_summaries[section] = section_summary
        print(section_summary)
        print("__________________________________________")

    return section_summaries



Device set to use cpu


In [None]:
# chunks = chunk_10k_sections(extracted_clean, company="Apple", year="2017")
# print(len(chunks))

# section_summaries = summarize_10k_chunks(chunks)
# section_summaries

# import json

# with open('section_summaries.json', 'w', encoding='utf-8') as f:
#     json.dump(section_summaries, f, ensure_ascii=False, indent=4)

370


# Vector Database Storage



In [51]:
# Basic Retrieval Method
retrieved_docs = vector_db.similarity_search("What color is the sky?")
retrieved_docs
print(retrieved_docs[0].metadata)

# Retrieval Method with extra filtering. Note that filtering happens after the retrieval process. If to combine both the metadata and also the content, we could consider using the weaviate vector database
retriever = vector_db.as_retriever(
    search_kwargs = {
        "filter" : {"company": "Apple",
            "year": "2017",
            "section": "Item 1A"},  # Only retrieve from "Item 1A Section"
        "k" : 15,
        "fetch_k": 100
    }
)

# WE should do the reranking.
result = retriever.invoke("What is the category of risk that carry the greatest risk to the company?")
filtered_docs = [doc.page_content for doc in result]


{'company': 'Apple', 'year': '2017', 'section': 'Item 1', 'chunk_index': 8}


# Retrieval System

In [26]:
import os
model_id = "meta-llama/Llama-3.3-70B-Instruct"
API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

In [None]:
api_key = os.environ["HUGGINGFACE_API_KEY"]
headers = {"Authorization": f"Bearer {api_key}"}

In [None]:
input = f"""Educate me on the university NUS"""
data = {"inputs": input}
response = requests.post(API_URL, headers=headers, json=data)

In [None]:
text  = response.json()[0]["generated_text"]

In [None]:
print(text[text.index(input) + len(input) + 1:])


The National University of Singapore (NUS) is a public research university located in Singapore. It is the flagship university of the country and one of the top universities in Asia. Here are some key points to know about NUS:

### History
- **Founded in 1905**: Originally established as the Raffles College, it later merged with the University of Malaya in 1962 to form the University of Singapore, which was renamed the National University of Singapore in 1980.
- **Heritage and Tradition**: NUS has a rich history and has played a significant role in Singapore's educational and economic development.

### Academics
- **Schools and Faculties**: NUS offers a wide range of undergraduate and graduate programs across various disciplines, including Arts and Social Sciences, Business, Computing, Dentistry, Design and Environment, Engineering, Law, Medicine, Music, Public Policy, Science, and more.
- **Rankings**: NUS consistently ranks among the top universities globally. It is frequently place

# Design a Multi-hop RAG and also the updating of the list


In [68]:
k = 10
query = "What are the revenue of Apple in 2017"
docs = vector_db.similarity_search(query, k= k)
context = "\n".join([doc.page_content for doc in docs])
print(context)

compared to 2015, primarily driven by a year-over-year decrease in iPhone net sales and the effect of weakness in most foreign currencies relative to the U.S. dollar, partially offset by an increase in Services. In April 2016, the Company announced an increase to its capital return program by raising the expected total size of the program from $200 billion to $250 billion through March 2018. This included increasing its share repurchase authorization from $140 billion to $175 billion and raising its quarterly dividend from $0.52 to $0.57 per share beginning in May 2016. During 2016, the Company spent $29.0 billion to repurchase shares of its common stock and paid dividends and dividend equivalents of $12.2 billion. Additionally, the Company issued $23.9 billion of U.S. dollar-denominated term debt and A$1.4 billion of Australian dollar-denominated term debt during 2016.Apple Inc. | 2017 Form 10-K | 22Sales DataThe following table shows net sales by operating segment and net sales and
c

In [None]:
""

In [None]:
# Retrieval Method with extra filtering. Note that filtering happens after the retrieval process. If to combine both the metadata and also the content, we could consider using the weaviate vector database
retriever = vector_db.as_retriever(
    search_kwargs = {
        "filter" : {"company": "Apple",
            "year": "2017",
            "section": "Item 1A"},  # Only retrieve from "Item 1A Section"
        "k" : k,
        "fetch_k": 100
    }
)

result = retriever.invoke("What is the category of risk that carry the greatest risk to the company?")
filtered_docs = [doc.page_content for doc in result]

In [None]:
# Ask the LLM to provide a feedback on the accuracy of the retrieval context and then made update to the list or rerun the retrieval process until find the corresponding section that contains it.
prompt = f"""You are given a user query and a set of retrieved documents. Determine whether the retrieved documents are relevant to answering the query.

Query: "{query}"

Retrieved Context:
{context}

Are the retrieved documents relevant to the query? Answer with "Yes" or "No", nothing else"""


In [None]:
# prompt: I would like to ask some queries such that using the code above, it would be able to retrieve the context from my database

def multi_hop_rag(query, vector_db, initial_k=5, max_hops=3):
    """
    Performs multi-hop retrieval augmented generation.

    Args:
        query: The user's query.
        vector_db: The vector database.
        initial_k: The initial number of documents to retrieve.
        max_hops: The maximum number of hops.

    Returns:
        A list of relevant documents.
    """

    retrieved_docs = vector_db.similarity_search(query, k=initial_k)
    retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])

    # Use the LLM to evaluate relevance
    prompt = f"""You are given a user query and a set of retrieved documents. Determine whether the retrieved documents are relevant to answering the query.

Query: "{query}"

Retrieved Context:
{retrieved_context}

Are the retrieved documents relevant to the query? Answer with "Yes" or "No", nothing else"""

    # Replace with your actual LLM call
    # For demonstration purposes, we will simulate the LLM response
    # In real scenario, call your LLM API here with the prompt
    llm_response = "Yes" #Simulate LLM response. Replace with an actual LLM call

    if llm_response.strip().lower() == "yes":
      return retrieved_docs

    hops = 1
    while llm_response.strip().lower() != "yes" and hops < max_hops:
        # Refine the query based on the retrieved context and the LLM's feedback
        # (You'll need to implement a query refinement strategy)
        refined_query = query #Replace with query refinement logic using LLM

        retrieved_docs = vector_db.similarity_search(refined_query, k=initial_k)
        retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])

        prompt = f"""
        
                You are given a user query and a set of retrieved documents. Determine whether the retrieved documents are relevant to answering the query.

                Query: "{query}"

                Retrieved Context:
                {retrieved_context}

                Are the retrieved documents relevant to the query? Answer with "Yes" or "No", nothing else
                
                """

        llm_response = "Yes" #Simulate LLM response. Replace with an actual LLM call
        hops += 1

    return retrieved_docs


# Example usage
query = "What are the major risks Apple faced in 2017?"
relevant_docs = multi_hop_rag(query, vector_db)

# Process the relevant documents (e.g., summarize, answer the question)
relevant_docs


[Document(id='31c33c86-7aee-4647-9b90-08c3a2d58260', metadata={'company': 'Apple', 'year': '2017', 'section': 'Item 1A', 'chunk_index': 43, 'chunk_summary': "Apple Inc.'s business may be disrupted by cyber attacks, such as thefts, break-ins, or other events. There may be losses or unauthorized access to confidential information, including personally identifiable information, that could subject the Company to significant reputational, financial, legal and operational consequences."}, page_content='or electronic break-ins, or other events or disruptions. System redundancy and other continuity measures may be ineffective or inadequate, and the Company’s business continuity and disaster recovery planning may not be sufficient for all eventualities. Such failures or disruptions could adversely impact the Company’s business by, among other things, preventing access to the Company’s online services, interfering with customer transactions or impeding the manufacturing and shipping of the Compa

In [None]:
from collections import defaultdict

def update_section_summaries(llm_output, relevant_docs, section_summaries):
    if llm_output.strip() != "Yes":
        return None  # Or skip / return empty

    new_summarisation = summarize_10k_chunks(relevant_docs)
    
    # Update the section_summaries dictionary
    for key, value in new_summarisation.items():
        if key in section_summaries:
            section_summaries[key] += " " + value  # Append new summary
        else:
            section_summaries[key] = value  # Add if doesn't exist yet
    

    return section_summaries


section_summaries = update_section_summaries(llm_output, relevant_docs, section_summaries)

In [64]:
section_summaries

# for key, value in section_summaries.items():
#     print(section_summaries[key])


section_summaries['Item 1B'].join(section_summaries['Item 1B'], " ")

TypeError: str.join() takes exactly one argument (2 given)

In [63]:
section_summaries['Item 1B']

'Unresolved Staff Comments None'

# Main Pipeline

### Create database

In [4]:
import re
from bs4 import BeautifulSoup
import unicodedata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
import html 
from bs4 import BeautifulSoup
import json
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import os
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import textwrap
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def normalize_text(s):
    return unicodedata.normalize("NFKC", s).replace('\xa0', ' ').strip()

def extract_clean_text(file_path):
    # Step 1: Load and clean up the HTML
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Step 2: Convert to plain text
    text = soup.get_text(separator='\n')

    # Step 3: Skip everything until the real content begins (after TOC)
    # We'll skip everything until the second "Item " occurrence
    item_matches = list(re.finditer(r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\.", text, flags=re.IGNORECASE))
    # item_pattern = r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\."

    # Use only the real content (skip the TOC)
    content_start = item_matches[1].start() if len(item_matches) > 1 else 0
    main_text = text[content_start:]

    # Step 4: Extract items and their contents
    pattern = re.compile(r"(Item\s+[0-9A-Za-z]+(?:[A-Z])?)\.", re.IGNORECASE)
    matches = list(pattern.finditer(main_text))

    extracted_clean = {}

    for i in range(len(matches)):
        item_title = normalize_text(matches[i].group(1))
        if item_title.lower() == "item 16":
            continue  # Skip Item 16
        start = matches[i].end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(main_text)
        content = main_text[start:end].strip()
        
        # Clean content: normalize spaces and remove excessive line breaks
        cleaned_content = re.sub(r'\s+', ' ', content)
        
        extracted_clean[item_title] = cleaned_content

    return extracted_clean

# extracted_clean = extract_clean_text('apple_10k_html/0000320193-17-000070.html')

# for item, content in list(extracted_clean.items())[:3]:
#     print(f"\n{item}")
#     print("=" * len(item))
#     print(content[:2000]) 


# Load summarizer
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY", device=-1)
# knkarthick/MEETING_SUMMARY
# google/pegasus-cnn_dailymail
# sshleifer/distilbart-cnn-12-6

def summarize_text(text, max_len=100, min_len=40):
    text = text[:4000]  # Prevent exceeding token limit
    return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']

def recursive_summarize(text, max_chunk_chars=3500):
    """Break long text into chunks and summarize, then summarize summaries."""
    if len(text) <= max_chunk_chars:
        return summarize_text(text)
    
    # Split into smaller chunks
    chunks = textwrap.wrap(text, max_chunk_chars)
    chunk_summaries = [summarize_text(chunk) for chunk in chunks]
    
    # Combine and summarize again
    final_input = " ".join(chunk_summaries)
    return summarize_text(final_input)

def chunk_10k_sections(parsed_sections: dict, company: str, year: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    all_chunks = []

    for section_title, section_text in parsed_sections.items():
        normalized_title = section_title.upper().replace(".", "").replace("ITEM", "Item").strip()
        section_docs = splitter.create_documents([section_text])

        for i, doc in enumerate(section_docs):
            doc.metadata = {
                "company": company,
                "year": year,
                "section": normalized_title,
                "chunk_index": i
            }
            all_chunks.append(doc)

    return all_chunks

def summarize_10k_chunks(all_chunks: list, item = []):
    section_summaries = {}

    # Group chunks by section
    sections = {}
    for doc in all_chunks:
        section = doc.metadata["section"]
        if section not in sections:
            sections[section] = []
        sections[section].append(doc)

    for section, docs in sections.items():
        chunk_summaries = []

        for doc in docs:
            i = doc.metadata["chunk_index"]
            if i % 3 == 1:
                chunk_summary = summarize_text(doc.page_content)
                doc.metadata["chunk_summary"] = chunk_summary
                chunk_summaries.append(chunk_summary)
                print(chunk_summary)

        combined_summary_text = " ".join(chunk_summaries)
        section_summary = recursive_summarize(combined_summary_text)

        if "Maybe you might find this helpful." in section_summary:
            for doc in docs:
                if doc.metadata['chunk_index'] == 0:
                    section_summary = doc.page_content[:30]
                    break

        section_summaries[section] = section_summary
        print(section_summary)
        print("__________________________________________")

    return section_summaries


Device set to use cpu


In [10]:
def add_documents(vector_db, chunks):
  vector_db.add_documents(chunks)
  return vector_db

def add_10_K(file_path, vector_db, company, year, section_summaries):

    extracted_clean = extract_clean_text(file_path)
    chunks = chunk_10k_sections(extracted_clean, company=company, year=year)


    # Collect chunks that belong to new sections (not yet summarized)
    summarise_this_chunks = [doc for doc in chunks if doc.metadata['section'] not in section_summaries]

    # If there are new sections, summarize and update the dict
    if summarise_this_chunks:
        new_summaries = summarize_10k_chunks(summarise_this_chunks)
        section_summaries.update({k: v for k, v in new_summaries.items() if k not in section_summaries})

        with open('section_summaries.json', 'w', encoding='utf-8') as f:
          json.dump(section_summaries, f, ensure_ascii=False, indent=4)

    # Add all chunks to the vector DB
    vector_db = add_documents(vector_db, chunks)

    return vector_db, section_summaries

# add_10_K("apple_10k_html/0000320193-18-000145.html", vector_db, "Apple", "2018", section_summaries)

# # It's working usually as our database is working normally
# vector_db.index.ntotal

In [11]:
for i, filename in enumerate(os.listdir(html_folder)):
    if filename == '.DS_Store':
        continue

    file_path = os.path.join(html_folder, filename)
    

    if os.path.isfile(file_path):
        full_path = os.path.join(html_folder, filename)  # This is now the full path
        print(f"Working On: {full_path}")

        company = full_path.split('/')[0].split('_')[0]
        year = '20' + full_path.split('/')[-1].split('-')[1]

        if i == 0:
            ## initialise chunks and vector database and section_summaries
            print(f"This is a first file: {full_path}")
            
            extracted_clean = extract_clean_text(full_path)
            chunks = chunk_10k_sections(extracted_clean, company=company, year=year)
            
            # test
            # section_summaries = summarize_10k_chunks(chunks)

            # with open('section_summaries.json', 'w', encoding='utf-8') as f:
            #     json.dump(section_summaries, f, ensure_ascii=False, indent=4)

            # no need to run the summary
            with open('section_summaries.json', 'r', encoding='utf-8') as f:
                section_summaries = json.load(f)

            embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            vector_db = FAISS.from_documents(chunks, embeddings)

        else:
            add_10_K(full_path, vector_db, company, year, section_summaries)
            print(vector_db.index.ntotal)


Working On: apple_10k_html/0000320193-23-000106.html
This is a first file: apple_10k_html/0000320193-23-000106.html
Working On: apple_10k_html/0000320193-19-000119.html
544
Working On: apple_10k_html/0000320193-21-000105.html


Your max_length is set to 100, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


Disclosure Regarding Foreign J
__________________________________________
821
Working On: apple_10k_html/0000320193-24-000123.html


Your max_length is set to 100, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


The Company’s Information Security team coordinates with teams across the Company to prevent, respond to and manage security incidents. A dedicated Supplier Trust team manages information security risks the Company is exposed to through its supplier relationships.
The Company’s Information Security team coordinates with teams across the Company to prevent, respond to and manage security incidents. A dedicated Supplier Trust team manages information security risks the Company is exposed to through its supplier relationships.
__________________________________________
1074
Working On: apple_10k_html/0000320193-22-000108.html
1342
Working On: apple_10k_html/0000320193-20-000096.html
1642
Working On: apple_10k_html/0000320193-18-000145.html
1989
Working On: apple_10k_html/0000320193-17-000070.html
2359


In [12]:
vector_db

<langchain_community.vectorstores.faiss.FAISS at 0x2f64686d0>

In [23]:
import faiss
# Save vector_db to a local folder
vector_db.save_local("vector_db")


In [14]:
print(type(vector_db))

<class 'langchain_community.vectorstores.faiss.FAISS'>


In [21]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings  # or your original embedding model

# embedding_model = OpenAIEmbeddings()
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vector_db2 = FAISS.load_local(
    "faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)


In [22]:
vector_db2

<langchain_community.vectorstores.faiss.FAISS at 0x2f468a990>

## test


In [67]:

query = "What is the revenue of the company"
# Includes the Reranking documents technique: 

from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_documents(query, retrieved_docs, cross_encoder):
    # Prepare the inputs for the cross-encoder
    cross_encoder_inputs = [[query, doc.page_content] for doc in retrieved_docs]

    # Compute relevance scores
    relevance_scores = cross_encoder.predict(cross_encoder_inputs)

    # Attach scores to documents
    pairs_list = []
    for idx, doc in enumerate(retrieved_docs):
        pairs_list.append((doc, relevance_scores[idx]))

    # Sort documents by relevance score in descending order
    sorted_docs = sorted(pairs_list, key=lambda x: x[1], reverse=True)

    # Final output
    reranked_docs = [doc for doc, _ in sorted_docs]

    return reranked_docs

# reranked_docs = rerank_documents(query, docs, cross_encoder)

# print("\n\n".join([doc.page_content for doc in reranked_docs]))


def filtered_retrieval(query, vector_db, cross_encoder, filter, fetch_k = 100, k = 5):
    retriever = vector_db.as_retriever(
        search_kwargs = {
            "filter" : filter,
            "fetch_k" : fetch_k,
            "k" : k
        }
    )

    filtered_docs = retriever.invoke(query)
    reranked = rerank_documents(query, filtered_docs, cross_encoder)
    print(len(reranked))


    return reranked

x = ['Item 1A', 'Item 1']
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# filter = {"company": "apple",
#             "year": "2017",
#             "section": "Item 1A"}


for i in x:
    filter = {"company": "apple",
            "year": "2017",
            "section": i}
    "\n\n".join([doc.page_content for doc in filtered_retrieval(query, vector_db, cross_encoder, filter)])

    
# "\n\n".join([doc.page_content for doc in filtered_retrieval(query, vector_db, cross_encoder, filter)])

# temp = filtered_retrieval(query, vector_db, cross_encoder, filter)

# type(filtered_retrieval(query, vector_db, cross_encoder, filter))


IndexError: list index out of range

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = FAISS.load_local(
    "vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)



In [70]:
docs = vector_db.docstore._dict  # It's a dictionary {doc_id: Document}

for doc_id, doc in docs.items():
    print(f"ID: {doc_id}")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("----------")

ID: b1f1a776-71f0-448f-a04b-ec6287ed6e5e
Content: Risk Factors The Company’s business, reputation, results of operations, financial condition and stock price can be affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these risks materialize from time to time, the Company’s business, reputation, results of operations, financial condition and stock price can be materially and adversely affected. Because of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical trends to anticipate results or trends in future periods. This discussion of risk factors contains forward-looking statements. This section should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and 

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x2f7a6e460> >

In [43]:
# # Get all stored documents
# docs = vector_db.docstore._dict  # It's a dictionary {doc_id: Document}

# for doc_id, doc in docs.items():
#     print(f"ID: {doc_id}")
#     print(f"Content: {doc.page_content}")
#     print(f"Metadata: {doc.metadata}")
#     print("----------")
#     if (doc.metadata['company'] == 'apple')


In [46]:
from sentence_transformers import SentenceTransformer, util
import json
import torch

# Load your JSON file
with open('section_summaries.json', 'r') as f:
    section_summaries = json.load(f)

# Define your query
query = "Apple's strategy in global smartphone markets"

# Load a sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare data
items = list(section_summaries.items())
item_keys = [item[0] for item in items]
summaries = [item[1] for item in items]

# Compute embeddings
summary_embeddings = model.encode(summaries, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Compute cosine similarities
cosine_scores = util.cos_sim(query_embedding, summary_embeddings)[0]

# Get top 3 results
top_k = 3
top_results = torch.topk(cosine_scores, k=top_k)

print(f"\nTop {top_k} most similar sections to the query:\n")

for score, idx in zip(top_results.values, top_results.indices):
    print(f"Section: {item_keys[idx]}")
    print(f"Similarity Score: {score:.4f}")
    print(f"Summary: {summaries[idx]}\n")



Top 3 most similar sections to the query:

Section: Item 1A
Similarity Score: 0.6958
Summary: The business of Apple Inc. is highly competitive and involves a number of risks. The Company supplies computer parts to the market for personal computers and accessories. It sells its products through cellular network carriers, wholesalers, national and regional retailers and value-added resellers. It has a minority market share in the global smartphone market. The availability and quality of third-party digital content and applications for its iOS devices is dependent on the availability of developers and the right to make them

Section: Item 7
Similarity Score: 0.6261
Summary: Apple sold iPhones in the Americas, Europe, Greater China, Japan and Rest of Asia Pacific during the fiscal year 2017. Net sales increased by 6%, but unit sales fell by 11%. The company borrowed A$3.2 billion in the US dollar and A$1.4 billion in Australia dollars during 2017. In the first quarter of 2016, the Company

In [50]:
def find_items():
    # Load your JSON file
    with open('section_summaries.json', 'r') as f:
        section_summaries = json.load(f)

    # Define your query
    query = "Apple's strategy in global smartphone markets"

    # Load a sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare data
    items = list(section_summaries.items())
    item_keys = [item[0] for item in items]
    summaries = [item[1] for item in items]

    # Compute embeddings
    summary_embeddings = model.encode(summaries, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarities
    cosine_scores = util.cos_sim(query_embedding, summary_embeddings)[0]

    # Get top 3 results
    top_k = 3
    top_results = torch.topk(cosine_scores, k=top_k)

    top_matches = []
    for score, idx in zip(top_results.values, top_results.indices):
        top_matches.append(item_keys[idx])

    return top_matches


find_items()

['Item 1A', 'Item 7', 'Item 1']

In [None]:
import re
from bs4 import BeautifulSoup
import unicodedata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
import html 
from bs4 import BeautifulSoup
import json
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import os
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import textwrap
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
from sec_edgar_downloader import Downloader
import requests
import os
import time
import faiss
from sentence_transformers import CrossEncoder

from sentence_transformers import SentenceTransformer, util
import json
import torch
from sentence_transformers import CrossEncoder




###INITIALIZED
########################################################################

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = FAISS.load_local(
    "vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

with open('section_summaries.json', 'r', encoding='utf-8') as f:
    section_summaries = json.load(f)

query = "Apple's strategy in global smartphone markets"
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


########################################################################


###FUNCTIONS
########################################################################
def find_items(query, section_summaries):

    # Load a sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare data
    items = list(section_summaries.items())
    item_keys = [item[0] for item in items]
    summaries = [item[1] for item in items]

    # Compute embeddings
    summary_embeddings = model.encode(summaries, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarities
    cosine_scores = util.cos_sim(query_embedding, summary_embeddings)[0]

    # Get top 3 results
    top_k = 3
    top_results = torch.topk(cosine_scores, k=top_k)

    top_matches = []
    for score, idx in zip(top_results.values, top_results.indices):
        top_matches.append(item_keys[idx])

    
    return top_matches

def rerank_documents(query, retrieved_docs, cross_encoder):
    # Prepare the inputs for the cross-encoder
    cross_encoder_inputs = [[query, doc.page_content] for doc in retrieved_docs]

    # Compute relevance scores
    relevance_scores = cross_encoder.predict(cross_encoder_inputs)

    # Attach scores to documents
    pairs_list = []
    for idx, doc in enumerate(retrieved_docs):
        pairs_list.append((doc, relevance_scores[idx]))

    # Sort documents by relevance score in descending order
    sorted_docs = sorted(pairs_list, key=lambda x: x[1], reverse=True)

    # Final output
    reranked_docs = [doc for doc, _ in sorted_docs]

    return reranked_docs

def filtered_retrieval(query, vector_db, cross_encoder, filter, fetch_k = 100, k = 5):
    retriever = vector_db.as_retriever(
        search_kwargs = {
            "filter" : filter,
            "fetch_k" : fetch_k,
            "k" : k
        }
    )

    filtered_docs = retriever.invoke(query)
    print(filtered_docs)
    reranked = rerank_documents(query, filtered_docs, cross_encoder)

    return reranked

########################################################################



###SCRIPT
########################################################################

# similarity search between section_summaries and query, output item 
top_matches = find_items(query, section_summaries)
print(top_matches)

# retrieve topk chunks that is relevant to query in each item
all_relevant_chunks = []
for item in top_matches:
    filter = {"company": "apple",
                "year": "2017",
                "section": item}
    
    retrieved_docs = filtered_retrieval(query, vector_db, cross_encoder, filter)
    all_relevant_chunks.append(retrieved_docs)

print(all_relevant_chunks)



########################################################################





['Item 1A', 'Item 7', 'Item 1']
[Document(id='d293ed11-4241-4148-8edf-5761201f7f36', metadata={'company': 'apple', 'year': '2017', 'section': 'Item 1A', 'chunk_index': 7}, page_content='competitors seek to compete primarily through aggressive pricing and very low cost structures, and emulating the Company’s products and infringing on its intellectual property. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be adversely affected. Apple Inc. | 2017 Form 10-K | 8 The Company markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resour

In [72]:
all_relevant_chunks

[[Document(id='d293ed11-4241-4148-8edf-5761201f7f36', metadata={'company': 'apple', 'year': '2017', 'section': 'Item 1A', 'chunk_index': 7}, page_content='competitors seek to compete primarily through aggressive pricing and very low cost structures, and emulating the Company’s products and infringing on its intellectual property. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be adversely affected. Apple Inc. | 2017 Form 10-K | 8 The Company markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established har

In [76]:
# Flatten all chunks from all items
flattened_chunks = []
for chunk_list in all_relevant_chunks:
    for chunk in chunk_list:
        flattened_chunks.append(chunk)  # Assumes each chunk is a dict with at least 'content' key
flattened_chunks

[Document(id='d293ed11-4241-4148-8edf-5761201f7f36', metadata={'company': 'apple', 'year': '2017', 'section': 'Item 1A', 'chunk_index': 7}, page_content='competitors seek to compete primarily through aggressive pricing and very low cost structures, and emulating the Company’s products and infringing on its intellectual property. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be adversely affected. Apple Inc. | 2017 Form 10-K | 8 The Company markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established hard

In [88]:
from operator import itemgetter

# Create inputs for the cross encoder (query, chunk)
rerank_inputs = [(query, doc.page_content) for doc in flattened_chunks]

# Get similarity scores from cross_encoder
scores = cross_encoder.predict(rerank_inputs)

# Attach scores to the documents
scored_chunks = []
for doc, score in zip(flattened_chunks, scores):
    scored_chunks.append({
        "document": doc,
        "score": float(score)
    })

# Sort by score descending
reranked_chunks = sorted(scored_chunks, key=itemgetter("score"), reverse=True)


In [89]:
# Store top reranked chunk previews in a list variable
retrieved_context = [
    f"{i}. {item['document'].page_content.strip()}"
    for i, item in enumerate(reranked_chunks[:3], 1)
]

# Optionally print to verify
for line in retrieved_context:
    print(line)


1. competitors seek to compete primarily through aggressive pricing and very low cost structures, and emulating the Company’s products and infringing on its intellectual property. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be adversely affected. Apple Inc. | 2017 Form 10-K | 8 The Company markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software and digital content supplier relationships; and the Company has a minority market share in the global smartphone market. Additionally, t

In [2]:
def retrieve_process(retrieved_context, query):
    prompt = f"""
        You are given a user query and a set of retrieved documents. Determine whether the retrieved documents are relevant to answering the query.

        Query: "{query}"

        Retrieved Context:
        {retrieved_context}

        Are the retrieved documents relevant to the query? Answer with "Yes" or "No", nothing else
        """
        
    model_id = "meta-llama/Llama-3.3-70B-Instruct"
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

    # Remove the following line when push to github

    api_key = os.environ["HUGGINGFACE_API_KEY"]
    headers = {"Authorization": f"Bearer {api_key}"}

    data = {"inputs": prompt}
    response = requests.post(API_URL, headers=headers, json=data)

    text = response.json()[0]["generated_text"]

    return(text[text.index(prompt) + len(prompt) + 1:])



In [None]:
from collections import defaultdict

def update_section_summaries(llm_output, relevant_docs, section_summaries):
    if llm_output.strip() != "Yes":
        return None  # Or skip / return empty

    new_summarisation = summarize_10k_chunks(relevant_docs)
    
    # Update the section_summaries dictionary
    for key, value in new_summarisation.items():
        if key in section_summaries:
            section_summaries[key] += " " + value  # Append new summary
        else:
            section_summaries[key] = value  # Add if doesn't exist yet
    

    return section_summaries


section_summaries = update_section_summaries(llm_output, relevant_docs, section_summaries)

In [1]:
import re
from bs4 import BeautifulSoup
import unicodedata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
import html 
from bs4 import BeautifulSoup
import json
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import os
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import textwrap
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
from sec_edgar_downloader import Downloader
import requests
import os
import time
import faiss
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer, util
import torch
from sentence_transformers import SentenceTransformer, util
import json
import torch
from sentence_transformers import CrossEncoder
from collections import defaultdict

from operator import itemgetter
import re



###INITIALIZED
########################################################################

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = FAISS.load_local(
    "vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

with open('section_summaries.json', 'r', encoding='utf-8') as f:
    section_summaries = json.load(f)

query = "Apple's strategy in global smartphone markets"
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY", device=-1)


########################################################################


###FUNCTIONS
########################################################################
# def find_items(query, section_summaries, removed = []):

#     # Load a sentence transformer model
#     model = SentenceTransformer('all-MiniLM-L6-v2')

#     # Prepare data
#     items = list(section_summaries.items())
#     item_keys = [item[0] for item in items]
#     summaries = [item[1] for item in items]

#     # Compute embeddings
#     summary_embeddings = model.encode(summaries, convert_to_tensor=True)
#     query_embedding = model.encode(query, convert_to_tensor=True)

#     # Compute cosine similarities
#     cosine_scores = util.cos_sim(query_embedding, summary_embeddings)[0]

#     # Get top 3 results
#     top_k = 3
#     top_results = torch.topk(cosine_scores, k=top_k)

#     top_matches = []
#     for score, idx in zip(top_results.values, top_results.indices):
#         top_matches.append(item_keys[idx])

#     return top_matches



def find_items(query, section_summaries, removed=[]):
    # Load a sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare data
    items = list(section_summaries.items())
    
    # Filter out removed items
    filtered_items = [item for item in items if item[0] not in removed]
    
    item_keys = [item[0] for item in filtered_items]
    summaries = [item[1] for item in filtered_items]

    # Compute embeddings
    summary_embeddings = model.encode(summaries, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarities
    cosine_scores = util.cos_sim(query_embedding, summary_embeddings)[0]

    # Get top 3 results
    top_k = 3
    top_results = torch.topk(cosine_scores, k=top_k)

    top_matches = []
    for score, idx in zip(top_results.values, top_results.indices):
        top_matches.append(item_keys[idx])

    return top_matches

def rerank_documents(query, retrieved_docs, cross_encoder):
    # Prepare the inputs for the cross-encoder
    cross_encoder_inputs = [[query, doc.page_content] for doc in retrieved_docs]

    # Compute relevance scores
    relevance_scores = cross_encoder.predict(cross_encoder_inputs)

    # Attach scores to documents
    pairs_list = []
    for idx, doc in enumerate(retrieved_docs):
        pairs_list.append((doc, relevance_scores[idx]))

    # Sort documents by relevance score in descending order
    sorted_docs = sorted(pairs_list, key=lambda x: x[1], reverse=True)

    # Final output
    reranked_docs = [doc for doc, _ in sorted_docs]

    return reranked_docs

def filtered_retrieval(query, vector_db, cross_encoder, filter, fetch_k = 100, k = 5):
    retriever = vector_db.as_retriever(
        search_kwargs = {
            "filter" : filter,
            "fetch_k" : fetch_k,
            "k" : k
        }
    )

    filtered_docs = retriever.invoke(query)
    print(filtered_docs)
    reranked = rerank_documents(query, filtered_docs, cross_encoder)

    return reranked

# Load summarizer
def summarize_text(text, max_len=100, min_len=40):
    text = text[:4000]  # Prevent exceeding token limit
    return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']

def recursive_summarize(text, max_chunk_chars=3500):
    """Break long text into chunks and summarize, then summarize summaries."""
    if len(text) <= max_chunk_chars:
        return summarize_text(text)
    
    # Split into smaller chunks
    chunks = textwrap.wrap(text, max_chunk_chars)
    chunk_summaries = [summarize_text(chunk) for chunk in chunks]
    
    # Combine and summarize again
    final_input = " ".join(chunk_summaries)
    return summarize_text(final_input)

def summarize_10k_chunks(all_chunks: list, item = []):
    section_summaries = {}

    # Group chunks by section
    sections = {}
    for doc in all_chunks:
        section = doc.metadata["section"]
        if section not in sections:
            sections[section] = []
        sections[section].append(doc)

    for section, docs in sections.items():
        chunk_summaries = []

        for doc in docs:
            i = doc.metadata["chunk_index"]
            if i % 3 == 1:
                chunk_summary = summarize_text(doc.page_content)
                doc.metadata["chunk_summary"] = chunk_summary
                chunk_summaries.append(chunk_summary)
                print(chunk_summary)

        combined_summary_text = " ".join(chunk_summaries)
        section_summary = recursive_summarize(combined_summary_text)

        if "Maybe you might find this helpful." in section_summary:
            for doc in docs:
                if doc.metadata['chunk_index'] == 0:
                    section_summary = doc.page_content[:30]
                    break

        section_summaries[section] = section_summary
        print(section_summary)
        print("__________________________________________")

    return section_summaries

########################################################################



###SCRIPT
########################################################################

# # similarity search between section_summaries and query, output item 
# top_matches = find_items(query, section_summaries)
# print(top_matches)

# retrieve topk chunks that is relevant to query in each item
def retrieve_topk_chunks(top_matches):
    all_relevant_chunks = []
    for item in top_matches:
        filter = {"company": "apple",
                    "year": "2017",
                    "section": item}
        
        retrieved_docs = filtered_retrieval(query, vector_db, cross_encoder, filter)
        all_relevant_chunks.append(retrieved_docs)

    return all_relevant_chunks


# all_relevant_chunks = retrieve_topk_chunks(top_matches)

def flattened_chunks(all_relevant_chunks):
    # Flatten all chunks from all items
    flattened_chunks = []
    for chunk_list in all_relevant_chunks:
        for chunk in chunk_list:
            flattened_chunks.append(chunk)  # Assumes each chunk is a dict with at least 'content' key
    return flattened_chunks

# flattened_chunks = flattened_chunks(all_relevant_chunks)

def rerank2(flattened_chunks):

    # Create inputs for the cross encoder (query, chunk)
    rerank_inputs = [(query, doc.page_content) for doc in flattened_chunks]

    # Get similarity scores from cross_encoder
    scores = cross_encoder.predict(rerank_inputs)

    # Attach scores to the documents
    scored_chunks = []
    for doc, score in zip(flattened_chunks, scores):
        scored_chunks.append({
            "document": doc,
            "score": float(score)
        })

    # Sort by score descending
    reranked_chunks = sorted(scored_chunks, key=itemgetter("score"), reverse=True)
    return reranked_chunks

def generate_retrieve_context(reranked_chunks):
    retrieved_context = [
        f"{i}. {item['document'].page_content.strip()}"
        for i, item in enumerate(reranked_chunks[:3], 1)
    ]

    return retrieved_context


# reranked_chunks = rerank2(flattened_chunks)
# retrieved_context = generate_retrieve_context(reranked_chunks)
# print(retrieved_context)


def retrieve_process(retrieved_context, query):

    prompt = f"""
        You are given a user query and a set of retrieved documents. Determine whether the retrieved documents are relevant to answering the query.

        Query: "{query}"

        Retrieved Context:
        {retrieved_context}

        Instructions:
        - If the documents are not relevant, respond exactly: "No"
        - If the documents are relevant, respond exactly in the following format:
        Yes, from line X

        Where X is the line number in the Retrieved Context that contains the relevant information.

        Example:
        Yes, from line 2
    """
        
    model_id = "meta-llama/Llama-3.3-70B-Instruct"
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

    # Remove the following line when push to github
    # api_key = os.environ["HUGGINGFACE_API_KEY"]
    headers = {"Authorization": f"Bearer {api_key}"}

    data = {"inputs": prompt}
    response = requests.post(API_URL, headers=headers, json=data)

    text = response.json()[0]["generated_text"]

    return(text[text.index(prompt) + len(prompt) + 1:])

# llm_output = retrieve_process(retrieved_context, query)



def update_section_summaries(llm_output, reranked_chunks, section_summaries):
    # Extract the line number from llm_output
    match = re.search(r"line\s+(\d+)", llm_output.lower())
    if not match:
        return None  # Line number not found
    
    line_number = int(match.group(1)) - 1  # Convert to 0-based index

    # Safety check
    if line_number < 0 or line_number >= len(reranked_chunks):
        return None

    # Get the relevant document and summarize
    relevant_doc = [reranked_chunks[line_number]["document"]]
    new_summarisation = summarize_10k_chunks(relevant_doc)

    # Update the section_summaries dictionary
    for key, value in new_summarisation.items():
        if key in section_summaries:
            section_summaries[key] += " " + value
        else:
            section_summaries[key] = value
    
    with open('section_summaries.json', 'w', encoding='utf-8') as f:
          json.dump(section_summaries, f, ensure_ascii=False, indent=4)

    return section_summaries


# updated_section_summaries = update_section_summaries(llm_output, reranked_chunks, section_summaries)





########################################################################



# similarity search between section_summaries and query, output item 
top_matches = find_items(query, section_summaries)
print(top_matches)

all_relevant_chunks = retrieve_topk_chunks(top_matches)
print(all_relevant_chunks)


flattened_chunks = flattened_chunks(all_relevant_chunks)

reranked_chunks = rerank2(flattened_chunks)
retrieved_context = generate_retrieve_context(reranked_chunks)
print(retrieved_context)



llm_output = retrieve_process(retrieved_context, query)
print(llm_output)

# updated_section_summaries = update_section_summaries(llm_output, reranked_chunks, section_summaries)


  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: 'section_summaries.json'

In [93]:
llm_output = "Yes, from line 1"

updated_section_summaries = update_section_summaries(llm_output, reranked_chunks, section_summaries)


Your max_length is set to 100, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Apple Inc. markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. The Company has a minority market share in the global smartphone market.
Apple Inc. markets certain mobile communication and media devices based on the iOS mobile operating system and also markets related services, including third-party digital content and applications. It has a minority market share in the global smartphone market.
__________________________________________


In [None]:
# similarity search between section_summaries and query, output item 

all_items = section_summaries.keys()

def pipeline():
    top_matches = find_items(query, section_summaries)
    print(top_matches)

    all_relevant_chunks = retrieve_topk_chunks(top_matches)
    print(all_relevant_chunks)


    flattened_chunks = flattened_chunks(all_relevant_chunks)

    reranked_chunks = rerank2(flattened_chunks)
    retrieved_context = generate_retrieve_context(reranked_chunks)
    print(retrieved_context)


    llm_output = retrieve_process(retrieved_context, query)
    print(llm_output)

    llm_output = llm_output.strip()

    if not llm_output.lower().startswith("yes"):
        
        return None  # Skip if not relevant
    else:
        update_section_summaries(llm_output, reranked_chunks, section_summaries)


In [101]:
all_items = section_summaries.keys()
all_items

dict_keys(['Item 1A', 'Item 1B', 'Item 2', 'Item 3', 'Item 4', 'Item 5', 'Item 6', 'Item 7', 'Item 7A', 'Item 8', 'Item 9', 'Item 9A', 'Item 9B', 'Item 10', 'Item 11', 'Item 14', 'Item 15', 'Item 1', 'Item 12', 'Item 13', 'Item 9C', 'Item 1C'])

In [100]:
all_items.remove(top_matches)

AttributeError: 'dict_keys' object has no attribute 'remove'