In [1]:
# Wikipedia article titles to fetch. Each title is used as the API query,
# as the file name data/<title>.txt, and as the doc_id of the loaded document.
wiki_titles = ["Toronto", "Seattle", "Chicago", "Boston", "Houston"]
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

import nest_asyncio

nest_asyncio.apply()

from pathlib import Path

import requests

# Download the plain-text extract of every article in wiki_titles to data/<title>.txt.
data_path = Path("data")
# Create the output directory once, up front; exist_ok keeps re-runs idempotent.
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        # requests has no default timeout; without one a stalled connection hangs the cell.
        timeout=30,
    )
    # Surface HTTP errors here instead of failing later while parsing an error body.
    http_response.raise_for_status()
    response = http_response.json()

    # The "pages" mapping is keyed by page id; a single title was requested, so take the first entry.
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    # Explicit UTF-8 so the written bytes do not depend on the platform's locale default.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)
        

# Load all wiki documents
# Read each downloaded article back as a document and tag it with its title.
city_docs = []
for wiki_title in wiki_titles:
    docs = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"]).load_data()
    # Use the article title as the id of the first loaded document.
    docs[0].doc_id = wiki_title
    city_docs += docs

In [44]:
from llama_index.llms.fireworks import Fireworks
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.fireworks import FireworksEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.llms import ChatMessage
import os
from dotenv import load_dotenv


# Load settings from a .env file so the os.getenv(...) lookups below can see them.
load_dotenv()

# Provider identifiers that the configured LLM / embedding type is compared against.
OPENAI = 'openai'
FIREWORKS = 'fireworks'


def get_llm():
    """Construct the LLM selected by the ``LLM_TYPE`` environment variable.

    ``LLM_TYPE`` falls back to ``OPENAI`` when unset. The model name and API
    key are read from ``OPENAI_MODEL`` / ``OPENAI_API_KEY`` or from
    ``FIREWORKS_LLM`` / ``FIREWORKS_API_KEY`` respectively.

    Returns:
        An ``OpenAI`` or ``Fireworks`` LLM instance.

    Raises:
        ValueError: if ``LLM_TYPE`` matches neither ``OPENAI`` nor ``FIREWORKS``.
    """
    provider = os.getenv("LLM_TYPE", OPENAI)

    if provider == OPENAI:
        return OpenAI(
            model=os.getenv("OPENAI_MODEL"),
            api_key=os.getenv("OPENAI_API_KEY"),
        )

    if provider == FIREWORKS:
        return Fireworks(
            model=os.getenv("FIREWORKS_LLM"),
            api_key=os.getenv("FIREWORKS_API_KEY"),
        )

    raise ValueError(f"Unknown LLM type: {provider}")


def get_embed_model():
    """Construct the embedding model selected by ``EMBED_MODEL_TYPE``.

    The API key and batch size are read from ``OPENAI_API_KEY`` /
    ``OPENAI_EMBED_BATCH_SIZE`` or ``FIREWORKS_API_KEY`` /
    ``FIREWORKS_EMBED_BATCH_SIZE``; the batch size defaults to 16.

    Returns:
        An ``OpenAIEmbedding`` or ``FireworksEmbedding`` instance, or ``None``
        when ``EMBED_MODEL_TYPE`` is not set (the previous behaviour).

    Raises:
        ValueError: if ``EMBED_MODEL_TYPE`` is set to an unrecognised value,
            mirroring how ``get_llm`` treats an unknown ``LLM_TYPE``.
    """
    embed_type = os.getenv("EMBED_MODEL_TYPE")
    if embed_type == OPENAI:
        return OpenAIEmbedding(
            api_key=os.getenv("OPENAI_API_KEY"),
            # os.getenv returns a str whenever the variable is set; cast so the
            # batch size is an int in both the configured and the default case.
            embed_batch_size=int(os.getenv("OPENAI_EMBED_BATCH_SIZE", 16)),
        )
    elif embed_type == FIREWORKS:
        return FireworksEmbedding(
            api_key=os.getenv("FIREWORKS_API_KEY"),
            embed_batch_size=int(os.getenv("FIREWORKS_EMBED_BATCH_SIZE", 16)),
        )
    elif embed_type:
        # A configured but misspelled provider should fail loudly, not silently yield None.
        raise ValueError(f"Unknown embedding model type: {embed_type}")
    return None


def get_llm_response(query: str, system_prompt: str = None, llm=None):
    """Send ``query`` to an LLM as a single chat turn and return the reply text.

    Args:
        query: Content of the user message.
        system_prompt: Optional system message placed before the user message;
            omitted when falsy.
        llm: LLM to call; when ``None`` one is built with ``get_llm()``.

    Returns:
        The text of the first block of the response message, stripped of
        leading and trailing whitespace.
    """
    chat_llm = get_llm() if llm is None else llm

    messages = []
    if system_prompt:
        messages.append(ChatMessage(role="system", content=system_prompt))
    messages.append(ChatMessage(role="user", content=query))

    reply = chat_llm.chat(messages)
    return reply.message.blocks[0].text.strip()

In [17]:
from llama_index.core import get_response_synthesizer
from llama_index.core.schema import Document
from settings import (
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    chroma_data_dir    
)

from llama_index.core import (
    StorageContext, 
    load_index_from_storage
)


def get_nodes_from_documents(data: str):
    """Split ``data`` into chunks and wrap each chunk in a ``Document``.

    The text is split with a ``SentenceSplitter`` configured from
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``; the number of chunks is printed.

    Returns:
        A list with one ``Document`` per chunk, in split order.
    """
    chunks = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    ).split_text(data)
    print("Total number of documents generated:", len(chunks))

    chunk_documents = []
    for chunk in chunks:
        chunk_documents.append(Document(text=chunk))
    return chunk_documents

def create_index_from_document(
    data: str,
    index_name: str = 'default',
    response_mode="tree_summarize"
):
    """Build a ``DocumentSummaryIndex`` from raw text and persist it to disk.

    Args:
        data: Full text to index; it is chunked by ``get_nodes_from_documents``.
        index_name: Sub-directory of ``chroma_data_dir`` to persist into.
        response_mode: Response mode handed to ``get_response_synthesizer``.

    Returns:
        The newly built index.
    """
    chunk_documents = get_nodes_from_documents(data)
    synthesizer = get_response_synthesizer(
        response_mode=response_mode,
        use_async=True,
    )
    index = DocumentSummaryIndex.from_documents(
        documents=chunk_documents,
        llm=get_llm(),
        response_synthesizer=synthesizer,
        show_progress=True,
    )

    persist_dir = f"{chroma_data_dir}/{index_name}"
    print(f"Persisting index to {persist_dir}")
    index.storage_context.persist(persist_dir)
    return index


def get_vector_index(
    index_name: str,
) -> DocumentSummaryIndex:
    """Load the index persisted under ``chroma_data_dir/index_name``.

    Args:
        index_name: Sub-directory of ``chroma_data_dir`` the index was persisted to.

    Returns:
        The index returned by ``load_index_from_storage``.
    """
    persist_dir = f"{chroma_data_dir}/{index_name}"
    return load_index_from_storage(
        StorageContext.from_defaults(persist_dir=persist_dir)
    )

In [26]:
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexLLMRetriever,
    DocumentSummaryIndexEmbeddingRetriever
)
from llama_index.core.query_engine import RetrieverQueryEngine


# Top-k passed to the retrievers in get_response (similarity_top_k / choice_top_k).
SIMILARITY_TOP_K = 2


def get_summary_brute_force(data: str):
    """Unfinished stub: chunks ``data`` and then returns ``None``.

    Only the chunking step exists so far (which prints the chunk count);
    no summary is produced. TODO: implement or remove.
    """
    get_nodes_from_documents(data)
    return None
    


def get_response(
    query: str,
    index_name: str,
    retrieval_mode: str = 'embed',
    response_mode="tree_summarize"
):
    """Answer ``query`` against the persisted document-summary index ``index_name``.

    Args:
        query: Question to run through the query engine.
        index_name: Name of the persisted index, loaded via ``get_vector_index``.
        retrieval_mode: ``'embed'`` to retrieve with
            ``DocumentSummaryIndexEmbeddingRetriever`` or ``'llm'`` to retrieve
            with ``DocumentSummaryIndexLLMRetriever``.
        response_mode: Response mode handed to ``get_response_synthesizer``.

    Returns:
        The response object returned by ``RetrieverQueryEngine.query``.

    Raises:
        ValueError: if ``retrieval_mode`` is neither ``'embed'`` nor ``'llm'``.
    """
    doc_summary_index = get_vector_index(index_name)
    if retrieval_mode == 'embed':
        retriever = DocumentSummaryIndexEmbeddingRetriever(
            index=doc_summary_index,
            similarity_top_k=SIMILARITY_TOP_K,
            embed_model=get_embed_model(),
        )
    elif retrieval_mode == 'llm':
        retriever = DocumentSummaryIndexLLMRetriever(
            index=doc_summary_index,
            # Pass the LLM by keyword: the second positional parameter of this
            # retriever is choice_select_prompt, so a positional LLM would be
            # bound as the prompt instead of as the model.
            llm=get_llm(),
            choice_top_k=SIMILARITY_TOP_K,
        )
    else:
        raise ValueError(f"Unknown retrieval mode: {retrieval_mode}")

    response_synthesizer = get_response_synthesizer(
        response_mode=response_mode
    )

    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
    )

    # Run the query through retrieval + synthesis.
    response = query_engine.query(query)
    return response

In [11]:
# Read the downloaded Toronto article. The context manager closes the file
# handle deterministically (a bare open(...).read() leaves that to the garbage
# collector), and the explicit encoding avoids depending on the locale default.
with open("data/Toronto.txt", encoding="utf-8") as fp:
    text_data = fp.read()
text_data

'Toronto is the most populous city in Canada and the capital city of the Canadian province of Ontario. With a population of 2,794,356 in 2021, it is the fourth-most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. As of 2024, the CMA had an estimated population of 7,106,379. Toronto is an international centre of business, finance, arts, sports, and culture and is one of the most multicultural and cosmopolitan cities in the world.\nIndigenous peoples have travelled through and inhabited the Toronto area, located on a broad sloping plateau interspersed with rivers, deep ravines, and urban forest, for more than 10,000 years. After the broadly disputed Toronto Purchase, when the Mississauga surrendered the area to the British Crown, the British established the town of York in 1793 and la

In [12]:
# Name under which the index is persisted to / loaded from (a sub-directory of chroma_data_dir).
index_name = "afreen"

In [21]:
# Build the document-summary index for the Toronto text and persist it under index_name.
create_index_from_document(
    data=text_data, 
    index_name=index_name
)

  from .autonotebook import tqdm as notebook_tqdm


Total number of documents generated: 5


Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 66.79it/s]
Summarizing documents:   0%|          | 0/5 [00:00<?, ?it/s]

current doc id: 42a4efc9-1a37-4145-9fd2-b1a805a70043


Summarizing documents:  20%|██        | 1/5 [00:03<00:12,  3.05s/it]

current doc id: ac37fcad-c398-4f0a-ac2b-e22d08a606ee


Summarizing documents:  40%|████      | 2/5 [00:05<00:08,  2.68s/it]

current doc id: 165a944c-bddd-4020-b917-a14e9847e63d


Summarizing documents:  60%|██████    | 3/5 [00:08<00:05,  2.63s/it]

current doc id: 90fed31d-f47e-4c55-b246-1fe4b15e08ba


Summarizing documents:  80%|████████  | 4/5 [00:09<00:02,  2.20s/it]

current doc id: 00e89d99-40d6-4534-8f46-906499a51f3d


Summarizing documents: 100%|██████████| 5/5 [00:11<00:00,  2.37s/it]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00,  5.13it/s]

Persisting index to chroma_db_data/afreen





<llama_index.core.indices.document_summary.base.DocumentSummaryIndex at 0x1346b59a0>

In [39]:
# Ask the persisted index for a whole-document summary (default 'embed' retrieval mode).
query = "Provide a summary of the entire document"
response = get_response(query, index_name)

In [33]:
# Reload the persisted index so its stores can be inspected directly.
index = get_vector_index(index_name)

In [None]:
# Inspect the document hashes held by the reloaded index's docstore.
index.docstore.get_all_document_hashes()

In [45]:
# Baseline: send the entire article as one user message and ask for a summary in a single LLM call.
get_llm_response(text_data, system_prompt="You are a helpful AI and you can provide a summary of the document")

"Toronto is the most populous city in Canada and the capital city of the province of Ontario. With a population of 2,794,356 in 2021, it is the fourth-most populous city in North America. The city is an international center of business, finance, arts, sports, and culture and is one of the most multicultural and cosmopolitan cities in the world. It is located on a broad sloping plateau interspersed with rivers, deep ravines, and urban forest. The diverse population of Toronto reflects its current and historical role as an important destination for immigrants to Canada, with about half of its residents born outside of Canada and over 200 ethnic origins represented among its inhabitants. The city is known for its many skyscrapers and high-rise buildings, in particular the CN Tower, the tallest freestanding structure on land outside of Asia. It is home to the Toronto Stock Exchange, the headquarters of Canada's five largest banks, and the headquarters of many large Canadian and multination

In [47]:
from tqdm.auto import tqdm
# from app_utils.prompts import SUMMARIZATION_PROMPT

# System prompt used when summarizing a single chunk of the document.
# Every sentence ends with "\n" so that the implicitly concatenated literals
# do not run together (previously "...of the document.You MUST ...").
SUMMARIZATION_PROMPT = (
    "You are an expert in summarizing legal documents.\n"
    "You need to summarize the document such that the summary comprehensively covers the key points of the document.\n"
    "You MUST be very careful in summarizing the document as the summary will be used in the court of law.\n"
    "You should follow the guidelines below:\n"
    "1. The summary should not be too long or too short.\n"
    "2. The summary should be accurate and should not contain any false information.\n"
    "Your response MUST NOT contain any extra help text and should only contain information related to the document summary"
)

# System prompt used when merging the per-chunk summaries into one summary.
SUMMARIES_MERGING_PROMPT = (
    "You are an expert in summarizing legal documents.\n"
    "You need to merge the summaries of the document chunks into a single comprehensive summary.\n"
    "You MUST be very careful in merging the summaries as the summary will be used in the court of law.\n"
    "You should follow the guidelines below:\n"
    "1. The summary should not be too long or too short.\n"
    "2. The summary should be accurate and should not contain any false information.\n"
    "Your response MUST NOT contain any extra help text and should only contain information related to the document summary"
)

def doc_summarize_by_chunk(data: str):
    """Summarize ``data`` chunk by chunk, then merge the partial summaries.

    Each chunk produced by ``get_nodes_from_documents`` is summarized with
    ``SUMMARIZATION_PROMPT``; the summaries are joined with newlines and sent
    through ``SUMMARIES_MERGING_PROMPT`` in one final LLM call.

    Returns:
        The text returned by the merging call.
    """
    chunk_summaries = []
    for chunk_doc in tqdm(get_nodes_from_documents(data), desc="Summarizing Document"):
        chunk_summaries.append(
            get_llm_response(chunk_doc.text, system_prompt=SUMMARIZATION_PROMPT)
        )

    merged_input = "\n".join(chunk_summaries)
    return get_llm_response(merged_input, system_prompt=SUMMARIES_MERGING_PROMPT)

# Run the chunk-then-merge summarization on the Toronto article and show the result.
print(doc_summarize_by_chunk(text_data))

Total number of documents generated: 5


Summarizing Document: 100%|██████████| 5/5 [00:12<00:00,  2.60s/it]


Toronto is the most populous city in Canada and the capital of the province of Ontario, with a diverse population of over 2.7 million people representing over 200 ethnic origins. Established by the British in 1793, Toronto has a rich cultural heritage and is known for its role as a major center for business, finance, arts, sports, and culture. The city has a diverse economy with strengths in technology, design, financial services, life sciences, education, arts, fashion, aerospace, environmental innovation, food services, and tourism. Toronto is also a major center for postsecondary education, with several public universities and colleges.

Toronto is located on the northwestern shore of Lake Ontario, covering an area of 630 square kilometers, and has a robust public health system with 20 public hospitals and several specialized hospitals. The city is also home to a number of health-focused non-profit organizations.

Toronto has a thriving cultural scene, with more than 50 ballet and d

In [None]:
import PyPDF2
import fitz

def pdf_to_markdown(pdf_file):
    """Extract the text of every page of a PDF with PyPDF2, joined by newlines.

    NOTE: a second function named ``pdf_to_markdown`` is defined further down
    in this cell and replaces this one once the cell has run.

    Args:
        pdf_file: Value handed straight to ``PyPDF2.PdfReader``.

    Returns:
        The per-page ``extract_text()`` results joined with ``"\\n"``.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]
    return "\n".join(page_texts)


def pdf_to_markdown(pdf_path):
    """Extract the text of a PDF with PyMuPDF, one ``# Page N`` section per page.

    Args:
        pdf_path: Path of the PDF, handed to ``fitz.open``.

    Returns:
        A string in which each page contributes ``"# Page N\\n\\n"`` followed by
        the page text and a blank line. (Previously the function built this
        string but never returned it, so callers always received ``None``.)
    """
    sections = []

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            # Page numbers in the headers are 1-based.
            sections.append(f"# Page {page_num + 1}\n\n")
            sections.append(text + "\n\n")

    return "".join(sections)

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
f_name = '/Users/sali/Downloads/Systems Research and Behavioral Science - 2007 - Hindle - Developing a systemic textual analysis methodology based on the.pdf'
print(pdf_to_markdown(f_name))

In [7]:
import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
import os

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
f_name = '/Users/sali/Downloads/Systems Research and Behavioral Science - 2007 - Hindle - Developing a systemic textual analysis methodology based on the.pdf'

# The API key is read from the environment instead of being committed in the
# notebook. The key that was previously hard-coded here has been exposed and
# should be revoked. A missing variable raises KeyError before any request is made.
parser = LlamaParse(
    api_key=os.environ["LLAMA_CLOUD_API_KEY"],
    result_type="markdown"
)

# Parse the PDF into markdown documents.
documents = parser.load_data(f_name)

Started parsing the file under job_id 68e05825-ee28-4fa0-9333-59b4ce798bce


In [9]:
import pickle
# Cache the parsed documents on disk as a pickle.
with open('documents.pkl', 'wb') as pickle_file:
    pickle.dump(documents, pickle_file)

In [12]:
# Concatenate the text of all parsed documents, one per line group, and show it.
full_text = "\n".join(parsed_doc.text for parsed_doc in documents)
print(full_text)

# Developing a Systemic Textual Analysis Methodology Based on the Human Activity System Modelling Language of Soft Systems Methodology (SSM)

Giles A. Hindle*

Warwick Business School, University of Warwick, Coventry, CV4 7AL, UK

The paper introduces a textual analysis methodology which utilizes a systemic concept and a modified version of the systems modelling language from soft systems methodology (SSM). For ease of reference, the methodology is referred to as the systemic textual analysis methodology (STAM). Following trends in hermeneutics and linguistics, STAM balances objective and subjective aspects of the process of textual analysis by using the notion of a ‘recoverable’ textual analysis. STAM can be used in a wide range of Management Science and qualitative research projects, but is presented in this paper in a form suitable for the analysis of texts which constitute formal descriptions of processes or methodologies. Such analysis supports critical evaluation of texts in term

In [24]:
import openai
import streamlit as st

def get_llm_client():
    """Build an ``openai.OpenAI`` client for the provider in ``st.secrets['LLM_TYPE']``.

    For ``FIREWORKS`` the client is pointed at the Fireworks inference base URL.
    The API key is looked up in ``st.secrets``; previously the literal strings
    ``'OPENAI_API_KEY'`` / ``'FIREWORKS_API_KEY'`` were passed as the key itself.

    Returns:
        The configured ``openai.OpenAI`` client.

    Raises:
        ValueError: if the configured type is neither ``OPENAI`` nor ``FIREWORKS``.
    """
    llm_type = st.secrets['LLM_TYPE']
    if llm_type == OPENAI:
        client = openai.OpenAI(api_key=st.secrets['OPENAI_API_KEY'])
    elif llm_type == FIREWORKS:
        client = openai.OpenAI(
            base_url="https://api.fireworks.ai/inference/v1",
            api_key=st.secrets['FIREWORKS_API_KEY']
        )
    else:
        raise ValueError('Invalid LLM type')

    return client

def get_llm_model_id():
    """Return the model id configured for the provider in ``st.secrets['LLM_TYPE']``.

    Returns:
        ``st.secrets['OPENAI_MODEL']`` for ``OPENAI`` or
        ``st.secrets['FIREWORKS_LLM']`` for ``FIREWORKS``.

    Raises:
        ValueError: if the configured type is neither of the two.
    """
    provider = st.secrets['LLM_TYPE']

    if provider == OPENAI:
        return st.secrets['OPENAI_MODEL']
    if provider == FIREWORKS:
        return st.secrets['FIREWORKS_LLM']

    raise ValueError('Invalid LLM type')



def get_llm_response(messages):
    """Run a chat completion over ``messages`` and return the reply content.

    NOTE: this reuses the name of the earlier
    ``get_llm_response(query, system_prompt, llm)`` and replaces it once this
    cell runs; ``doc_summarize_by_chunk`` calls the earlier signature.

    Args:
        messages: Chat messages passed unchanged to ``chat.completions.create``.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = get_llm_client().chat.completions.create(
        model=get_llm_model_id(),
        messages=messages,
    )
    return completion.choices[0].message.content


In [22]:
# NOTE(review): the Fireworks key is passed without a base_url, so this client
# targets the OpenAI endpoint (see the recorded base_url output) — confirm this
# was only a debugging check.
client = openai.OpenAI(api_key=st.secrets['FIREWORKS_API_KEY'])

In [23]:
# Show which endpoint the client built above will send requests to.
client.base_url

URL('https://api.openai.com/v1/')

In [3]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

def extract_text_from_scanned_pdf(pdf_path):
    """OCR every embedded image of a PDF and return the recognised text.

    Each image found on each page is decoded with PIL and run through
    ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF, handed to ``fitz.open``.

    Returns:
        One ``"Page P, Image I:\\n<text>\\n"`` entry per image, joined with
        ``"\\n"``; an empty string when the PDF contains no images.
    """
    extracted_text = []

    # Context managers release the document and each decoded image promptly;
    # previously neither was ever closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            images = doc[page_num].get_images(full=True)  # Extract images

            for img_index, img_info in enumerate(images):
                # The first field of the image entry is the xref used to extract it.
                xref = img_info[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]

                # A separate name avoids rebinding the loop variable to a PIL image.
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    text = pytesseract.image_to_string(pil_image)  # OCR

                extracted_text.append(f"Page {page_num + 1}, Image {img_index + 1}:\n{text}\n")

    return "\n".join(extracted_text)

# Example usage
pdf_path = "/Users/sali/Downloads/Prem Prakash vs ED.pdf"  # Change to your PDF path
extracted_text = extract_text_from_scanned_pdf(pdf_path)

# Persist the OCR result next to the notebook.
with open("extracted_text.txt", "w", encoding="utf-8") as out_file:
    out_file.write(extracted_text)

print("OCR complete. Text saved to extracted_text.txt")

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.