In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_community.llms import Ollama

In [2]:
# Ollama instance
llm = Ollama(model="llama3:instruct", temperature=0, base_url="http://localhost:11434", verbose=False)

In [3]:
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from llmsherpa.readers import LayoutPDFReader

config = {
    "nlm_url": "http://localhost:5001/api/parseDocument?renderFormat=all&applyOcr=yes&useNewIndentParser=yes",
}

def parent_chain(node):
    """
    Returns the parent chain of the block consisting of all the parents of the block until the root.
    """
    chain = []
    parent = node.parent
    while parent:
        chain.append(parent)
        parent = parent.parent
    chain.reverse()
    return chain

def parent_text(node):
    """
    Returns the text of the parent chain of the block. This is useful for adding section information to the text.
    """
    chain = parent_chain(node)
    header_texts = []
    para_texts = []
    for p in chain:
        if p.tag == "header":
            header_texts.append(p.to_text()) 
        elif p.tag in ['list_item', 'para']:
            para_texts.append(p.to_text())
    text = "\n>\n".join(header_texts)
    if len(para_texts) > 0:
        text +="\n\n".join(para_texts)
    return text
   
def to_context_text(node, include_section_info=True):
    """
    This is a customised function largely derived from layout_reader.py of the llmsherpa library
    Returns the text of the block with section information. This provides context to the text.
    """
    text = "Metadata:\n"
    if include_section_info and parent_text(node) != "":
        text += parent_text(node) + "  >\n"
    text += "Content:\n"
    if node.tag in ['list_item', 'para']:
        text += node.to_text(include_children=True, recurse=True)
    elif node.tag == 'table':
        text += node.to_html()
    else:
        text += node.to_text(include_children=True, recurse=True)
    return text

def is_use_semantic_chunking(leaf_nodes):
    # Returns true if more than 50% of paragraphs have only one line
    count = 0
    num_paras = len([node for node in leaf_nodes if node.tag == "para"])
    
    for node in leaf_nodes:
        if node.tag == "para":
            txt = node.to_text().strip()

            lines = txt.split("\n")
            if len(lines) == 1:
                count += 1
                # print("Single line para:", txt)

    print("Number of single line para:", count)
    print("Number of paragraphs:", num_paras)
    return count > num_paras/2

def find_leaf_nodes(node, leaf_nodes=None):
    if leaf_nodes is None:
        leaf_nodes = []

    if len(node.children) == 0:
        leaf_nodes.append(node)
    for child in node.children:
        find_leaf_nodes(child, leaf_nodes)

    return leaf_nodes

def docParser(file_path, st=None, tenant_id=None, visualise_chunking=False, layout_reader=None, prev_leaf_node=None):
    print("run")
    layout_root = None

    try:
        reader = LayoutPDFReader(config["nlm_url"]) if layout_reader is None else layout_reader
        try:
            print("Reading file:", file_path)
            # Instead of giving by file_path, give as bytes instead
            with open(file_path, "rb") as f:
                file_in_bytes = f.read()
            parsed_doc = reader.read_pdf(file_path, contents=file_in_bytes)
        except FileNotFoundError:
            if st is not None:
                st.error(f"File {file_path} not found.")
            print(f"File {file_path} not found.")
            return []
        layout_root = parsed_doc.root_node
    except Exception as e:
        if st is not None:
            st.error("Error:", e)
        print("Error:", e)

    leaf_nodes = find_leaf_nodes(layout_root)
    if prev_leaf_node is not None:
        if prev_leaf_node == leaf_nodes:
            prev_leaf_node = leaf_nodes
            print("Same leaf nodes")
        else:
            print("Different leaf nodes")
            print(prev_leaf_node[-1].to_text())
            print(leaf_nodes[-1].to_text())

    if is_use_semantic_chunking(leaf_nodes):
        # Perform semantic chunking

        ## Load embeddings
        embeddings = OllamaEmbeddings(
            base_url=config["ollama_base_url"],	
            model=config["llm_name"]
        )
        ## Chunk documents using semantic chunker
        text_splitter = SemanticChunker(
            embeddings, breakpoint_threshold_type="percentile"
        )

        full_text = ""
        for child in layout_root.children:
            full_text += child.to_text(include_children=True, recurse=True) + "\n"

        docs = text_splitter.create_documents([full_text])
    else:
        print("Using llmsherpa")
        # Use chunks from llmsherpa
        # Each chunk is each leaf_node with to_context_text()
        collated_pg_content = [to_context_text(node) for node in leaf_nodes]
        print("REMOVE:", collated_pg_content[0])

        # Convert to Langchain documents
        docs = [LangchainDocument(page_content=collated_pg_content[i], metadata={key: leaf_nodes[i].block_json[key] for key in ('bbox', 'page_idx', 'level')} | {"file_path": file_path}) for i in range(len(collated_pg_content))]

    return (docs, prev_leaf_node)

Parse Document into LangChain Document format

In [4]:
docs_path = ["/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf", "/home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf"]	

In [5]:
def parseAllDocs(docs_path):

    docs = []
    prev_leaf_node = None
    for doc in docs_path:
        print(f"Parsing document: {doc}")
        new_docs, prev_leaf_node = docParser(doc, layout_reader=LayoutPDFReader(config["nlm_url"]), prev_leaf_node=prev_leaf_node)
        docs.extend(new_docs)
        print("Document parsed successfully.", end="\n\n")

    return docs

In [6]:
docs = parseAllDocs(docs_path)

Parsing document: /home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf
run
Reading file: /home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf
Number of single line para: 26
Number of paragraphs: 104
Using llmsherpa
REMOVE: Metadata:
Content:
amazon
Document parsed successfully.

Parsing document: /home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf
run
Reading file: /home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf
Number of single line para: 0
Number of paragraphs: 48
Using llmsherpa
REMOVE: Metadata:
Introduction  >
Content:
 
 
  Patrick   
 
:      [00:01:53]      Hello.
Welcome      to      Business      Breakdowns  
 
.      I\\\'myour      host      Patrick      O\\\'Shaughnessy  
 
.      Today      we\\\'ll      be      breaking  
 
  down      the      world\\\'s      largest      e-commerce      company   
 
,      Alibaba.
Alibaba      was      founded      in      1997      by      Jack      Maand      almost      20  

In [7]:
[doc for doc in docs if "state_of_union" in doc.metadata["file_path"]][:3]

[]

In [8]:
docs[:3]

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': [199.68, 73.22, 199.68, 85.22], 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': [49.92, 195.42, 525.6, 219.42], 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared  

Remove stopwords

In [9]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jianyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# pre-process documents
# docs_str = [doc.page_content for doc in docs]
docs_str = []
stop_words = set(stopwords.words('english'))

for doc in docs:
    # Remove stop words
    # Only referencing the page_content
    text = doc.page_content
    tokens = word_tokenize(text)
    temp_filtered_text = [word for word in tokens if word.casefold() not in stop_words]
    # ignore all instances of the word Metadata or Content
    temp_filtered_text = [word for word in temp_filtered_text if word.casefold() not in ["metadata", "content"]]
    filtered_text = " ".join(temp_filtered_text)
    docs_str.append(filtered_text)


In [11]:
docs_str[:3]

[': : amazon',
 ': AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS > : SEATTLE— ( BUSINESS WIRE ) April 30 , 2024—Amazon.com , Inc. ( NASDAQ : AMZN ) today announced financial results first quarter ended March 31 , 2024 .',
 ': AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS > : ¢ Net sales increased 13 % $ 143.3 billion first quarter , compared $ 127.4 billion first quarter 2023 . Excluding $ 0.2 billion unfavorable impact year-over-year changes foreign exchange rates throughout quarter , net sales increased 13 % compared first quarter 2023 .']

In [12]:
docs[:3]

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': [199.68, 73.22, 199.68, 85.22], 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': [49.92, 195.42, 525.6, 219.42], 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared  

Initialise [BERTopic Model](https://maartengr.github.io/BERTopic/algorithm/algorithm.html)

In [13]:
# Taken from tutorial
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  min_topic_size=3,                          # Minimum size of the topic
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
)


  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [14]:
topics, _ = topic_model.fit_transform(docs_str)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [15]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22,-1_aws_amazon_customers_customer,"[aws, amazon, customers, customer, availabilit...",[: Y/Y % > Certain Definitions > : e Reference...
1,0,48,0_alibaba_commerce_merchants_business,"[alibaba, commerce, merchants, business, compa...",[: Portable Lessons American Companies > : [ 0...
2,1,26,1_aws_amazon_ec2_cloud,"[aws, amazon, ec2, cloud, enterprise, capabili...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
3,2,25,2_income_amazon_sales_billion,"[income, amazon, sales, billion, finance, taxe...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,15,3_amazon_march_diluted_2023,"[amazon, march, diluted, 2023, quarter, 2024, ...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
5,4,15,4_financing_repayments_leases_finance,"[financing, repayments, leases, finance, incom...",[: Y/Y % > : ( 4 ) Free cash flow less princip...
6,5,9,5_sales_income_expenses_aws,"[sales, income, expenses, aws, expense, consol...",[: AWS > Consolidated > : Net sales Operating ...
7,6,9,6_stockholders_liabilities_equity_earnings,"[stockholders, liabilities, equity, earnings, ...",[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
8,7,7,7_amazon_2024_nasdaq_reports,"[amazon, 2024, nasdaq, reports, announces, fin...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
9,8,7,8_amazon_analysts_ranking_companies,"[amazon, analysts, ranking, companies, awards,...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...


In [16]:
type(topic_model.visualize_topics())

plotly.graph_objs._figure.Figure

In [17]:
def get_chunks_from_topic(topic_id, topic_model, docs_str, docs):
    """
    docs_str - list of strings
    docs - list of Langchain Document objects
    """
    temp = topic_model.get_document_info(docs_str)["Topic"] == topic_id
    # print(temp)
    df = topic_model.get_document_info(docs_str)[temp]
    # get list of all index
    doc_index = df.index.tolist()
    return [docs[i] for i in doc_index]
    # return df["Document"]

In [18]:
get_chunks_from_topic(0, topic_model, docs_str, docs)

[Document(page_content="Metadata:\nIntroduction  >\nContent:\n \n \n  Patrick   \n \n:      [00:01:53]      Hello.\nWelcome      to      Business      Breakdowns  \n \n.      I\\\\\\'myour      host      Patrick      O\\\\\\'Shaughnessy  \n \n.      Today      we\\\\\\'ll      be      breaking  \n \n  down      the      world\\\\\\'s      largest      e-commerce      company   \n \n,      Alibaba.\nAlibaba      was      founded      in      1997      by      Jack      Maand      almost      20      other      co-  \n \n  founders   \n \n,      as      an      online      bulletin      board      that      allowed      small      Chinese      manufacturers      to      tel      buyers      around      the      world      that      they  \n \n  were      open      for      business  \n \n.      Today,      Alibaba      operates      asprawling      ecosystem      of      businesses      that      includes      e-commerce  \n \n  marketplaces   \n \n,      cloud      computing   \n \n,   

In [19]:
# perform vector search on top 2 topics from the query
def get_chunks_from_query(query, topic_model, docs_str, docs):
    topics = topic_model.find_topics(query)
    print(topics)
    chunks = []
    
    # Select all topics with probability > 0.5
    for i in range(len(topics[1])):
        if topics[1][i] > 0.5:
            print("Topic Chosen:", topics[0][i], "Probability:", topics[1][i])
            chunks.extend(get_chunks_from_topic(topics[0][i], topic_model, docs_str, docs))

    if len(chunks) > 3:
        return chunks
    
    # If not enough chunks, get the top 3 topics
    for i in range(len(chunks), 3):
        print("Getting topic:", i)
        chunks.extend(get_chunks_from_topic(i, topic_model, docs_str, docs))
        if len(chunks) > 3:
            break

    return chunks

In [20]:
topic_model.find_topics("Alibaba")

([0, -1, 8, 1, 3], [0.4906037, 0.43695295, 0.40536618, 0.370501, 0.36816785])

In [21]:
# Define query here
query = "discuss the business model differences between amazon and alibaba"

In [22]:
langchain_docs = get_chunks_from_query(query, topic_model, docs_str, docs)

([8, 7, -1, 2, 1], [0.56528264, 0.5249168, 0.52279174, 0.50984883, 0.4957823])
Topic Chosen: 8 Probability: 0.56528264
Topic Chosen: 7 Probability: 0.5249168
Topic Chosen: -1 Probability: 0.52279174
Topic Chosen: 2 Probability: 0.50984883


In [23]:
# Remove bbox metadata from langchain_docs due to ChromaDB limitations
for doc in langchain_docs:
    doc.metadata["bbox"] = ""

In [24]:
len(langchain_docs)

61

In [25]:
langchain_docs[0].metadata

{'bbox': '',
 'page_idx': 1,
 'level': 1,
 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}

In [26]:
# identify number of non-alibaba documents in the list
len([doc for doc in langchain_docs if "alibaba" in doc.metadata["file_path"].lower()])

2

In [27]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma, FAISS

In [28]:

embeddings = OllamaEmbeddings(base_url = "http://localhost:11434", model = "llama3:instruct")
vectorstore = Chroma.from_documents(langchain_docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
# retriever = vectorstore.as_retriever()

In [29]:
topic_docs = retriever.invoke(query)

In [30]:
topic_docs

[Document(page_content='Metadata:\nY/Y      %\n>\nAmazon      Investor      Relations      Amazon      Public      Relations  >\nContent:\namazon-ir@amazon.com      amazon-pr@amazon.com', metadata={'bbox': '', 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf', 'level': 2, 'page_idx': 16}),
 Document(page_content='Metadata:\nContent:\nSource      :      2020      Annual      Report', metadata={'bbox': '', 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf', 'level': 0, 'page_idx': 2}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS¢\n \n   International      segment      sales      increased      10%      year-over-year      to      $31.9      billion,      or      increased      11%      excluding  \n \n  changes      in      foreign      exchange      rates.  >\nContent:\n*\n \n   AWS      segment      sales      increased      17%      year-over-year      to      $25.0      

In [31]:
print(len(langchain_docs))

61


In [32]:
topic_model_resp = llm.invoke(f"Answer the question: {query} with the following documents: {langchain_docs}")
print(topic_model_resp)

I've analyzed the provided documents and extracted relevant information to discuss the business model differences between Amazon and Alibaba.

**Amazon's Business Model:**

1. **Fulfillment by Amazon (FBA):** Amazon offers FBA, a service that allows third-party sellers to store their products in Amazon's warehouses and handle logistics, packaging, and shipping.
2. **E-commerce Platform:** Amazon operates an e-commerce platform, allowing customers to purchase products from various categories, including electronics, clothing, home goods, and more.
3. **Content Creation:** Amazon produces original content through its Prime Video service, offering a range of TV shows, movies, and original productions.
4. **Advertising:** Amazon generates revenue through targeted advertising on its platforms, such as Amazon.com, Amazon Alexa, and Amazon Fire TV.

**Alibaba's Business Model:**

1. **E-commerce Platforms:** Alibaba operates several e-commerce platforms, including Taobao Marketplace, Tmall, an

### Comparing with Documents retrieved WITHOUT topic modelling

In [33]:
# Proof that docs contain all the unfiltered documents by topic
print(set(doc.metadata["file_path"] for doc in docs))
print(len(docs))

{'/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf', '/home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf'}
189


In [34]:
assert langchain_docs != docs
assert len(langchain_docs) < len(docs)

In [35]:
# remove bbox
for doc in docs:
    doc.metadata["bbox"] = ""

In [36]:
vectorstore_before_topic_model = FAISS.from_documents(docs, embeddings)
retriever_before_topic_model = vectorstore_before_topic_model.as_retriever(search_kwargs={"k": 10})

KeyboardInterrupt: 

In [None]:
no_topic_docs = retriever_before_topic_model.invoke(query)

In [None]:
no_topic_docs

In [None]:
# See the number of document chunks referencing alibaba
len([doc for doc in docs if "alibaba" in doc.metadata["file_path"].lower()])

In [None]:
no_topic_resp = llm.invoke(f"Answer the question: {query} with the following documents: {no_topic_docs}")
print(no_topic_resp)

### Compare differences in document retrieval between topic modelling and no topic modelling

In [None]:
# Evaluate using ChatGPT.
# The printed output will be the input to ChatGPT

print(f"Help me compare the relevant document retrieval performance comparing between\
      using topic modelling and not using topic modelling. I am using the query '{query}'\
      to retrieve relevant documents from both topic model and non-topic model methods.\
      I need you to identify which set of documents are better in terms of relevance to the query.\
      TO make the test fair, you won't know which set of documents are from the topic model and which are not.\
      Please compare the two sets of documents and identify which set of documents are more relevant to the query.\n\
      Be succinct and NOT verbose, give me the better version straight away with a brief explanation why you think so.\n\
      Focus on how good the documents are at answering the query. Some documents may have more keywords but less relevant content and that's not what we want.\n\
      Version 1: Documents retrieved using topic model:\
        {topic_docs}\n\
        Version 2: Documents retrieved without using topic model:\
        {no_topic_docs}")

In [None]:
print(query)

Saving and loading model for future re-use

In [37]:
topic_model.save("topic_cache", serialization="pickle", save_embedding_model=False, save_ctfidf=True)



In [38]:
loaded_model = BERTopic.load("topic_cache", embedding_model=SentenceTransformer("all-MiniLM-L6-v2"))

In [39]:
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22,-1_aws_amazon_customers_customer,"[aws, amazon, customers, customer, availabilit...",[: Y/Y % > Certain Definitions > : e Reference...
1,0,48,0_alibaba_commerce_merchants_business,"[alibaba, commerce, merchants, business, compa...",[: Portable Lessons American Companies > : [ 0...
2,1,26,1_aws_amazon_ec2_cloud,"[aws, amazon, ec2, cloud, enterprise, capabili...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
3,2,25,2_income_amazon_sales_billion,"[income, amazon, sales, billion, finance, taxe...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,15,3_amazon_march_diluted_2023,"[amazon, march, diluted, 2023, quarter, 2024, ...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
5,4,15,4_financing_repayments_leases_finance,"[financing, repayments, leases, finance, incom...",[: Y/Y % > : ( 4 ) Free cash flow less princip...
6,5,9,5_sales_income_expenses_aws,"[sales, income, expenses, aws, expense, consol...",[: AWS > Consolidated > : Net sales Operating ...
7,6,9,6_stockholders_liabilities_equity_earnings,"[stockholders, liabilities, equity, earnings, ...",[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
8,7,7,7_amazon_2024_nasdaq_reports,"[amazon, 2024, nasdaq, reports, announces, fin...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
9,8,7,8_amazon_analysts_ranking_companies,"[amazon, analysts, ranking, companies, awards,...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...


In [53]:
len(loaded_model.get_topic_info())

11

In [55]:
loaded_model.get_topic_info()["Representation"][-1]

KeyError: -1

In [57]:
# Generate topic name
topic_name = {}
for i in range(len(loaded_model.get_topic_info())):
    rep_words = loaded_model.get_topic_info()["Representation"][i]
    topic_name[i-1] = llm.invoke(f"Generate a 2 to 3 word TOPIC NAME from the words it's represented by: ({rep_words[:4]}). IMPORTANT: EXCLUDE ANY PREAMBLE OR EXPLANATION.")

loaded_model.set_topic_labels(topic_name)

![images](ideal_state.png)

Note the linked topic under business practices. This happens when we try to topic model across multiple documents.

In [58]:
topic_name

{-1: 'Amazon Customers',
 0: 'E-commerce Hub',
 1: 'AWS Cloud Compute',
 2: 'Amazon Sales',
 3: 'Amazon March Report',
 4: 'Financial Leases',
 5: 'AWS Financials',
 6: 'Financial Statements',
 7: 'Amazon Nasdaq Reports',
 8: 'Amazon Rankings',
 9: 'AWS March 2023'}

In [None]:
loaded_model.get_document_info(docs_str)[:1]

In [77]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7688"
with GraphDatabase.driver(uri, auth=("neo4j", "password")) as driver:
    # Clear database
    driver.execute_query("MATCH (n) DETACH DELETE n")
    # Add all topics
    for topic in topic_name:
        driver.execute_query(f"CREATE (t:Topic {{name: '{topic_name[topic]}', id: '{topic}'}})")
    # Add all documents
    doc_path = [doc.metadata['file_path'].split("/")[-1] for doc in docs]
    set_doc_path = set(doc_path)
    for doc in set_doc_path:
        driver.execute_query(f"CREATE (d:Document {{name: '{doc}'}})")
    # Add all document chunks
    for i, doc in enumerate(docs):
        driver.execute_query(f"CREATE (c:Chunk {{id: 'chunk_{i}', text: '{doc.page_content}', name: '{doc.metadata['file_path']}', topic: '{loaded_model.get_document_info(docs_str)['Topic'][i]}'}})")
    # Add relationships between documents and chunks
    for i, doc in enumerate(docs):
        driver.execute_query(f"MATCH (d:Document {{name: '{doc.metadata['file_path'].split('/')[-1]}'}}), (c:Chunk {{id: 'chunk_{i}'}}) CREATE (d)-[:HAS_CHUNK]->(c)")
        # Add relationships between chunks and topics
        driver.execute_query(f"MATCH (c:Chunk {{id: 'chunk_{i}'}}), (t:Topic {{id: '{loaded_model.get_document_info(docs_str)['Topic'][i]}'}}) CREATE (c)-[:HAS_TOPIC]->(t)")
    # Add direct link between documents and topics
    # for doc in set_doc_path:
    #     driver.execute_query(f"MATCH (d:Document {{name: '{doc}'}}), (t:Topic {{id: '{loaded_model.get_document_info(docs_str)['Topic'][doc_path.index(doc)]}'}}) CREATE (d)-[:HAS_TOPIC]->(t)")
    for doc_path in set_doc_path:
        # Each main doc has multiple topics associated with it
        # Find all topics associated with the doc
        chunks = [doc for doc in docs if doc.metadata['file_path'].split('/')[-1] == doc_path]
        topics = [loaded_model.get_document_info(docs_str)['Topic'][docs.index(chunk)] for chunk in chunks]
        set_topics = set(topics)
        for topic in set_topics:
            driver.execute_query(f"MATCH (d:Document {{name: '{doc_path}'}}), (t:Topic {{id: '{topic}'}}) CREATE (d)-[:HAS_TOPIC]->(t)")


In [69]:
loaded_model.get_document_info(docs_str)["Topic"][0]

-1

In [None]:
# Get all documents with topic 0
