In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_community.llms import Ollama

In [3]:
# Ollama instance
llm = Ollama(model="llama3.1", temperature=0, base_url="http://localhost:11434", verbose=False)

In [4]:
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.embeddings import OllamaEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from llmsherpa.readers import LayoutPDFReader

config = {
    "nlm_url": "http://localhost:5001/api/parseDocument?renderFormat=all&applyOcr=yes&useNewIndentParser=yes",
}

def parent_chain(node):
    """
    Returns the parent chain of the block consisting of all the parents of the block until the root.
    """
    chain = []
    parent = node.parent
    while parent:
        chain.append(parent)
        parent = parent.parent
    chain.reverse()
    return chain

def parent_text(node):
    """
    Returns the text of the parent chain of the block. This is useful for adding section information to the text.
    """
    chain = parent_chain(node)
    header_texts = []
    para_texts = []
    for p in chain:
        if p.tag == "header":
            header_texts.append(p.to_text()) 
        elif p.tag in ['list_item', 'para']:
            para_texts.append(p.to_text())
    text = "\n>\n".join(header_texts)
    if len(para_texts) > 0:
        text +="\n\n".join(para_texts)
    return text
   
def to_context_text(node, include_section_info=True):
    """
    This is a customised function largely derived from layout_reader.py of the llmsherpa library
    Returns the text of the block with section information. This provides context to the text.
    """
    text = "Metadata:\n"
    if include_section_info and parent_text(node) != "":
        text += parent_text(node) + "  >\n"
    text += "Content:\n"
    if node.tag in ['list_item', 'para']:
        text += node.to_text(include_children=True, recurse=True)
    elif node.tag == 'table':
        text += node.to_html()
    else:
        text += node.to_text(include_children=True, recurse=True)
    return text

def is_use_semantic_chunking(leaf_nodes):
    # Returns true if more than 50% of paragraphs have only one line
    count = 0
    num_paras = len([node for node in leaf_nodes if node.tag == "para"])
    
    for node in leaf_nodes:
        if node.tag == "para":
            txt = node.to_text().strip()

            lines = txt.split("\n")
            if len(lines) == 1:
                count += 1
                # print("Single line para:", txt)

    print("Number of single line para:", count)
    print("Number of paragraphs:", num_paras)
    return count > num_paras/2

def find_leaf_nodes(node, leaf_nodes=None):
    if leaf_nodes is None:
        leaf_nodes = []

    if len(node.children) == 0:
        leaf_nodes.append(node)
    for child in node.children:
        find_leaf_nodes(child, leaf_nodes)

    return leaf_nodes

def docParser(file_path, st=None, tenant_id=None, visualise_chunking=False, layout_reader=None, prev_leaf_node=None):
    print("run")
    layout_root = None

    try:
        reader = LayoutPDFReader(config["nlm_url"]) if layout_reader is None else layout_reader
        try:
            print("Reading file:", file_path)
            # Instead of giving by file_path, give as bytes instead
            with open(file_path, "rb") as f:
                file_in_bytes = f.read()
            parsed_doc = reader.read_pdf(file_path, contents=file_in_bytes)
        except FileNotFoundError:
            if st is not None:
                st.error(f"File {file_path} not found.")
            print(f"File {file_path} not found.")
            return []
        layout_root = parsed_doc.root_node
    except Exception as e:
        if st is not None:
            st.error("Error:", e)
        print("Error:", e)

    leaf_nodes = find_leaf_nodes(layout_root)
    if prev_leaf_node is not None:
        if prev_leaf_node == leaf_nodes:
            prev_leaf_node = leaf_nodes
            print("Same leaf nodes")
        else:
            print("Different leaf nodes")
            print(prev_leaf_node[-1].to_text())
            print(leaf_nodes[-1].to_text())

    if is_use_semantic_chunking(leaf_nodes):
        # Perform semantic chunking

        ## Load embeddings
        embeddings = OllamaEmbeddings(
            base_url=config["ollama_base_url"],	
            model=config["llm_name"]
        )
        ## Chunk documents using semantic chunker
        text_splitter = SemanticChunker(
            embeddings, breakpoint_threshold_type="percentile"
        )

        full_text = ""
        for child in layout_root.children:
            full_text += child.to_text(include_children=True, recurse=True) + "\n"

        docs = text_splitter.create_documents([full_text])
    else:
        print("Using llmsherpa")
        # Use chunks from llmsherpa
        # Each chunk is each leaf_node with to_context_text()
        collated_pg_content = [to_context_text(node) for node in leaf_nodes]
        print("REMOVE:", collated_pg_content[0])

        # Convert to Langchain documents
        docs = [LangchainDocument(page_content=collated_pg_content[i], metadata={key: leaf_nodes[i].block_json[key] for key in ('bbox', 'page_idx', 'level')} | {"file_path": file_path}) for i in range(len(collated_pg_content))]

    return (docs, prev_leaf_node)

Parse Document into LangChain Document format

In [5]:
docs_path = ["/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf", "/home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf"]	

In [6]:
def parseAllDocs(docs_path):

    docs = []
    prev_leaf_node = None
    for doc in docs_path:
        print(f"Parsing document: {doc}")
        new_docs, prev_leaf_node = docParser(doc, layout_reader=LayoutPDFReader(config["nlm_url"]), prev_leaf_node=prev_leaf_node)
        docs.extend(new_docs)
        print("Document parsed successfully.", end="\n\n")

    return docs

In [7]:
docs = parseAllDocs(docs_path)

Parsing document: /home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf
run
Reading file: /home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf
Number of single line para: 26
Number of paragraphs: 104
Using llmsherpa
REMOVE: Metadata:
Content:
amazon
Document parsed successfully.

Parsing document: /home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf
run
Reading file: /home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf
Number of single line para: 0
Number of paragraphs: 48
Using llmsherpa
REMOVE: Metadata:
Introduction  >
Content:
 
 
  Patrick   
 
:      [00:01:53]      Hello.
Welcome      to      Business      Breakdowns  
 
.      I\\\'myour      host      Patrick      O\\\'Shaughnessy  
 
.      Today      we\\\'ll      be      breaking  
 
  down      the      world\\\'s      largest      e-commerce      company   
 
,      Alibaba.
Alibaba      was      founded      in      1997      by      Jack      Maand      almost      20  

In [8]:
[doc for doc in docs if "state_of_union" in doc.metadata["file_path"]][:3]

[]

In [9]:
docs[:3]

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': [199.68, 73.22, 199.68, 85.22], 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': [49.92, 195.42, 525.6, 219.42], 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared  

Remove stopwords

In [10]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jianyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# pre-process documents
# docs_str = [doc.page_content for doc in docs]
docs_str = []
stop_words = set(stopwords.words('english'))

for doc in docs:
    # Remove stop words
    # Only referencing the page_content
    text = doc.page_content
    tokens = word_tokenize(text)
    temp_filtered_text = [word for word in tokens if word.casefold() not in stop_words]
    # ignore all instances of the word Metadata or Content
    temp_filtered_text = [word for word in temp_filtered_text if word.casefold() not in ["metadata", "content"]]
    filtered_text = " ".join(temp_filtered_text)
    docs_str.append(filtered_text)


In [12]:
docs_str[:3]

[': : amazon',
 ': AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS > : SEATTLE— ( BUSINESS WIRE ) April 30 , 2024—Amazon.com , Inc. ( NASDAQ : AMZN ) today announced financial results first quarter ended March 31 , 2024 .',
 ': AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS > : ¢ Net sales increased 13 % $ 143.3 billion first quarter , compared $ 127.4 billion first quarter 2023 . Excluding $ 0.2 billion unfavorable impact year-over-year changes foreign exchange rates throughout quarter , net sales increased 13 % compared first quarter 2023 .']

In [13]:
docs[:3]

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': [199.68, 73.22, 199.68, 85.22], 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': [49.92, 195.42, 525.6, 219.42], 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared  

Initialise [BERTopic Model](https://maartengr.github.io/BERTopic/algorithm/algorithm.html)

In [14]:
# Taken from tutorial
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  min_topic_size=3,                          # Minimum size of the topic
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
)


  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [15]:
topics, _ = topic_model.fit_transform(docs_str)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [16]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_report_references_annual_2020,"[report, references, annual, 2020, seller, sel...","[: : Source : 2020 Annual Report, : Y/Y % > Ce..."
1,0,94,0_income_amazon_aws_sales,"[income, amazon, aws, sales, billion, quarter,...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
2,1,48,1_alibaba_commerce_business_companies,"[alibaba, commerce, business, companies, merch...",[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...
3,2,15,2_amazon_march_diluted_2023,"[amazon, march, diluted, 2023, quarter, 2024, ...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,15,3_financing_repayments_leases_finance,"[financing, repayments, leases, finance, incom...",[: Y/Y % > : ( 4 ) Free cash flow less princip...
5,4,9,4_stockholders_liabilities_equity_earnings,"[stockholders, liabilities, equity, earnings, ...",[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
6,5,6,5_aws_2023_months_march,"[aws, 2023, months, march, 032, 2024, 037, 247...","[: AWS > Three Months Ended March 31 , 2023 20..."


In [17]:
type(topic_model.visualize_topics())

plotly.graph_objs._figure.Figure

In [18]:
def get_chunks_from_topic(topic_id, topic_model, docs_str, docs):
    """
    docs_str - list of strings
    docs - list of Langchain Document objects
    """
    temp = topic_model.get_document_info(docs_str)["Topic"] == topic_id
    # print(temp)
    df = topic_model.get_document_info(docs_str)[temp]
    # get list of all index
    doc_index = df.index.tolist()
    return [docs[i] for i in doc_index]
    # return df["Document"]

In [19]:
get_chunks_from_topic(0, topic_model, docs_str, docs)

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': [199.68, 73.22, 199.68, 85.22], 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': [49.92, 195.42, 525.6, 219.42], 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared  

In [20]:
# perform vector search on top 2 topics from the query
def get_chunks_from_query(query, topic_model, docs_str, docs):
    topics = topic_model.find_topics(query)
    print(topics)
    chunks = []
    
    # Select all topics with probability > 0.5
    for i in range(len(topics[1])):
        if topics[1][i] > 0.5:
            print("Topic Chosen:", topics[0][i], "Probability:", topics[1][i])
            chunks.extend(get_chunks_from_topic(topics[0][i], topic_model, docs_str, docs))

    if len(chunks) > 3:
        return chunks
    
    # If not enough chunks, get the top 3 topics
    for i in range(len(chunks), 3):
        print("Getting topic:", i)
        chunks.extend(get_chunks_from_topic(i, topic_model, docs_str, docs))
        if len(chunks) > 3:
            break

    return chunks

In [21]:
topic_model.find_topics("Alibaba")

([1, 0, 2, 4, 5], [0.4906037, 0.40404835, 0.36816785, 0.271473, 0.25330758])

In [22]:
# Define query here
query = "discuss the business model differences between amazon and alibaba"

In [23]:
langchain_docs = get_chunks_from_query(query, topic_model, docs_str, docs)

([0, 1, 2, 5, 4], [0.57957596, 0.4945858, 0.4455811, 0.31860283, 0.23087329])
Topic Chosen: 0 Probability: 0.57957596


In [24]:
# Remove bbox metadata from langchain_docs due to ChromaDB limitations
for doc in langchain_docs:
    doc.metadata["bbox"] = ""

In [25]:
len(langchain_docs)

94

In [26]:
langchain_docs[0].metadata

{'bbox': '',
 'page_idx': 0,
 'level': 0,
 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}

In [27]:
# identify number of non-alibaba documents in the list
len([doc for doc in langchain_docs if "alibaba" in doc.metadata["file_path"].lower()])

1

In [28]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma, FAISS

In [29]:

embeddings = OllamaEmbeddings(base_url = "http://localhost:11434", model = "llama3:instruct")
vectorstore = Chroma.from_documents(langchain_docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
# retriever = vectorstore.as_retriever()

In [30]:
topic_docs = retriever.invoke(query)

In [31]:
topic_docs

[Document(page_content='Metadata:\nY/Y      %\n>\nAmazon      Investor      Relations      Amazon      Public      Relations  >\nContent:\namazon-ir@amazon.com      amazon-pr@amazon.com', metadata={'bbox': '', 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf', 'level': 2, 'page_idx': 16}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS\n>\nHighlights  \n \n*      Launched   \n \na      Disaster      Relief      Hub      in      Rheinberg,      Germany—Amazon’s      first      Hub      in      Europe      and      the      company’s      13th  \n \n  around      the      world.\nThe      21,000-square-foot      Hub      allows      Amazon      to      store      and      quickly      pack      relief      items      that      are      most  \n \n  needed      following      natural      disasters      and      other      emergencies.\nAWS      also      provided      technology      to      support     

In [32]:
print(len(langchain_docs))

94


In [33]:
topic_model_resp = llm.invoke(f"Answer the question: {query} with the following documents: {langchain_docs}")
print(topic_model_resp)

The provided text appears to be a collection of documents extracted from PDF files using OCR (Optical Character Recognition) technology. The documents are in the format of `Document(page_content='...')`, where each document contains metadata and page content.

To answer your question, there is no specific information about Amazon's revenue or growth rate in the provided text. However, I can provide some general insights based on the content:

1. **Revenue and Growth**: The documents mention "Y/Y %", which likely refers to year-over-year (YoY) percentage change in revenue or another metric. However, without specific numbers, it's difficult to determine the actual growth rate.
2. **Units Sold**: One document mentions "units sold" as a measure of physical and digital units sold by Amazon and its sellers. This could be related to revenue, but again, no specific numbers are provided.
3. **Customer Accounts and AWS Customers**: The documents define customer accounts and AWS customers, which 

### Comparing with Documents retrieved WITHOUT topic modelling

In [34]:
# Proof that docs contain all the unfiltered documents by topic
print(set(doc.metadata["file_path"] for doc in docs))
print(len(docs))

{'/home/jianyang/local-llm/topic-modelling/documents/Alibaba.pdf', '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}
189


In [35]:
assert langchain_docs != docs
assert len(langchain_docs) < len(docs)

In [36]:
# remove bbox
for doc in docs:
    doc.metadata["bbox"] = ""

In [37]:
vectorstore_before_topic_model = FAISS.from_documents(docs, embeddings)
retriever_before_topic_model = vectorstore_before_topic_model.as_retriever(search_kwargs={"k": 10})

In [38]:
no_topic_docs = retriever_before_topic_model.invoke(query)

In [39]:
no_topic_docs

[Document(page_content='Metadata:\nY/Y      %\n>\nAmazon      Investor      Relations      Amazon      Public      Relations  >\nContent:\namazon-ir@amazon.com      amazon-pr@amazon.com', metadata={'bbox': '', 'page_idx': 16, 'level': 2, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS\n>\nDiluted       March      31,  >\nContent:\n67,791      72,633 20,905      22,317 20,450      20,424 10,172      9,662 3,043      2,742 223      228 122,584      128,006 4,774      15,307 611      993  \n \n  (823)      (644) (443)      (2,673) (655)      (2,324) 4,119      12,983  \n \n  (948)      (2,467) 1\n \n   (85) 3,172      10,431 0.31      1.00 0.31      0.98', metadata={'bbox': '', 'page_idx': 9, 'level': 2, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAWS\n>\nThree      Months     

In [40]:
# See the number of document chunks referencing alibaba
len([doc for doc in docs if "alibaba" in doc.metadata["file_path"].lower()])

50

In [41]:
no_topic_resp = llm.invoke(f"Answer the question: {query} with the following documents: {no_topic_docs}")
print(no_topic_resp)

The provided code snippet appears to be a Python script that uses the PyMuPDF library to extract text from a PDF file. The script iterates over each page of the document and creates a `Document` object for each page, which contains the extracted text content and metadata.

To answer your question about how to improve the performance of this script, here are some suggestions:

1. **Use a more efficient PDF parsing library**: PyMuPDF is not the most efficient library for large documents or complex layouts. You may want to consider using a more modern and optimized library like `pdfplumber` or `pdfquery`.
2. **Optimize memory usage**: The script creates a new `Document` object for each page, which can lead to high memory usage if the document is very large. Consider using a single data structure to store all the extracted text content and metadata.
3. **Use multi-threading or parallel processing**: If you're dealing with extremely large documents or multiple PDF files, consider using mult

### Compare differences in document retrieval between topic modelling and no topic modelling

In [42]:
# Evaluate using ChatGPT.
# The printed output will be the input to ChatGPT

print(f"Help me compare the relevant document retrieval performance comparing between\
      using topic modelling and not using topic modelling. I am using the query '{query}'\
      to retrieve relevant documents from both topic model and non-topic model methods.\
      I need you to identify which set of documents are better in terms of relevance to the query.\
      TO make the test fair, you won't know which set of documents are from the topic model and which are not.\
      Please compare the two sets of documents and identify which set of documents are more relevant to the query.\n\
      Be succinct and NOT verbose, give me the better version straight away with a brief explanation why you think so.\n\
      Focus on how good the documents are at answering the query. Some documents may have more keywords but less relevant content and that's not what we want.\n\
      Version 1: Documents retrieved using topic model:\
        {topic_docs}\n\
        Version 2: Documents retrieved without using topic model:\
        {no_topic_docs}")

Help me compare the relevant document retrieval performance comparing between      using topic modelling and not using topic modelling. I am using the query 'discuss the business model differences between amazon and alibaba'      to retrieve relevant documents from both topic model and non-topic model methods.      I need you to identify which set of documents are better in terms of relevance to the query.      TO make the test fair, you won't know which set of documents are from the topic model and which are not.      Please compare the two sets of documents and identify which set of documents are more relevant to the query.
      Be succinct and NOT verbose, give me the better version straight away with a brief explanation why you think so.
      Focus on how good the documents are at answering the query. Some documents may have more keywords but less relevant content and that's not what we want.
      Version 1: Documents retrieved using topic model:        [Document(page_content='M

In [43]:
print(query)

discuss the business model differences between amazon and alibaba


Saving and loading model for future re-use

In [44]:
topic_model.save("topic_cache", serialization="pickle", save_embedding_model=False, save_ctfidf=True)



In [45]:
loaded_model = BERTopic.load("topic_cache", embedding_model=SentenceTransformer("all-MiniLM-L6-v2"))

In [46]:
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_report_references_annual_2020,"[report, references, annual, 2020, seller, sel...","[: : Source : 2020 Annual Report, : Y/Y % > Ce..."
1,0,94,0_income_amazon_aws_sales,"[income, amazon, aws, sales, billion, quarter,...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
2,1,48,1_alibaba_commerce_business_companies,"[alibaba, commerce, business, companies, merch...",[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...
3,2,15,2_amazon_march_diluted_2023,"[amazon, march, diluted, 2023, quarter, 2024, ...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,15,3_financing_repayments_leases_finance,"[financing, repayments, leases, finance, incom...",[: Y/Y % > : ( 4 ) Free cash flow less princip...
5,4,9,4_stockholders_liabilities_equity_earnings,"[stockholders, liabilities, equity, earnings, ...",[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
6,5,6,5_aws_2023_months_march,"[aws, 2023, months, march, 032, 2024, 037, 247...","[: AWS > Three Months Ended March 31 , 2023 20..."


In [47]:
len(loaded_model.get_topic_info())

7

In [48]:
loaded_model.get_topic_info()["Representation"][-1]

KeyError: -1

In [49]:
# Generate topic name
topic_name = {}
for i in range(len(loaded_model.get_topic_info())):
    rep_words = loaded_model.get_topic_info()["Representation"][i]
    topic_name[i-1] = llm.invoke(f"Generate a 2 to 3 word TOPIC NAME from the words it's represented by: ({rep_words[:4]}). IMPORTANT: EXCLUDE ANY PREAMBLE OR EXPLANATION.")

loaded_model.set_topic_labels(topic_name)

![images](ideal_state.png)

Note the linked topic under business practices. This happens when we try to topic model across multiple documents.

In [50]:
topic_name

{-1: 'Annual Report',
 0: 'Amazon Sales',
 1: 'E-commerce platforms',
 2: 'Amazon March Dilution',
 3: 'Lease Repayments',
 4: 'Financial Reports',
 5: 'AWS March'}

In [None]:
loaded_model.get_document_info(docs_str)[:1]

In [None]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7688"
with GraphDatabase.driver(uri, auth=("neo4j", "password")) as driver:
    # Clear database
    driver.execute_query("MATCH (n) DETACH DELETE n")
    # Add all topics
    for topic in topic_name:
        driver.execute_query(f"CREATE (t:Topic {{name: '{topic_name[topic]}', id: '{topic}'}})")
    # Add all documents
    doc_path = [doc.metadata['file_path'].split("/")[-1] for doc in docs]
    set_doc_path = set(doc_path)
    for doc in set_doc_path:
        driver.execute_query(f"CREATE (d:Document {{name: '{doc}'}})")
    # Add all document chunks
    for i, doc in enumerate(docs):
        driver.execute_query(f"CREATE (c:Chunk {{id: 'chunk_{i}', text: '{doc.page_content}', name: '{doc.metadata['file_path']}', topic: '{loaded_model.get_document_info(docs_str)['Topic'][i]}'}})")
    # Add relationships between documents and chunks
    for i, doc in enumerate(docs):
        driver.execute_query(f"MATCH (d:Document {{name: '{doc.metadata['file_path'].split('/')[-1]}'}}), (c:Chunk {{id: 'chunk_{i}'}}) CREATE (d)-[:HAS_CHUNK]->(c)")
        # Add relationships between chunks and topics
        driver.execute_query(f"MATCH (c:Chunk {{id: 'chunk_{i}'}}), (t:Topic {{id: '{loaded_model.get_document_info(docs_str)['Topic'][i]}'}}) CREATE (c)-[:HAS_TOPIC]->(t)")
    # Add direct link between documents and topics
    # for doc in set_doc_path:
    #     driver.execute_query(f"MATCH (d:Document {{name: '{doc}'}}), (t:Topic {{id: '{loaded_model.get_document_info(docs_str)['Topic'][doc_path.index(doc)]}'}}) CREATE (d)-[:HAS_TOPIC]->(t)")
    for doc_path in set_doc_path:
        # Each main doc has multiple topics associated with it
        # Find all topics associated with the doc
        chunks = [doc for doc in docs if doc.metadata['file_path'].split('/')[-1] == doc_path]
        topics = [loaded_model.get_document_info(docs_str)['Topic'][docs.index(chunk)] for chunk in chunks]
        set_topics = set(topics)
        for topic in set_topics:
            driver.execute_query(f"MATCH (d:Document {{name: '{doc_path}'}}), (t:Topic {{id: '{topic}'}}) CREATE (d)-[:HAS_TOPIC]->(t)")


In [None]:
loaded_model.get_document_info(docs_str)["Topic"][0]

-1

In [58]:
topic_name

{-1: 'Annual Report',
 0: 'Amazon Sales',
 1: 'E-commerce platforms',
 2: 'Amazon March Dilution',
 3: 'Lease Repayments',
 4: 'Financial Reports',
 5: 'AWS March'}

In [68]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_report_references_annual_2020,"[report, references, annual, 2020, seller, sel...","[: : Source : 2020 Annual Report, : Y/Y % > Ce..."
1,0,94,0_income_amazon_aws_sales,"[income, amazon, aws, sales, billion, quarter,...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
2,1,48,1_alibaba_commerce_business_companies,"[alibaba, commerce, business, companies, merch...",[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...
3,2,15,2_amazon_march_diluted_2023,"[amazon, march, diluted, 2023, quarter, 2024, ...",[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,15,3_financing_repayments_leases_finance,"[financing, repayments, leases, finance, incom...",[: Y/Y % > : ( 4 ) Free cash flow less princip...
5,4,9,4_stockholders_liabilities_equity_earnings,"[stockholders, liabilities, equity, earnings, ...",[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
6,5,6,5_aws_2023_months_march,"[aws, 2023, months, march, 032, 2024, 037, 247...","[: AWS > Three Months Ended March 31 , 2023 20..."


In [72]:
list(topic_model.get_topic_info()["Topic"])

[-1, 0, 1, 2, 3, 4, 5]

In [90]:
docs

[Document(page_content='Metadata:\nContent:\namazon', metadata={'bbox': '', 'page_idx': 0, 'level': 0, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\nSEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  \n \n  for      its      first      quarter      ended      March      31,      2024.', metadata={'bbox': '', 'page_idx': 0, 'level': 1, 'file_path': '/home/jianyang/local-llm/topic-modelling/documents/amazon_report.pdf'}),
 Document(page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >\nContent:\n \n \n \n¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared      with      $127.4      billion      in      first    

In [111]:
import networkx as nx
import json
import pandas as pd

# Extracting data from the topic model and storing it in a dictionary
topic_info = topic_model.get_topic_info()
data = {
    'Topic': list(topic_info['Topic']),
    'Count': list(topic_info['Count']),
    'Name': topic_name.values(),
    'Top4Words': [rep[:4] for rep in topic_info['Representation']]
}

# Printing the data for verification
print(data)

# Creating a DataFrame from the data
df = pd.DataFrame(data)

# Creating a graph using NetworkX
G = nx.Graph()

### NODES
## Topic nodes
# Creating a list of nodes with their attributes
topic_node_data = [
    (f"[TOPIC]\n{row['Name']}", {
        'size': max(row['Count'], 6),
        'color': "#ccab1b"
    }) for row in df.to_dict('records')
]


# Printing the node data for verification
print(topic_node_data)

## Representation nodes
rep_node_data = []
for i in range(len(df)):
    rep_node_data.append((f"[REP_WORDS]\n{df['Top4Words'][i]}", {
        'size': 8,
        'color': "#3b0afc",
    }))

print(rep_node_data)

## Document nodes
doc_path = [doc.metadata['file_path'].split("/")[-1] for doc in docs]
set_doc_path = set(doc_path)
doc_node_data = []
for doc in set_doc_path:
    doc_node_data.append((f"[DOC]\n{doc}", {
        'size': 2/3 * max(df['Count']),
        'color': "#ed4ae2"
    }))

# Collate all nodes
node_data = []
node_data.extend(topic_node_data)
node_data.extend(rep_node_data)
node_data.extend(doc_node_data)

# Adding nodes to the graph
G.add_nodes_from(node_data)


### EDGES
## Document to Topics
for doc_path in set_doc_path:
    # Each main doc has multiple topics associated with it
    # Find all topics associated with the doc
    chunks = [doc for doc in docs if doc.metadata['file_path'].split('/')[-1] == doc_path]
    topics = [topic_model.get_document_info(docs_str)['Topic'][docs.index(chunk)] for chunk in chunks]
    set_topics = set(topics)
    for topic in set_topics:
        G.add_edge(f"[DOC]\n{doc_path}", f"[TOPIC]\n{topic_name[topic]}", label="HAS_TOPIC", weight=2)

## Topics to Representation
for i in range(len(df)):
    G.add_edge(f"[TOPIC]\n{df['Name'][i]}", f"[REP_WORDS]\n{df['Top4Words'][i]}", label="HAS_WORDS", weight=2)

# Converting the graph to JSON format
graph_data = nx.node_link_data(G)

# Saving the graph data to a JSON file
with open("graph.json", "w") as f:
    json.dump(graph_data, f)


{'Topic': [-1, 0, 1, 2, 3, 4, 5], 'Count': [2, 94, 48, 15, 15, 9, 6], 'Name': dict_values(['Annual Report', 'Amazon Sales', 'E-commerce platforms', 'Amazon March Dilution', 'Lease Repayments', 'Financial Reports', 'AWS March']), 'Top4Words': [['report', 'references', 'annual', '2020'], ['income', 'amazon', 'aws', 'sales'], ['alibaba', 'commerce', 'business', 'companies'], ['amazon', 'march', 'diluted', '2023'], ['financing', 'repayments', 'leases', 'finance'], ['stockholders', 'liabilities', 'equity', 'earnings'], ['aws', '2023', 'months', 'march']]}
[('[TOPIC]\nAnnual Report', {'size': 6, 'color': '#ccab1b'}), ('[TOPIC]\nAmazon Sales', {'size': 94, 'color': '#ccab1b'}), ('[TOPIC]\nE-commerce platforms', {'size': 48, 'color': '#ccab1b'}), ('[TOPIC]\nAmazon March Dilution', {'size': 15, 'color': '#ccab1b'}), ('[TOPIC]\nLease Repayments', {'size': 15, 'color': '#ccab1b'}), ('[TOPIC]\nFinancial Reports', {'size': 9, 'color': '#ccab1b'}), ('[TOPIC]\nAWS March', {'size': 6, 'color': '#ccab1

In [133]:
    # Generate topic name
    unique_topics = set(topics)
    topic_name = {}
    for i in range(len(topic_model.get_topic_info())):
        rep_words = topic_model.get_topic_info()["Representation"][i]
        gen_llm_topic_name = llm.invoke(f"Generate a 2 to 3 word TOPIC NAME from the words it's represented by: ({rep_words[:4]}). IMPORTANT: EXCLUDE ANY PREAMBLE OR EXPLANATION.")
        print(gen_llm_topic_name)
        print(unique_topics)
        topic_name[i+min(unique_topics)] = gen_llm_topic_name

    topic_model.set_topic_labels(topic_name)

Annual Report
{0, 2, 3, 4, 5, -1}
Amazon Sales
{0, 2, 3, 4, 5, -1}
E-commerce platforms
{0, 2, 3, 4, 5, -1}
Amazon March Dilution
{0, 2, 3, 4, 5, -1}
Lease Repayments
{0, 2, 3, 4, 5, -1}
Financial Reports
{0, 2, 3, 4, 5, -1}
AWS March
{0, 2, 3, 4, 5, -1}


In [196]:
df_copy = topic_model.get_topic_info()[["Topic", "CustomName", "Representation", "Count", "Representative_Docs"]].copy(deep=True)

In [197]:
df_copy

Unnamed: 0,Topic,CustomName,Representation,Count,Representative_Docs
0,-1,Annual Report,"[report, references, annual, 2020, seller, sel...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce..."
1,0,Amazon Sales,"[income, amazon, aws, sales, billion, quarter,...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
2,1,E-commerce platforms,"[alibaba, commerce, business, companies, merch...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...
3,2,Amazon March Dilution,"[amazon, march, diluted, 2023, quarter, 2024, ...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,Lease Repayments,"[financing, repayments, leases, finance, incom...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...
5,4,Financial Reports,"[stockholders, liabilities, equity, earnings, ...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
6,5,AWS March,"[aws, 2023, months, march, 032, 2024, 037, 247...",6,"[: AWS > Three Months Ended March 31 , 2023 20..."


In [198]:
# Find a mapping from each topic to the Documents in the topic
topic_to_docs = {}
doc_path = [doc.metadata['file_path'].split("/")[-1] for doc in docs]
set_doc_path = set(doc_path)
for path in set_doc_path:
    # Each main doc has multiple topics associated with it
    # Find all topics associated with the doc
    chunks = [doc for doc in docs if doc.metadata['file_path'].split('/')[-1] == path]
    topics = [topic_model.get_document_info(docs_str)['Topic'][docs.index(chunk)] for chunk in chunks]
    set_topics = set(topics)
    print(set_topics)
    for topic in set_topics:
        if topic not in topic_to_docs:
            topic_to_docs[topic] = []
        topic_to_docs[topic].append(path)


{0, 1, -1}
{0, 2, 3, 4, 5, -1}


In [199]:
# Format each value such that it's a string
for topic in topic_to_docs:
    topic_to_docs[topic] = ", ".join(topic_to_docs[topic])

# Format Representation
df_copy["Representation"] = df_copy["Representation"].apply(lambda x: ", ".join(x))

In [200]:
topic_to_docs

{0: 'Alibaba.pdf, amazon_report.pdf',
 1: 'Alibaba.pdf',
 -1: 'Alibaba.pdf, amazon_report.pdf',
 2: 'amazon_report.pdf',
 3: 'amazon_report.pdf',
 4: 'amazon_report.pdf',
 5: 'amazon_report.pdf'}

In [201]:
df_copy

Unnamed: 0,Topic,CustomName,Representation,Count,Representative_Docs
0,-1,Annual Report,"report, references, annual, 2020, seller, sell...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce..."
1,0,Amazon Sales,"income, amazon, aws, sales, billion, quarter, ...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
2,1,E-commerce platforms,"alibaba, commerce, business, companies, mercha...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...
3,2,Amazon March Dilution,"amazon, march, diluted, 2023, quarter, 2024, a...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...
4,3,Lease Repayments,"financing, repayments, leases, finance, income...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...
5,4,Financial Reports,"stockholders, liabilities, equity, earnings, s...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...
6,5,AWS March,"aws, 2023, months, march, 032, 2024, 037, 247,...",6,"[: AWS > Three Months Ended March 31 , 2023 20..."


In [202]:
# Add to df
df_copy["Documents"] = df_copy["Topic"].map(topic_to_docs)

In [203]:
df_copy

Unnamed: 0,Topic,CustomName,Representation,Count,Representative_Docs,Documents
0,-1,Annual Report,"report, references, annual, 2020, seller, sell...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce...","Alibaba.pdf, amazon_report.pdf"
1,0,Amazon Sales,"income, amazon, aws, sales, billion, quarter, ...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,"Alibaba.pdf, amazon_report.pdf"
2,1,E-commerce platforms,"alibaba, commerce, business, companies, mercha...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...,Alibaba.pdf
3,2,Amazon March Dilution,"amazon, march, diluted, 2023, quarter, 2024, a...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,amazon_report.pdf
4,3,Lease Repayments,"financing, repayments, leases, finance, income...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...,amazon_report.pdf
5,4,Financial Reports,"stockholders, liabilities, equity, earnings, s...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...,amazon_report.pdf
6,5,AWS March,"aws, 2023, months, march, 032, 2024, 037, 247,...",6,"[: AWS > Three Months Ended March 31 , 2023 20...",amazon_report.pdf


In [204]:
# rename column CustomName to Name
df_copy.rename(columns={"Topic": "Topic ID", "CustomName": "Topic", "Representation": "Key Words in Topic", "Count": "No. of Docs", "Representative_Docs": "Key Chunks in Topic"}, inplace=True)

In [205]:
df_copy

Unnamed: 0,Topic ID,Topic,Key Words in Topic,No. of Docs,Key Chunks in Topic,Documents
0,-1,Annual Report,"report, references, annual, 2020, seller, sell...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce...","Alibaba.pdf, amazon_report.pdf"
1,0,Amazon Sales,"income, amazon, aws, sales, billion, quarter, ...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,"Alibaba.pdf, amazon_report.pdf"
2,1,E-commerce platforms,"alibaba, commerce, business, companies, mercha...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...,Alibaba.pdf
3,2,Amazon March Dilution,"amazon, march, diluted, 2023, quarter, 2024, a...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,amazon_report.pdf
4,3,Lease Repayments,"financing, repayments, leases, finance, income...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...,amazon_report.pdf
5,4,Financial Reports,"stockholders, liabilities, equity, earnings, s...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...,amazon_report.pdf
6,5,AWS March,"aws, 2023, months, march, 032, 2024, 037, 247,...",6,"[: AWS > Three Months Ended March 31 , 2023 20...",amazon_report.pdf


In [207]:
df_copy

Unnamed: 0,Topic ID,Topic,Key Words in Topic,No. of Docs,Key Chunks in Topic,Documents
0,-1,Annual Report,"report, references, annual, 2020, seller, sell...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce...","Alibaba.pdf, amazon_report.pdf"
1,0,Amazon Sales,"income, amazon, aws, sales, billion, quarter, ...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,"Alibaba.pdf, amazon_report.pdf"
2,1,E-commerce platforms,"alibaba, commerce, business, companies, mercha...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...,Alibaba.pdf
3,2,Amazon March Dilution,"amazon, march, diluted, 2023, quarter, 2024, a...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,amazon_report.pdf
4,3,Lease Repayments,"financing, repayments, leases, finance, income...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...,amazon_report.pdf
5,4,Financial Reports,"stockholders, liabilities, equity, earnings, s...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...,amazon_report.pdf
6,5,AWS March,"aws, 2023, months, march, 032, 2024, 037, 247,...",6,"[: AWS > Three Months Ended March 31 , 2023 20...",amazon_report.pdf


In [208]:
df_copy.set_index("Topic", inplace=True)

In [209]:
df_copy

Unnamed: 0_level_0,Topic ID,Key Words in Topic,No. of Docs,Key Chunks in Topic,Documents
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Annual Report,-1,"report, references, annual, 2020, seller, sell...",2,"[: : Source : 2020 Annual Report, : Y/Y % > Ce...","Alibaba.pdf, amazon_report.pdf"
Amazon Sales,0,"income, amazon, aws, sales, billion, quarter, ...",94,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,"Alibaba.pdf, amazon_report.pdf"
E-commerce platforms,1,"alibaba, commerce, business, companies, mercha...",48,[: Massive Scale Alibaba > : [ 00 :07 :21 ] Ye...,Alibaba.pdf
Amazon March Dilution,2,"amazon, march, diluted, 2023, quarter, 2024, a...",15,[: AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS ...,amazon_report.pdf
Lease Repayments,3,"financing, repayments, leases, finance, income...",15,[: Y/Y % > : ( 4 ) Free cash flow less princip...,amazon_report.pdf
Financial Reports,4,"stockholders, liabilities, equity, earnings, s...",9,[: LIABILITIES STOCKHOLDERS ’ EQUITY > : Accou...,amazon_report.pdf
AWS March,5,"aws, 2023, months, march, 032, 2024, 037, 247,...",6,"[: AWS > Three Months Ended March 31 , 2023 20...",amazon_report.pdf
