# Chunk and Summarize

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv
load_dotenv()

True

In [11]:
def load_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents


In [3]:
def create_summary_prompt(forward_summary, current_chunk):
    template = """
    Given the following forward summary and a new text chunk, generate a concise summary of the chunk that maintains coherence with the forward summary.

    Forward Summary: {forward_summary}
    Current Chunk: {current_chunk}

    Please provide a clear and relevant summary of the current chunk.
    """
    prompt = PromptTemplate(input_variables=["forward_summary", "current_chunk"], template=template)
    return prompt


In [12]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash", api_key=os.getenv("GOOGLE_API_KEY")
)

In [13]:
def summarize_chunk(forward_summary, chunk, llm):
    prompt = create_summary_prompt(forward_summary, chunk)
    chain = LLMChain(llm=llm, prompt=prompt)
    summary = chain.run({"forward_summary": forward_summary, "current_chunk": chunk})
    return summary


In [14]:
# Split the document into manageable chunks
def split_text(document, chunk_size=1000):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
    chunks = text_splitter.split_documents(document)
    return chunks

In [21]:

# Main function to process the document and generate summaries
def process_pdf_document(pdf_path):
    # Load PDF
    documents = load_pdf(pdf_path)
    
    # Initialize OpenAI model
    llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash", api_key=os.getenv("GOOGLE_API_KEY")
    )
    # Split the document into chunks
    chunks = split_text(documents, chunk_size=1000)
    forward_summary = ""
    chunk_summaries = []

    # Process each chunk iteratively
    for chunk in chunks[0:10]:
        current_text = chunk.page_content
        chunk_summary = summarize_chunk(forward_summary, current_text, llm)
        
        # Update forward summary
        forward_summary = chunk_summary
        
        # Store the summary for this chunk
        chunk_summaries.append(chunk_summary)

    return chunk_summaries

In [22]:
pdf_path = "./docs/GIÁO-TRÌNH-PHÂN-TÍCH-DỮ-LIỆU-KINH-DOANH.pdf"
summaries = process_pdf_document(pdf_path)

In [23]:
for i, summary in enumerate(summaries):
    print(f"Summary of Chunk {i+1}: {summary}\n")

Summary of Chunk 1: The chunk shows a title page for a Business Data Analysis textbook, listing authors (Nguyễn Đình Thuân, Nguyễn Minh Nhựt, Nguyễn Thị Viết Hương, Trịnh Thị Thanh Trúc) and the publishing institution (University of Information Technology, Ho Chi Minh City University of Technology).  The year is 2024.

Summary of Chunk 2: The preface explains the increasing importance of data analysis in the context of Industry 4.0.  This Business Data Analysis textbook, written from an IT perspective, aims to equip students with the necessary statistical, algorithmic, and programming skills using tools like R and Python.  The textbook covers descriptive statistics and data interpretation, starting with an overview of data analysis problems.

Summary of Chunk 3: The textbook covers data analysis using statistics and machine learning.  Chapters include descriptive statistics, data interpretation, regression analysis, logistic regression, time series forecasting, and machine learning for

# Tagging

In [25]:
import spacy
import pandas as pd
from collections import Counter

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

In [27]:
corpus = [
    "Artificial Intelligence and Machine Learning are transforming the tech industry.",
    "The use of machine learning in healthcare has great potential.",
    "Robotics and AI have been integrated into various industries.",
]

In [26]:
# Function to extract domain-specific tags (keywords, entities, etc.)
def extract_tags_from_text(text):
    doc = nlp(text)
    # Extract named entities and noun phrases (common domain-specific terms)
    tags = [ent.text.lower() for ent in doc.ents] + [np.text.lower() for np in doc.noun_chunks]
    return tags


In [28]:
corpus_tags = []
for doc in corpus:
    tags = extract_tags_from_text(doc)
    corpus_tags.append(tags)

In [29]:
flat_tags = [tag for tags in corpus_tags for tag in tags]
tag_frequency = Counter(flat_tags)


In [30]:
print("Most common tags in corpus:", tag_frequency.most_common(10))

Most common tags in corpus: [('machine learning', 2), ('ai', 2), ('artificial intelligence and machine learning', 1), ('artificial intelligence', 1), ('the tech industry', 1), ('the use', 1), ('healthcare', 1), ('great potential', 1), ('robotics', 1), ('various industries', 1)]


In [31]:
def extract_tags_from_query(query):
    return extract_tags_from_text(query)

In [32]:
query = "How can machine learning help in improving healthcare?"


In [33]:
query_tags = extract_tags_from_query(query)
print("Tags from query:", query_tags)

Tags from query: ['help', 'healthcare']


In [34]:
def map_query_to_corpus_tags(query_tags, corpus_tags):
    matched_tags = []
    for tags in corpus_tags:
        matched_tags.append([tag for tag in tags if tag in query_tags])
    return matched_tags

In [35]:
mapped_tags = map_query_to_corpus_tags(query_tags, corpus_tags)
print("Mapped tags from query to corpus:", mapped_tags)

Mapped tags from query to corpus: [[], ['healthcare'], []]
