In [1]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment
# (presumably the API keys used by the LLM clients further down — confirm).
load_dotenv()

True

In [28]:
import fitz  # PyMuPDF
import requests
from io import BytesIO

def extract_text_from_pdf(url, timeout=30):
    """Download a PDF and return the text of each of its layout blocks.

    Args:
        url: HTTP(S) address of the PDF to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would hang the
            notebook indefinitely.

    Returns:
        A list of strings, one per block returned by
        ``page.get_text("blocks")``, in page order.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # PDF parser, which would otherwise produce a confusing parse failure.
    response.raise_for_status()

    pdf_content = BytesIO(response.content)

    doc_blocks = []
    # The context manager closes the document even if extraction raises.
    with fitz.Document(stream=pdf_content, filetype="pdf") as document:
        for page_num in range(document.page_count):
            page = document.load_page(page_num)
            for block in page.get_text("blocks"):
                # Index 4 of each block tuple holds the block's text.
                doc_blocks.append(block[4])

    return doc_blocks

# Example usage: download the paper from arXiv and dump every extracted block.
# Local files tried previously:
#   pdf_path = "../data/RAG.pdf"
#   pdf_path = "../data/LayoutLM.pdf"
url = "https://arxiv.org/pdf/1912.13318"
doc_blocks = extract_text_from_pdf(url)

# Each block is introduced by a "=>" marker line so block boundaries are visible.
for block in doc_blocks:
    print("=>", block, sep="\n")

=>
LayoutLM: Pre-training of Text and Layout for
Document Image Understanding

=>
Yiheng Xu∗

=>
charlesyihengxu@gmail.com
Harbin Institute of Technology

=>
Minghao Li∗

=>
liminghao1630@buaa.edu.cn
Beihang University

=>
Lei Cui
lecu@microsoft.com
Microsoft Research Asia

=>
Shaohan Huang
shaohanh@microsoft.com
Microsoft Research Asia

=>
Furu Wei
fuwei@microsoft.com
Microsoft Research Asia

=>
Ming Zhou
mingzhou@microsoft.com
Microsoft Research Asia

=>
ABSTRACT

=>
Pre-training techniques have been verified successfully in a vari-
ety of NLP tasks in recent years. Despite the widespread use of
pre-training models for NLP applications, they almost exclusively
focus on text-level manipulation, while neglecting layout and style
information that is vital for document image understanding. In
this paper, we propose the LayoutLM to jointly model interactions
between text and layout information across scanned document
images, which is beneficial for a great number of real-world doc-
ument 

In [29]:
import re
def get_page_span(blocks):
    """Find the blocks that bracket the body of the paper.

    Returns a tuple ``(abstract_idx, references_idx)`` where the first item is
    the index of the first block containing "abstract" and the second is the
    index of the last block containing "references". Both matches are
    case-insensitive substring searches; an item is ``None`` when no block
    matches.
    """
    abstract_pattern = re.compile("Abstract", re.IGNORECASE)
    references_pattern = re.compile("References", re.IGNORECASE)

    # First hit scanning forwards.
    abstract_idx = next(
        (pos for pos, text in enumerate(blocks) if abstract_pattern.search(text)),
        None,
    )

    # First hit scanning backwards, i.e. the last matching block.
    references_idx = next(
        (
            pos
            for pos in reversed(range(len(blocks)))
            if references_pattern.search(blocks[pos])
        ),
        None,
    )

    return abstract_idx, references_idx
            
def clean_block(block: str, min_words: int = 20) -> str:
    """Return ``block`` unchanged if it looks like body text, else ``""``.

    A block is rejected when:
      * it starts with "figure" (case-insensitive), or
      * it has fewer than ``min_words`` whitespace-separated tokens after
        newlines are flattened and numbers are blanked out.

    Args:
        block: Raw text of one block.
        min_words: Minimum number of tokens a block must keep to be accepted.
            Defaults to 20, the threshold previously hard-coded here.

    Returns:
        The original ``block`` (newlines intact) when accepted, otherwise the
        empty string.
    """
    # No MULTILINE flag: only a block whose very first characters are
    # "figure" is treated as a caption.
    if re.search("^figure", block, re.IGNORECASE):
        return ""

    single_line_block = block.replace("\n", " ")
    # Replace integers/decimals (optionally signed) with a space so that
    # number-heavy blocks do not pass the word-count threshold on digits alone.
    single_line_block = re.sub(r"-?\d+(?:\.\d+)?", " ", single_line_block)
    if len(single_line_block.split()) < min_words:
        return ""

    return block

def clean_blocks(blocks: list[str]):
    """Keep only the body-text blocks found between the abstract and references.

    The span is located with ``get_page_span``; both indices are printed and
    must be present (otherwise an ``AssertionError`` is raised). Every block
    strictly after the abstract block and before the references block is
    printed, passed through ``clean_block``, and kept only if the result is
    non-empty. Rejected blocks are reported with a "skipped" line.

    Returns:
        The list of blocks that survived ``clean_block``.
    """
    abstract_idx, references_idx = get_page_span(blocks)
    print(abstract_idx, references_idx)

    assert not (abstract_idx is None or references_idx is None)

    kept = []
    for candidate in blocks[abstract_idx + 1 : references_idx]:
        print("=>\n", candidate)
        cleaned = clean_block(candidate)
        if not cleaned:
            print("********* skipped")
            continue
        kept.append(cleaned)

    return kept

# Run the cleaning pass over the extracted blocks. clean_blocks prints the span
# indices and every candidate block as it goes, hence the long output below.
cleaned_blocks = clean_blocks(doc_blocks)

9 171
=>
 Pre-training techniques have been verified successfully in a vari-
ety of NLP tasks in recent years. Despite the widespread use of
pre-training models for NLP applications, they almost exclusively
focus on text-level manipulation, while neglecting layout and style
information that is vital for document image understanding. In
this paper, we propose the LayoutLM to jointly model interactions
between text and layout information across scanned document
images, which is beneficial for a great number of real-world doc-
ument image understanding tasks such as information extraction
from scanned documents. Furthermore, we also leverage image
features to incorporate words’ visual information into LayoutLM.
To the best of our knowledge, this is the first time that text and
layout are jointly learned in a single framework for document-
level pre-training. It achieves new state-of-the-art results in several
downstream tasks, including form understanding (from 70.72 to
79.27), receipt un

In [24]:
# Show the blocks that survived cleaning, each introduced by a "==>" marker line.
for block in cleaned_blocks:
    print("==>", block, sep="\n")

==>
Pre-training techniques have been verified successfully in a vari-
ety of NLP tasks in recent years. Despite the widespread use of
pre-training models for NLP applications, they almost exclusively
focus on text-level manipulation, while neglecting layout and style
information that is vital for document image understanding. In
this paper, we propose the LayoutLM to jointly model interactions
between text and layout information across scanned document
images, which is beneficial for a great number of real-world doc-
ument image understanding tasks such as information extraction
from scanned documents. Furthermore, we also leverage image
features to incorporate words’ visual information into LayoutLM.
To the best of our knowledge, this is the first time that text and
layout are jointly learned in a single framework for document-
level pre-training. It achieves new state-of-the-art results in several
downstream tasks, including form understanding (from 70.72 to
79.27), receipt understa

In [17]:
import nltk
from nltk.tokenize import sent_tokenize
import re
from pathlib import Path

# Register the current user's ~/nltk_data directory instead of one developer's
# hard-coded home directory, so the notebook also runs on other machines.
nltk.data.path.append(str(Path.home() / "nltk_data"))

# Download the punkt tokenizer if you haven't already
# nltk.download('punkt')

def split_into_paragraphs(text, sentences_per_paragraph=2):
    """Split ``text`` into chunks of consecutive sentences.

    Args:
        text: The text to split; sentence boundaries come from
            ``sent_tokenize``.
        sentences_per_paragraph: How many sentences to join into each chunk.
            Defaults to 2, the group size previously hard-coded here.

    Returns:
        A list of strings. Each string is up to ``sentences_per_paragraph``
        sentences joined by a single space, with newlines inside sentences
        replaced by spaces. The final chunk may hold fewer sentences.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is less than 1.
    """
    if sentences_per_paragraph < 1:
        raise ValueError("sentences_per_paragraph must be at least 1")

    # Flatten hard line breaks so each sentence is a single line.
    sentences = [sentence.replace("\n", " ") for sentence in sent_tokenize(text)]

    return [
        " ".join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]

# NOTE(review): `page_texts` is not defined in any cell visible in this notebook, so
# this line presumably raises NameError on a fresh kernel — confirm where it comes from.
# Despite its name, `sentences` holds the grouped chunks returned by split_into_paragraphs.
sentences = split_into_paragraphs(page_texts[1])

In [None]:
import instructor
from pydantic import BaseModel, Field
from openai import OpenAI
import google.generativeai as genai


# Define your desired output structure
# Pydantic model passed as `response_model` to the extraction call below; it
# has two list-of-string fields, and each Field description is part of the
# model's schema.
# NOTE(review): a later cell in this notebook defines another class named
# `Datasets` with different field names (`datasets`, `methods`); whichever cell
# ran last determines what `Datasets` refers to.
class Datasets(BaseModel):
    # Dataset names found in the text.
    dataset_names: list[str] = Field(description = """Names of datasets mentioned 
                                     in the text which are used by ML model / algorithm for 
                                     training.""")
    # Algorithm names found in the text.
    algorithms: list[str] = Field(description = """Names of algorithms mentioned 
                                     in the text which are used by ML model / algorithm for 
                                     training / validation """)

# Patch the OpenAI client: wrap it with instructor so that
# `client1.chat.completions.create` accepts the `response_model` argument
# used by get_response1.
client1 = instructor.from_openai(OpenAI())

# Sample passage for trying the extractor by hand; it mentions the IMDB dataset
# and GPT-2-large.
text = """In order to perform a controlled evaluation, for this
experiment we generate preference pairs over generations using a pre-trained sentiment classifier,
where p(positive | x, yw) > p(positive | x, yl). For SFT, we fine-tune GPT-2-large until convergence
on reviews from the train split of the IMDB dataset (further details in App C.1)"""

# Extract structured data from natural language
def get_response1(text):
    """Ask gpt-4o-mini (via ``client1``) to extract entities from ``text``.

    The call passes ``Datasets`` as ``response_model`` and returns whatever
    ``client1.chat.completions.create`` returns for it.

    NOTE(review): the system prompt only asks for dataset names, while the
    ``Datasets`` model defined alongside it also has an algorithms field.
    """
    system_prompt = """You're a powerful language model that has been specialized for NER where entities are datasets in the domain of Machine Learning / AI.
            Extract the names of datasets mentioned in the given text"""
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    return client1.chat.completions.create(
        model="gpt-4o-mini",
        response_model=Datasets,
        messages=conversation,
    )

In [3]:
import instructor
from pydantic import BaseModel, Field
import google.generativeai as genai


# Define your desired output structure
# Pydantic model passed as `response_model` to get_response2; it has two
# list-of-string fields, and each Field description is part of the model's schema.
# NOTE(review): an earlier cell in this notebook defines another class named
# `Datasets` with different field names (`dataset_names`, `algorithms`);
# whichever cell ran last determines what `Datasets` refers to.
class Datasets(BaseModel):
    # Dataset names found in the text.
    datasets: list[str] = Field(description = """Names of datasets mentioned 
                                     in the text which are used by ML model / algorithm for 
                                     training.""")
    # Algorithm / method names found in the text.
    methods: list[str] = Field(description = """Names of algorithms / methods mentioned 
                                     in the text""")

# Wrap a Gemini model (not OpenAI) with instructor, using its GEMINI_JSON mode,
# so that `client2.messages.create` accepts the `response_model` argument
# used by get_response2.
client2 = instructor.from_gemini(
    client=genai.GenerativeModel(
        model_name="models/gemini-1.5-flash-latest",
    ),
    mode=instructor.Mode.GEMINI_JSON,
)

# Sample passage for trying the extractor by hand; it mentions the IMDB dataset
# and GPT-2-large. The same passage is also assigned to `text` in the OpenAI cell.
text = """In order to perform a controlled evaluation, for this
experiment we generate preference pairs over generations using a pre-trained sentiment classifier,
where p(positive | x, yw) > p(positive | x, yl). For SFT, we fine-tune GPT-2-large until convergence
on reviews from the train split of the IMDB dataset (further details in App C.1)"""

# Extract structured data from natural language
def get_response2(text):
    """Ask the Gemini-backed ``client2`` to extract entities from ``text``.

    The system prompt requests both dataset names and algorithm / method
    names. The call passes ``Datasets`` as ``response_model`` and returns
    whatever ``client2.messages.create`` returns for it.
    """
    system_prompt = """You're a powerful language model that has been specialized for Named Entity Recognition.
            The possible entities are (1) dataset (2) algorithms / methods mentioned in the text.
            Extract the names of datasets and algorithms mentioned in the given text"""
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    return client2.messages.create(
        messages=conversation,
        response_model=Datasets,
    )

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Scratch inspection: show the object stored on the instructor wrapper's
# `.client` attribute (the GenerativeModel repr in the output below).
client2.client

genai.GenerativeModel(
    model_name='models/gemini-1.5-flash-latest',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [31]:
from concurrent.futures import ThreadPoolExecutor
# Fan the LLM calls out over a thread pool. Only one block (index 3) is sent
# here, so the 10 workers are not really exercised; widen the slice to process
# more blocks concurrently.
with ThreadPoolExecutor(max_workers=10) as executor:
    # list() drains the lazy map iterator; results are in the same order as the inputs.
    responses = list(executor.map(get_response2, cleaned_blocks[3:4]))
    
for res in responses:
    print(res)

datasets=[] methods=['Convolutional Neural Networks (CNN)', 'Faster R-CNN', 'Mask R-CNN', 'Graph Convolutional Networks (GCN)']


In [26]:
# Show the block that was sent to the extractor above (index 3) for comparison
# with the extracted entities.
print(cleaned_blocks[3])

Document AI, or Document Intelligence1, is a relatively new re-
search topic that refers techniques for automatically reading, under-
standing, and analyzing business documents. Business documents
are files that provide details related to a company’s internal and
external transactions, which are shown in Figure 1. They may be
digital-born, occurring as electronic files, or they may be in scanned
form that comes from written or printed on paper. Some common
examples of business documents include purchase orders, financial
reports, business emails, sales agreements, vendor contracts, letters,
invoices, receipts, resumes, and many others. Business documents
are critical to a company’s efficiency and productivity. The exact
format of a business document may vary, but the information is
usually presented in natural language and can be organized in a
variety of ways from plain text, multi-column layouts, and a wide
variety of tables/forms/figures. Understanding business documents
is a very c

In [62]:
# How many blocks survived cleaning.
len(cleaned_blocks)

46

In [8]:
# Display the collected responses.
# NOTE(review): the saved output below shows a single `dataset_names` field,
# which does not match the `datasets` / `methods` model used by get_response2 —
# it looks like output from an older run; re-execute to confirm.
responses

[Datasets(dataset_names=['LayoutLM', 'IIT-CDIP Test Collection 1.02'])]

In [14]:
# Scratch: another instructor-wrapped OpenAI client, built the same way as `client1`.
client = instructor.from_openai(OpenAI())

In [15]:
# Scratch: keep a handle on the wrapped client's `chat.completions` attribute
# for interactive inspection.
a=client.chat.completions

In [7]:
# Scratch: try out a user-prompt template with "asa" substituted for the `{}`
# placeholder. The output shows that the source indentation ends up inside the
# formatted string.
"""Extract the names of datasets and algorithms mentioned in the given text.
            The possible entities are (1) dataset (2) algorithms / methods mentioned in the text.
            
            ## Text:
            {}
            """.format("asa")

'Extract the names of datasets and algorithms mentioned in the given text.\n            The possible entities are (1) dataset (2) algorithms / methods mentioned in the text.\n            \n            ## Text:\n            asa\n            '