In [6]:
from unstructured.partition.pdf import partition_pdf

In [4]:
pdf_path = "/data/saikat/unstructured_io_poc/data/1A.pdf"

In [73]:
elements = partition_pdf(pdf_path, infer_table_structure = True, hi_res_model_name='yolox')

In [59]:
text_elements = [str(el) for el in elements]

In [60]:
text_elements

['Threshold Conditions',
 'Chapter 1',
 'Introduction',
 '1',
 'COND 1 : Introduction',
 'Section 1.1A : Application',
 '1.1A',
 'Application',
 'To which threshold conditions does COND apply? ..................................................................................................... 1.1A.1 G (1) Section 55C of the Financial Services Act 2012 (Power to amend Schedule 6) gave HM Treasury the power to amend Schedule 6 of the Act. HM Treasury exercised this power by making The Financial Services and Markets Act 2000 (Threshold Conditions) Order 2013 which entered into force on 1 April 2013 (the "TC Order"). The TC Order\'s main result is the creation of four sets of threshold conditions, namely: (i) conditions for firms authorised and regulated by the FCA only (paragraphs 2B to 2F of Schedule 6 to the Act) (ii) FCA specific conditions for firms authorised by the PRA and subject to dual regulation (paragraphs 3B to 3E of Schedule 6 to the Act); (iii) PRA-specific conditions for i

In [14]:
import pandas as pd

In [61]:
df = pd.DataFrame(data=text_elements, columns=['chunks'])

In [62]:
df

Unnamed: 0,chunks
0,Threshold Conditions
1,Chapter 1
2,Introduction
3,1
4,COND 1 : Introduction
...,...
103,n Release 36 l May 2024
104,www.handbook.fca.org.uk
105,www.handbook.fca.org.uk
106,COND 1/7


# With Langchain

In [10]:
import os
import sys

import fitz


def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column."""
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b != None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return of no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks

In [16]:
import torch
import fitz
import re
import transformers
import os
import pandas as pd



In [17]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "/data/saikat/RegBank_Policy_POC/misc/mixtral-tokenizer/"
)

In [60]:
def create_circular_store(filepath):
    import fitz
    import os
    import re
    
    doc = fitz.open(filepath)
    filename = os.path.split(filepath)

    page_num = 1
    text_list = []

    for page in doc:
        bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

        block_num = 1
        for rect in bboxes:
            block_text = page.get_text(clip=rect, sort=True)

            paragraph_text = block_text
            paragraph_text = paragraph_text.replace("\n", " ")
            paragraph_text = re.sub(r"\-\n", "", paragraph_text)
            paragraph_text = re.sub(r"\'", "", paragraph_text)
            paragraph_text = paragraph_text.replace("\\n", " ")
            paragraph_text = paragraph_text.replace("\\xa0", " ")
            paragraph_text = re.sub(" +", " ", paragraph_text)
            paragraph_text = re.sub(r"Page\s\d{1,}", "", paragraph_text)

            text_list.append((page_num, block_num, paragraph_text))
            block_num += 1
        page_num += 1

    sorted_text_list = sorted(text_list, key=lambda x: (x[0], x[1]))

    docs = {}
    text = ""

    current_chunk_token_length = 0
    chunk_number = 0

    for list_item in sorted_text_list:

        page_num = list_item[0]
        block_num = list_item[1]
        paragraph_text = list_item[2]

        token_length = len(tokenizer.tokenize(paragraph_text))

        if token_length > 0 and chunk_number == 0 and current_chunk_token_length == 0:
            docs[chunk_number] = paragraph_text

        current_chunk_token_length = len(tokenizer.tokenize(docs[chunk_number]))

    
        docs[chunk_number] = docs[chunk_number] + paragraph_text

    return docs

In [61]:
store = create_circular_store(pdf_path)

In [62]:
store

{0: 'Threshold Conditions Chapter 1 Introduction Threshold Conditions Chapter 1 Introduction 1.1A Application To which threshold conditions does COND apply? ..................................................................................................... (1) Section 55C of the Financial Services Act 2012 (Power to amend Schedule 6) gave HM Treasury the power to amend Schedule 6 of the Act. HM Treasury exercised this power by making The Financial Services and Markets Act 2000 (Threshold Conditions) Order 2013 which entered into force on 1 April 2013 (the "TC Order"). The TC Orders main result is the creation of four sets of threshold conditions, namely: (i) conditions for firms authorised and regulated by the FCA only (paragraphs 2B to 2F of Schedule 6 to the Act) (ii) FCA specific conditions for firms authorised by the PRA and subject to dual regulation (paragraphs 3B to 3E of Schedule 6 to the Act); (iii) PRA-specific conditions for insurers (paragraphs 4A to 4F of Schedule 6 to t

In [53]:
from langchain_text_splitters import SpacyTextSplitter
text_splitter = SpacyTextSplitter(chunk_size = 50, chunk_overlap=0)

In [58]:
tst = "To whom does COND apply? ..................................................................................................... 1 (1) COND applies to all firms, except where stated otherwise in this guidance. (2) In COND, firm includes an applicant for Part 4A permission unless the context otherwise requires. To what extent does COND apply to firms authorised by the PRA (PRA-authorised persons) and subject to dual regulation? ..................................................................................................... (1) As a result of the new legal framework for threshold conditions described in I COND 1.1A.1G (1), PRA-authorised persons and firms seeking to become PRA-authorised persons are subject to two sets of threshold conditions: (i) the FCA-specific conditions referred to in I COND 1.1A.1G (1)(ii)and (ii) one of the two PRA-specific conditions referred to in I COND 1.1A.1G (1)(iii) or I (iv), depending on the PRA-regulated activities which the PRA-authorised person or firm carries on, or is seeking to carry on. The FCA threshold conditions set out in paragraphs 3B to 3E of the Act seek to reflect this. In particular, these threshold conditions do not contain a condition relating to adequate financial resources. This is a matter that falls to be considered by the PRA under its threshold conditions. (2) The majority of the guidance in COND is intended to assist all firms to understand how the FCA will approach its assessment of the applicable FCA threshold conditions, regardless of whether or not a firm is, or is seeking to become, a PRA-authorised person. This is because the FCA threshold conditions which apply to PRA-authorised persons and those which apply to firms authorised by the FCA only are, for the most part, the same. (3) However, where guidance in COND refers to an assessment of a firms financial position or its compliance with prudential regulatory requirements, it is not intended to assist firms which are, or are seeking to become, PRA-authorised persons. This is because these are matters that are not covered by the FCAs threshold conditions, but rather fall to be considered by the PRA under its threshold conditions. (4) Although some of the PRA threshold conditions and FCA threshold conditions that apply to firms which are, or are seeking to become, PRA-authorised persons may appear to address similar subject matter, the FCA will approach the assessment of its threshold conditions with its unique statutory objectives in mind and in the light of the functions which the FCA is required to discharge in relation to them. (5) For the avoidance of doubt, the guidance in COND is not intended to apply to the PRAs assessment of its own threshold conditions in respect of a PRA-authorised person. This is a matter for the PRA alone. [deleted] "
        

In [64]:
chunks = text_splitter.split_text(store[0])

Created a chunk of size 151, which is longer than the specified 50
Created a chunk of size 101, which is longer than the specified 50
Created a chunk of size 137, which is longer than the specified 50
Created a chunk of size 179, which is longer than the specified 50
Created a chunk of size 545, which is longer than the specified 50
Created a chunk of size 119, which is longer than the specified 50
Created a chunk of size 168, which is longer than the specified 50
Created a chunk of size 309, which is longer than the specified 50
Created a chunk of size 301, which is longer than the specified 50
Created a chunk of size 159, which is longer than the specified 50
Created a chunk of size 52, which is longer than the specified 50
Created a chunk of size 103, which is longer than the specified 50
Created a chunk of size 78, which is longer than the specified 50
Created a chunk of size 101, which is longer than the specified 50
Created a chunk of size 118, which is longer than the specified 

In [65]:
chunks

['Threshold Conditions Chapter 1 Introduction Threshold Conditions Chapter 1 Introduction 1.1A Application To which threshold conditions does COND apply?',
 '.....................................................................................................',
 '(1) Section 55C of the Financial Services Act 2012 (Power to amend Schedule 6) gave HM Treasury the power to amend Schedule 6 of the Act.',
 'HM Treasury exercised this power by making The Financial Services and Markets Act 2000 (Threshold Conditions) Order 2013 which entered into force on 1 April 2013 (the "TC Order").',
 'The TC Orders main result is the creation of four sets of threshold conditions, namely: (i) conditions for firms authorised and regulated by the FCA only (paragraphs 2B to 2F of Schedule 6 to the Act) (ii) FCA specific conditions for firms authorised by the PRA and subject to dual regulation (paragraphs 3B to 3E of Schedule 6 to the Act); (iii) PRA-specific conditions for insurers (paragraphs 4A to 4F of Sc