In [1]:
import json
import re

import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse

In [2]:
# API key handed to LlamaParse as `api_key` when the parser is constructed.
# Left empty on purpose so no secret is stored in the notebook.
# NOTE(review): presumably LlamaParse falls back to the LLAMA_CLOUD_API_KEY
# environment variable when this is empty - confirm against the library docs.
LLAMA_CLOUD_API_KEY =''

In [3]:
# Patch asyncio so the awaited LlamaParse calls below can run inside the
# notebook's already-running event loop.
nest_asyncio.apply()

# Load variables from a local .env file into the process environment.
# `load_dotenv` is provided by python-dotenv and is imported in the import
# cell at the top of the notebook.
load_dotenv()

# Shared parser used by every parsing cell below. Pages come back as markdown,
# which is the format clean_extracted_text_with_newlines() expects.
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",
    num_workers=4,
    verbose=True,
    language="en",
)

In [4]:
# Parse SRS 1: send the PDF through the shared LlamaParse parser and wait for
# the result. The returned items are later read page by page via `.text`.

srs_1_path = "./srs_1.pdf"
async_srs_1_document = await parser.aload_data(srs_1_path)

Started parsing the file under job_id 7ee81f1d-7b02-4c74-8aa9-0852627c1438
....

In [5]:
# Parse SRS 2: send the PDF through the shared LlamaParse parser and wait for
# the result. The returned items are later read page by page via `.text`.

srs_2_path = "./srs_2.pdf"
async_srs_2_document = await parser.aload_data(srs_2_path)

Started parsing the file under job_id 8cbf39a7-920c-4234-b513-b8d7ef55b46c
..............

In [28]:
# Parse SRS 3: send the PDF through the shared LlamaParse parser and wait for
# the result. The returned items are later read page by page via `.text`.
srs_3_path = "./srs_3.pdf"
async_srs_3_document = await parser.aload_data(srs_3_path)

Started parsing the file under job_id 9b06dfaa-a23e-4951-b24d-4d0bdbd69440
.

In [8]:
def clean_extracted_text_with_newlines(text):
    """Split one page of parsed markdown into a list of cleaned sentences.

    Processing order:
      1. Drop markdown heading lines and table separator rows while each one
         still occupies a single physical line.
      2. Break the remaining text into candidate sentences: after every
         period followed by whitespace, and at every run of two or more
         spaces.
      3. Replace table pipes with spaces and normalise whitespace.
      4. Keep only candidates with more than two whitespace-separated tokens.

    Args:
        text: Markdown text of a single page. Empty or ``None`` yields ``[]``.

    Returns:
        list[str]: Cleaned sentences, each with single spaces between tokens.
    """
    if not text:
        return []

    # Headings must go before sentence splitting: a heading such as
    # "# 1. Introduction" would otherwise be cut after "1." and its tail would
    # survive as a bogus sentence. `.*` (not `\s*.*`) keeps the match on one
    # line so an empty heading cannot swallow the line that follows it.
    text = re.sub(r'^#+.*$', '\n', text, flags=re.MULTILINE)
    # Table separator rows such as |---|---| carry no content.
    text = re.sub(r'^\|[-\s|]+\|$', '\n', text, flags=re.MULTILINE)

    # Sentence boundaries: whitespace after a period, or a wide gap of spaces.
    text = re.sub(r'(?<=\.)\s+', '\n', text)
    text = re.sub(r' {2,}', '\n', text)

    # NOTE(review): this matches the literal text "segment_re_text" at the
    # start of a line; it looks like a leftover placeholder - confirm intent.
    text = re.sub(r'^segment_re_text', '\n', text, flags=re.MULTILINE)

    # Remaining pipes are table cell borders; turn them into plain spacing.
    text = text.replace("|", " ")

    sentences = []
    for line in text.split('\n'):
        # split() without arguments ignores the empty strings produced by
        # consecutive spaces, so only real words count toward the threshold.
        tokens = line.split()
        if len(tokens) > 2:
            sentences.append(' '.join(tokens))
    return sentences

In [13]:
def parse_pages_to_sentence(pages):
    """Return the cleaned sentences of all pages as one flat list.

    Each item in ``pages`` must expose a ``text`` attribute; that text is run
    through ``clean_extracted_text_with_newlines`` and the per-page results
    are concatenated in page order.
    """
    sentences = []
    for page in pages:
        sentences.extend(clean_extracted_text_with_newlines(page.text))
    return sentences

In [18]:
# Flatten every parsed page of SRS 1 into a single list of cleaned sentences.
srs_1 = parse_pages_to_sentence(async_srs_1_document)

In [19]:
# `json` is imported in the import cell at the top of the notebook.

# Label attached to every exported sentence; the SRS 2 and SRS 3 export cells
# reuse this same list.
# NOTE(review): presumably a placeholder that is corrected during annotation.
default_label = ["non_requirement_text"]

# Export SRS 1 as JSONL: one {"text": ..., "label": ...} object per line.
with open("./srs_1.jsonl", "w") as f:
    f.writelines(
        json.dumps({"text": line, "label": default_label}) + "\n"
        for line in srs_1
    )

In [22]:
# Flatten every parsed page of SRS 2 into a single list of cleaned sentences.
srs_2 = parse_pages_to_sentence(async_srs_2_document)

In [23]:
# Export SRS 2 as JSONL: one {"text": ..., "label": ...} object per line,
# every sentence carrying the shared default label.
with open("./srs_2.jsonl", "w") as f:
    f.writelines(
        json.dumps({"text": line, "label": default_label}) + "\n"
        for line in srs_2
    )

In [29]:
# Flatten every parsed page of SRS 3 into a single list of cleaned sentences.
srs_3 = parse_pages_to_sentence(async_srs_3_document)

In [30]:
# Export SRS 3 as JSONL: one {"text": ..., "label": ...} object per line,
# every sentence carrying the shared default label.
with open("./srs_3.jsonl", "w") as f:
    f.writelines(
        json.dumps({"text": line, "label": default_label}) + "\n"
        for line in srs_3
    )