### Installing and importing dependencies

In [1]:
!pip install PyPDF2 pytesseract
!pip install pymupdf
!pip install langchain
!pip install pathlib
!pip install sentence-transformers
!pip install glob2
!pip install pdf2image
!apt-get install -y poppler-utils



E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


#### Note: Google Colab was used for this notebook due to issues with installing tesseract-ocr locally.

In [2]:
!sudo apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libtesseract5
The following NEW packages will be installed:
  libtesseract5
The following packages will be upgraded:
  tesseract-ocr
1 upgraded, 1 newly installed, 0 to remove and 568 not upgraded.
Need to get 1,648 kB of archives.
After this operation, 4,056 kB of additional disk space will be used.
Err:1 http://ppa.launchpad.net/alex-p/tesseract-ocr5/ubuntu focal/main amd64 libtesseract5 amd64 5.3.1-1ppa1~focal1
  Connection failed [IP: 185.125.190.52 80]
Err:2 http://ppa.launchpad.net/alex-p/tesseract-ocr5/ubuntu focal/main amd64 tesseract-ocr amd64 5.3.1-1ppa1~focal1
  Connection failed [IP: 185.125.190.52 80]
E: Failed to fetch http://ppa.launchpad.net/alex-p/tesseract-ocr5/ubuntu/pool/main/t/tesseract/libtesseract5_5.3.1-1ppa1~focal1_amd64.deb  Connection failed [IP: 185.125.190.52 80]
E: Failed to fetch http://ppa.launchpad.net/al

In [None]:
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
import os
import re
import glob
import fitz
from PIL import Image
from PyPDF2 import PdfReader
from transformers import LlamaTokenizerFast
from langchain.text_splitter import TextSplitter, CharacterTextSplitter

### Purpose of this notebook:
1. Extracts text from a PDF file (and saves it into a .txt file)
2. Splits the extracted text into chunks and wraps each chunk in a prompt that can be fed into an LLM to generate Q&A pairs

### Defining functions used for extracting text from the PDF file
Note: CHUNK_SIZE is set to 1000 tokens (as counted by the tokenizer used in `text_chunker`), with an overlap of 10% of CHUNK_SIZE between consecutive chunks; feel free to experiment with this value.

In [None]:
def extract_text_with_ocr(pdf_path, lang='eng', tesseract_cmd='/usr/bin/tesseract'):
    """Render a PDF to images and return the OCR text of all pages as one string.

    Every page is converted to an image with pdf2image, read with Tesseract
    through pytesseract, passed through ``remove_diagram_text``, and the
    per-page results are concatenated in page order without any separator.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Tesseract language code handed to ``pytesseract.image_to_string``.
        tesseract_cmd: Path of the tesseract executable. Pass ``None`` (or an
            empty string) to keep pytesseract's current setting untouched.

    Returns:
        str: The filtered OCR text of every page, concatenated.
    """
    # Point pytesseract at the binary before any OCR call is made.
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Convert PDF pages to images using pdf2image
    images = convert_from_path(pdf_path)

    # OCR each page, drop diagram lines, and join once at the end instead of
    # growing a string with += on every iteration.
    return "".join(
        remove_diagram_text(pytesseract.image_to_string(image, lang=lang))
        for image in images
    )

def remove_diagram_text(page_text, diagram_keywords=None):
    """Drop every line of ``page_text`` that mentions a diagram keyword.

    A line is removed when it contains one of the keywords as a whole word
    (case-insensitive, with an optional trailing "s"). Matching whole words
    keeps ordinary prose such as "paragraph", "configure" or "charter", which
    merely contain "graph", "figure" or "chart" as substrings.

    Args:
        page_text: Text of one page; lines are separated by '\\n'.
        diagram_keywords: Optional iterable of keywords that mark diagram
            text. Defaults to ['diagram', 'chart', 'figure', 'graph'].
            An empty iterable disables filtering.

    Returns:
        str: The remaining lines joined back together with '\\n'.
    """
    if diagram_keywords is None:
        diagram_keywords = ['diagram', 'chart', 'figure', 'graph']

    alternatives = '|'.join(re.escape(keyword) for keyword in diagram_keywords)
    if not alternatives:
        # Nothing to look for; an empty alternation would match every line.
        return page_text

    # Lookarounds instead of \b so keywords that end in punctuation still work.
    keyword_pattern = re.compile(
        r'(?<!\w)(?:' + alternatives + r')s?(?!\w)', re.IGNORECASE
    )

    # Split the page text into lines and keep only those without a keyword
    kept_lines = [
        line for line in page_text.split('\n') if not keyword_pattern.search(line)
    ]

    # Join the filtered lines back into a single string
    return '\n'.join(kept_lines)


CHUNK_SIZE = 1000  # default chunk length; text_chunker measures it with the tokenizer it loads

def text_chunker(file, chunk_size=None, chunk_overlap=None):
  """Split a document string into overlapping chunks.

  Lengths are measured with the "hf-internal-testing/llama-tokenizer"
  tokenizer, which is loaded (and downloaded if necessary) on every call.

  Args:
    file: The full document text as a string (despite the name, this is the
      text itself, not a file object or path).
    chunk_size: Target chunk length. Defaults to the module-level CHUNK_SIZE,
      read at call time.
    chunk_overlap: Overlap between consecutive chunks. Defaults to 10% of
      chunk_size, the starting point recommended by
      https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-chunk-documents

  Returns:
    The chunks produced by ``CharacterTextSplitter.split_text``.

  NOTE(review): CharacterTextSplitter appears to cut only at its separator, so
  a stretch of text without one may exceed chunk_size - confirm against the
  LangChain documentation.
  """
  if chunk_size is None:
    chunk_size = CHUNK_SIZE
  if chunk_overlap is None:
    chunk_overlap = chunk_size // 10

  tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
  text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size = chunk_size, chunk_overlap = chunk_overlap)

  # pass in the text to be split
  return text_splitter.split_text(file)

def clean_text(text):
    """Strip bracketed markers and collapse repeated spaces.

    Args:
        text: The raw document text.

    Returns:
        str: ``text`` with every ``[...]`` span that sits on a single line
        removed (this is how reference markers such as ``[12]`` are dropped;
        any other bracketed text on one line is removed too) and every run of
        two or more spaces reduced to a single space. Newlines and tabs are
        left untouched.
    """
    # Remove the bracketed markers first so the gap each one leaves behind is
    # collapsed by the next step. Raw string: "\[" is not a valid str escape.
    text = re.sub(r"\[.*?\]", "", text)

    # Collapse whole runs of spaces; a single str.replace("  ", " ") pass
    # would still leave doubles behind in runs of three or more.
    text = re.sub(r" {2,}", " ", text)

    return text

def remove_images(input_pdf, output_pdf):
    """Write a copy of ``input_pdf`` with its page images deleted.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the modified PDF is saved to.
    """
    # The context manager closes the document even if deleting or saving fails.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            # The first item of each get_images() entry identifies the image
            # and is what delete_image expects.
            for img in page.get_images():
                page.delete_image(img[0])

        doc.save(output_pdf)

### Defining PDF name

In [None]:
# Base name of the PDF, without the ".pdf" extension; the cells below build
# their input and output file names from it.
PDF_NAME = 'dh162021-22_full issue'

### Remove PDF images and extract text from PDF using OCR
Note: if you already have the extracted PDF text in a .txt file, skip ahead to <b>Import extracted PDF text from a .txt file</b> as the text extraction step is expensive.

In [None]:
# if you have many PDFs, wrap the below code cells in a for loop

# Source PDF and the image-free copy that remove_images writes next to it.
pdf = f"{PDF_NAME}.pdf"
pdf_no_images = f"{PDF_NAME}_no_images.pdf"
remove_images(pdf, pdf_no_images)

In [None]:
# Expensive: runs OCR over every page of the image-free PDF.
# Import from the {PDF_NAME}_raw.txt file (below) instead if it already exists.
document = extract_text_with_ocr(pdf_no_images)

### Write extracted PDF text into a .txt file

In [None]:
# Save the extracted text so later runs can load it instead of repeating the OCR step.
with open(f'{PDF_NAME}_raw.txt', 'w') as f:
    f.write(document)

### Import extracted PDF text from a .txt file

In [None]:
# If importing from {PDF_NAME}_raw.txt file:
# (leading and trailing whitespace is stripped from the loaded text)
with open(f'{PDF_NAME}_raw.txt', 'r') as f:
  document = f.read().strip()

175085

### Cleaning and chunking text

In [None]:
# preprocess: strip bracketed markers and doubled spaces from the raw text
clean_doc = clean_text(document)

# chunk doc: split the cleaned text into a list of overlapping chunks
chunked_text_list = text_chunker(clean_doc)

Downloading (…)okenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

### Wrapping each chunk in a prompt and storing the prompts in a list to be fed into an LLM

In [None]:
prompt = "Based on the text, generate 10 different question and answer pairs.\nText:\n"

# One record per chunk: a 1-based id plus the instruction prompt followed by the chunk text.
final_chunk_list = [
    {'question_id': chunk_number, 'text': prompt + "\n" + chunk_text}
    for chunk_number, chunk_text in enumerate(chunked_text_list, start=1)
]

In [None]:
# Preview the first ten prompt records.
print(final_chunk_list[:10])

[{'question_id': 1, 'text': 'Based on the text, generate 10 different question and answer pairs.\nText:\n\nDSTA HORIZONS EDITORIAL TEAM\n\nEditor\nKoh Tuan Yew\n\nCo-Editor\nLee Siang Meng Alex\n\nMembers\n\nCai Kunming Alvin Ho Kwee Peng Juli\nChang Chai Fung Lin Jyh Fang Kelvin\nChim Tat Wee Reman Loh Kai Ip Alvin\nChua Siew Ting Pearly Loke Yim Peng\n\nGoh Shi Hui Jaime Loo Jang Wei\n\nHeng Chye Hwee Ng Yeow Chong Ivan\n\nHeng Yinghui Elizabeth\n\nTechnical Editor\n\nProfessor Khoo Boo Cheong\nTemasek Laboratories\nNational University of Singapore\n\nReaders can access DSTA Horizons at\nwww.dsta.gov.sg/dstahorizons\n\nWe welcome your feedback. Please send all correspondence to:\n\nDSTA Horizons Editorial Team\nDSTA Academy\n\n1 Depot Road\n\nSingapore 109679\n\nEmail: dstahorizons@dsta.gov.sg\n\nDSTA Horizons\n\nIssue 16\n\nISSN 2339-529X (print) ISSN 2339-5303 (online)\n©2022 Defence Science and Technology Agency\n\nNo part of this publication may be reproduced, stored or transmitt

### Saving the questions into a .jsonl file

In [None]:
import json

def save_to_jsonl(lst, file_path):
    """Write every item of ``lst`` to ``file_path`` as one JSON document per line.

    The file is opened in write mode, so an existing file is overwritten.
    """
    # Serialise lazily; each record becomes a single newline-terminated line.
    serialized_lines = (json.dumps(record) + '\n' for record in lst)
    with open(file_path, 'w') as out_file:
        out_file.writelines(serialized_lines)

In [None]:
# Saved relative to the working directory, like the other files this notebook
# reads and writes, instead of the Colab-only '/content/' directory.
save_to_jsonl(final_chunk_list, f'{PDF_NAME}_questions.jsonl')