<a href="https://colab.research.google.com/github/kt-chan/Huawei-FinGPT/blob/master/unstructured_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries


In [None]:
# System packages used by the PDF pipeline: poppler (PDF utilities), tesseract OCR with
# Simplified and Traditional Chinese language data, LibreOffice and libmagic.
# -y answers apt's confirmation prompt so the cell runs unattended under "Run all".
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra libreoffice libtesseract-dev libmagic-dev

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a fresh run picks up the latest releases — pin once a known-good set is confirmed.
%pip install unstructured unstructured-inference unstructured_pytesseract langchain openai chromadb pillow_heif pytesseract

# PDF Content Extraction

In [None]:
import os, requests
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

# Source document: a PDF hosted on Tencent's static site. It is downloaded by
# download_pdf() rather than uploaded by hand.
pdf_url = "https://static.www.tencent.com/uploads/2024/04/08/e95c902973fc282be3b3e285c6245281.pdf"
# Directory download_pdf() saves into, relative to the current working directory.
output_dir = "./"

def download_pdf(url, filename):
    """Download ``url`` and save it as ``filename`` inside ``output_dir``.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Name of the file to create inside the module-level
            ``output_dir``.

    Returns:
        The absolute path of the saved file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # os.path.join copes with output_dir values that lack a trailing separator.
    full_path = os.path.abspath(os.path.join(output_dir, filename))
    # The context manager releases the connection even if the download fails midway.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an error page under a .pdf name.
        response.raise_for_status()
        with open(full_path, 'wb') as f:
            # stream=True only helps when the body is consumed chunk by chunk.
            for chunk in response.iter_content(chunk_size=1 << 16):
                f.write(chunk)
    print(f'Download completed. File saved as: {full_path}')
    return full_path


# Settings handed to Unstructured's PDF partitioner.
# "hi_res" is the strategy used here for analyzing the PDF and extracting table structure.
strategy = "hi_res"
# Layout model; the original author rates yolox best for table extraction and lists
# detectron2_onnx and chipper as alternatives depending on file layout.
# NOTE(review): newer unstructured releases may expect `hi_res_model_name` instead of
# `model_name` — confirm against the installed version.
model_name = "yolox"

# Fetch the PDF; `filename` holds the absolute path it was saved to.
filename = download_pdf(pdf_url, 'downloaded_file.pdf')

# Partition the PDF into elements, requesting table structure inference.
elements = partition_pdf(
    filename=filename,
    model_name=model_name,
    infer_table_structure=True,
    strategy=strategy,
)


Download completed. File saved as: /content/downloaded_file.pdf


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

In [None]:

# Store the extracted elements as JSON next to the PDF ("<pdf path>.json").
# process_json_file() later reads this file to pull out the Table elements.
elements_to_json(elements, filename=f"{filename}.json") # Takes a while for file to show up on the Google Colab

In [None]:
import re
from unstructured.staging.base import elements_to_text
from unstructured.cleaners.core import clean_non_ascii_chars
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.chunking.title import chunk_by_title

# Paragraph separator: three consecutive line breaks, each with optional surrounding whitespace.
para_split_re = re.compile(r"(\s*\n\s*){3}")

# Called without a filename, elements_to_text returns the text instead of writing it.
raw_text = elements_to_text(elements)
grouped_text = group_broken_paragraphs(raw_text, paragraph_split=para_split_re)
# NOTE(review): this strips every non-ASCII character, which would include any
# Chinese text in the PDF — confirm that is intended for this document.
element_text = clean_non_ascii_chars(grouped_text)

# Persist the cleaned text. The previous code re-ran elements_to_text(elements, filename=...),
# which wrote the raw text and silently discarded the grouping and cleaning above.
with open(f"{filename}.text", "w", encoding="utf-8") as text_out:  # Takes a while for file to show up on the Google Colab
    text_out.write(element_text)



In [None]:
## In order to extract only the table elements I’ve written a helper function to do so:
import json
from html import escape

def process_json_file(input_filename, output_filename="./downloaded_file.pdf.json-tables.html"):
    """Write the tables from an Unstructured JSON dump to an HTML file.

    Every entry whose ``type`` is ``"Table"`` becomes one ``<span>``: the
    ``id`` attribute holds the element id, the ``metadata`` attribute holds
    the HTML-escaped JSON of the entry's metadata (without ``text_as_html``),
    and the span body is the entry's ``text_as_html`` markup.

    Args:
        input_filename: Path of a JSON file containing a list of element
            dicts with ``type``, ``element_id`` and ``metadata`` keys.
        output_filename: Path of the HTML file to write. The default is the
            location this function has always written to.

    Returns:
        None. The result is the file written at ``output_filename``.
    """
    # Explicit UTF-8 so behaviour does not depend on the platform's default encoding.
    with open(input_filename, 'r', encoding="utf-8") as file:
        data = json.load(file)

    # Collect the metadata of every Table entry, tagged with its element id.
    extracted_elements = []
    for entry in data:
        if entry["type"] != "Table":
            continue
        # Copy so the loaded data is left untouched; element_id is appended last,
        # which keeps the key order of the serialized metadata stable.
        metadata = dict(entry["metadata"])
        metadata["element_id"] = entry["element_id"]
        extracted_elements.append(metadata)

    with open(output_filename, 'w', encoding="utf-8") as output_file:
        for element in extracted_elements:
            # Table entries may lack 'text_as_html'; write an empty span body
            # instead of raising KeyError halfway through the file.
            text_as_html = element.pop('text_as_html', '')
            # escape() also encodes both quote characters, so the value is safe
            # inside the single-quoted attribute.
            escaped_json_string = escape(json.dumps(element))
            output_file.write("<span id='" + escape(element["element_id"]) + "' metadata='")
            output_file.write(escaped_json_string)
            output_file.write("'>")
            output_file.write(text_as_html)
            # Two newlines separate consecutive tables.
            output_file.write("</span>" + "\n\n")

# Extract the Table elements from the JSON dump into an HTML file.
process_json_file(f"{filename}.json") # Takes a while for the output .html file to show up in Colab



In [None]:
# The next step is to chunk our information into smaller, more digestible chunks for our LLMs.
# With the tables' HTML now stored in a file, LangChain's document loader can read it,
# which simplifies the subsequent steps.

from langchain.document_loaders import TextLoader

# Path written by process_json_file(). It is assigned here because no earlier cell
# defines `text_file`, so a fresh-kernel run would otherwise stop with a NameError.
text_file = "./downloaded_file.pdf.json-tables.html"

loader = TextLoader(text_file)
documents = loader.load()