<a href="https://colab.research.google.com/github/kt-chan/Huawei-FinGPT/blob/master/unstructured_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries


In [None]:
!apt-get update
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra libreoffice libtesseract-dev libmagic-dev

In [None]:
!pip install unstructured unstructured-inference unstructured_pytesseract langchain openai chromadb pillow_heif pytesseract

# PDF Content Extraction

In [None]:
!mkdir ./files

mkdir: cannot create directory ‘./files’: File exists


In [None]:
import os, requests
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

# For this notebook I uploaded Nvidia's earnings into the Files directory called "/content/"
pdf_url = "https://static.www.tencent.com/uploads/2024/04/08/e95c902973fc282be3b3e285c6245281.pdf"
output_dir = "./files/"

def download_pdf(url, filename):
    response = requests.get(url, stream=True)
    full_path = os.path.abspath(output_dir+filename)
    with open(full_path, 'wb') as f:
        f.write(response.content)
    print(f'Download completed. File saved as: {full_path}')
    return full_path


filename = download_pdf(pdf_url, 'downloaded_file.pdf')


Download completed. File saved as: /content/files/downloaded_file.pdf


Parittion the pdf files

In [None]:
import re
from unstructured.staging.base import elements_to_text
from unstructured.cleaners.core import clean_non_ascii_chars
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.chunking.title import chunk_by_title

# Partition PDF
# Define parameters for Unstructured's library
strategy = "hi_res" # Strategy for analyzing PDFs and extracting table structure
model_name = "yolox" # Best model for table extraction. Other options are detectron2_onnx and chipper depending on file layout

# Extracts the elements from the PDF
elements = partition_pdf(
  filename=filename,
  strategy=strategy,
  infer_table_structure=True,
  model_name=model_name
)



# Extract tables
para_split_re = re.compile(r"(\s*\n\s*){3}")
element_text = elements_to_text(elements)
element_text = group_broken_paragraphs(element_text, paragraph_split=para_split_re)
element_text = clean_non_ascii_chars(element_text)

# Store results in files
elements_to_json(elements, filename=f"{filename}.json") # Takes a while for file to show up on the Google Colab
elements_to_text(elements, filename=f"{filename}.text") # Takes a while for file to show up on the Google Colab

# Information Retrieval for Table Data

Extract tables from paritions

In [None]:
## In order to extract only the table elements I’ve written a helper function to do so:
import json
from html import escape

def process_json_file(input_filename):
  # Read the JSON file
  with open(input_filename, 'r') as file:
    data = json.load(file)

    # Iterate over the JSON data and extract required table elements
    extracted_elements = []
    for entry in data:
      if entry["type"] == "Table":
        entry["metadata"]["element_id"] = entry["element_id"]
        extracted_elements.append(entry["metadata"])

  # Write the extracted elements to the output file
  with open(output_dir+"/downloaded_file.pdf.json-tables.html", 'w') as output_file:
    for element in extracted_elements:
      output_file.write("<span id=\'" + element["element_id"]  + "\' metadata=\'")
      text_as_html = element.pop('text_as_html')
      # Convert the dictionary to a JSON string
      json_string = json.dumps(element)
      escaped_json_string = escape(json_string)
      output_file.write(escaped_json_string)
      output_file.write("\'>")
      output_file.write(text_as_html) # Adding two newlines for separation
      output_file.write("</span>" + "\n\n")

process_json_file(f"{filename}.json") # Takes a while for the .txt file to show up in Colab



In [251]:
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display


# Define a fuctnion to clean tables headings
def clean_table(ds_table):
    df_table =  pd.DataFrame(ds_table)
    df_numeric = df_table.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')
    df_combined = pd.concat([df_table.iloc[:, 0], df_numeric], axis=1)
    df_cleaned = df_combined.dropna(how='any')
    return df_cleaned


# Define a function to clean the data
def clean_text(text):
    # Check if the text is a string
    if isinstance(text, str):
        # Replace unwanted characters if it's a string
        text = text.replace('#', '').replace('*', '')
        # Check if the text is a number with commas
        if re.match(r'^-?\d{1,3}(,\d{3})*\.\d+$', text):
            # Remove commas and convert to float
            return float(text.replace(',', ''))
        # Check if the text is an integer with commas
        elif re.match(r'^-?\d{1,3}(,\d{3})*$', text):
            # Remove commas and convert to integer
            return int(text.replace(',', ''))
    # Return the text as is if it's not a string
    return text


html_content = None
# Your HTML content as a string
file_path = output_dir + '/downloaded_file.pdf.json-tables.html'
# Write the HTML content to the file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
from bs4 import BeautifulSoup
import pandas as pd

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Initialize an empty dictionary to hold the tables with span_id as the key
# and the table data as the value
tables_dict = {}


# Find all span elements with an 'id' attribute
for span in soup.find_all('span', id=True):
    span_id = span['id']

    # Initialize a dictionary to hold the metadata and table data for the current span
    span_data = {}

    # Extract the 'metadata' attribute if it exists
    metadata = span.get('metadata', None)
    if metadata:
        span_data['metadata'] = metadata

    # Find the table within the span element
    table = span.find('table')
    if table:
        # Use pandas to read the table
        table_df = pd.read_html(str(table)) # Get the first DataFrame
        span_data['table'] = table_df

    # Add the span data to the main dictionary using the span_id as the key
    tables_dict[span_id] = span_data


# Get the first key-value pair based on insertion order
datasets = []
for span_id, span_data in tables_dict.items():
    # Access the metadata and the table DataFrame
    metadata = span_data.get('metadata')
    tables_df_tmp = span_data.get('table')
    tables_df= []
    for ds_table in tables_df_tmp:
      ds_table = ds_table.applymap(clean_text)
      tables_df.append(ds_table)

    datasets.append({
            "id": span_id,  # Corrected the syntax for dictionary keys (no quotes)
            "meta": metadata,  # Corrected the variable name ('metadata' instead of 'metdata')
            "data": tables_df,  # Assuming you want to store the first DataFrame in the list
            "data_raw": tables_df_tmp  # raw data format, without formatting
        })


This is for testing the maths group function on clean tables.

In [None]:
## This is for testing the maths group function on clean tables.
# idx = 0


# for dataset in datasets[idx]["data_raw"]:
#   display(dataset)
#   break


# for dataset in datasets[idx]["data"]:
#   dataset = clean_table(dataset)
#   dataset = dataset.iloc[:,1:]
#   dataset = dataset.sum().to_frame(name="sum")
#   display(dataset)
#   break



# Save files to Drive

In [None]:
import os
import shutil
from google.colab import drive

# Mount your Google Drive to the Colab environment
drive.mount('/content/drive')

# Define the source directory (local to the Colab environment)
source_dir = '/content/files'  # Update this to the correct path of your "output" directory

# Define the target directory in your Google Drive
target_dir = '/content/drive/MyDrive/Colab Notebooks/files'  # Update this to your desired path

# Make sure the target directory exists, if not create it
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

# Copy all files from the source directory to the target directory
for file_name in os.listdir(source_dir):
    # Construct full file path
    file_path = os.path.join(source_dir, file_name)

    # Check if it is a file and not a directory, then copy it
    if os.path.isfile(file_path):
        # Define the target file path
        target_file_path = os.path.join(target_dir, file_name)

        # Copy the file using shutil.copy2 to preserve metadata
        shutil.copy2(file_path, target_file_path)
        print(f'File copied: {file_name}')

print('All files copied to Google Drive.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File copied: downloaded_file.pdf.json
File copied: downloaded_file.pdf.json-tables.html
File copied: downloaded_file.pdf.text
File copied: downloaded_file.pdf
All files copied to Google Drive.
