# Setup

In [13]:
import os
import glob
import time
import json
from IPython.display import display
import pandas as pd
from PIL import Image

import gmft
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import CroppedTable, TableDetector
from gmft._rich_text.rich_page import FormattedPage
detector = TableDetector()
from gmft.auto import AutoTableFormatter
from gmft.auto import AutoFormatConfig
config = AutoFormatConfig()
config.semantic_spanning_cells = True # [Experimental] better spanning cells
config.enable_multi_header = True # multi-indices
formatter = AutoTableFormatter(config)

def ingest_pdf(pdf_path) -> list[CroppedTable]:
    doc = PyPDFium2Document(pdf_path)

    tables = []
    for page in doc:
        tables += detector.extract(page)
    return tables, doc

In [14]:
import math
from PIL import Image

import pandas as pd
import pickle
from pydantic import BaseModel
from typing import Any
import warnings
warnings.filterwarnings("ignore")

# Checking pickle files

In [147]:
# table_position_path = "./lea_storage/table_position.csv"
text_position_path = "./linda_storage/table_title_all_book.pkl"
with open(text_position_path, 'rb') as file:
    text_pos = pickle.load(file)

In [178]:
text_pos[:5]

[{'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 17,
  'position': (60.0, 58.0, 215.26011657714844, 70.0),
  'text': 'Table 1-1 Extrinsic Esophageal Masses'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 16,
  'position': (306.0, 58.0, 476.4881591796875, 70.0),
  'text': 'Table 1-2 Esophageal Submucosal Masses'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 18,
  'position': (42.0, 58.0, 244.8881072998047, 70.0),
  'text': 'Table 1-3 Submucosal Benign Esophageal Tumors'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 20,
  'position': (306.0,
   229.89398193359375,
   488.2601623535156,
   241.89398193359375),
  'text': 'Table 1-4 Esophageal Mucosal Abnormalities'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 21,
  'position': (6

In [146]:
# Load book table
table_position_path = "./linda_storage/processed_books_extracted_tablesv4/Radiology Illustrated_ Hepatobiliary and Pancreatic Radiology ( PDFDrive )/table_positions_Radiology Illustrated_ Hepatobiliary and Pancreatic Radiology ( PDFDrive ).csv"
table_pos = pd.read_csv(table_position_path)
table_pos['pos'] = list(zip(table_pos['x0'], table_pos['y0']))
table_pos.head()

Unnamed: 0,pdf_name,page_number,table_index,x0,y0,pos,image_path,csv_path
0,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,4,1,74.617561,72.363022,"(74.61756134033203, 72.36302185058594)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
1,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,19,2,55.261497,69.506111,"(55.261497497558594, 69.50611114501953)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
2,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,23,3,54.445557,72.900429,"(54.445556640625, 72.90042877197266)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
3,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,29,4,55.111008,73.805336,"(55.11100769042969, 73.80533599853516)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
4,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,39,5,55.771336,74.529961,"(55.77133560180664, 74.52996063232422)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...


In [163]:
table_pos['pdf_name'][300]

'Radiology Illustrated_ Hepatobiliary and Pancreatic Radiology ( PDFDrive )_mainpages.pdf'

In [145]:
for text in text_pos:
    text['page_num'] += 1
    df = table_pos[table_pos['page_number']==text['page_num']]
    if len(df) > 0:
        df['distance'] = df['pos'].apply(lambda x: math.dist(x, text['position'][0:2]))
        page_num = text['page_num']
        idx = df.loc[df['distance'].idxmin()]['table_index']
        pdf_name = text['book_title'][:-10]
        table_path = f"./linda_storage/processed_books_extracted_tablesv4/{pdf_name}/images/{page_num}_{idx}_{pdf_name}.png"
        df_path = f"./linda_storage/processed_books_extracted_tablesv4/{pdf_name}/tables/{page_num}_{idx}_{pdf_name}.csv"
        text['image_path'] = table_path
        text['df_path'] = df_path

# updated matching code

In [186]:
# Example book title to filter
book_to_filter = "gastrointestinal-imaging-the-requisites-fourth-edition_mainpages"

# Filter text_pos by the given book title
filtered_text_pos = [text for text in text_pos if text['book_title'] == book_to_filter]
filtered_text_pos[0]

{'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
 'page_num': 17,
 'position': (60.0, 58.0, 215.26011657714844, 70.0),
 'text': 'Table 1-1 Extrinsic Esophageal Masses'}

In [181]:
# Example book title to filter
book_to_filter = "gastrointestinal-imaging-the-requisites-fourth-edition_mainpages"

# Filter text_pos by the given book title
filtered_text_pos = [text for text in text_pos if text['book_title'] == book_to_filter]

# Normalize pdf_name by removing '.pdf' suffix for matching
for text in filtered_text_pos:
    pdf_name = table_pos['pdf_name'][:-4]  # Removing '.pdf'
    
    # Filter table_pos by matching book title (normalized pdf_name) and page number
    matching_tables = table_pos[
        (table_pos['page_number'] == text['page_num']) & 
        (table_pos['book_title'] == text['book_title'])
    ]

    if len(matching_tables) > 0:
        # Calculate distances and find the closest table
        matching_tables['distance'] = matching_tables['pos'].apply(
            lambda x: math.dist(x, text['position'][0:2])
        )
        idx = matching_tables.loc[matching_tables['distance'].idxmin()]['table_index']

        # Construct paths
        page_num = text['page_num']
        table_path = f"./linda_storage/processed_books_extracted_tablesv4/{pdf_name}/images/{page_num}_{idx}_{pdf_name}.png"
        df_path = f"./linda_storage/processed_books_extracted_tablesv4/{pdf_name}/tables/{page_num}_{idx}_{pdf_name}.csv"

        # Add paths to the text dictionary
        text['image_path'] = table_path
        text['df_path'] = df_path

# Output the filtered dataset with paths added
print(filtered_text_pos)


KeyError: 'book_title'

In [140]:
text_pos[:5]

[{'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 14,
  'position': (60.0, 58.0, 215.26011657714844, 70.0),
  'text': 'Table 1-1 Extrinsic Esophageal Masses'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 17,
  'position': (306.0, 58.0, 476.4881591796875, 70.0),
  'text': 'Table 1-2 Esophageal Submucosal Masses',
  'image_path': './linda_storage/processed_books_extracted_tablesv4/gastrointestinal-imaging-the-requisites-fourth-edition/images/17_3_gastrointestinal-imaging-the-requisites-fourth-edition.png',
  'df_path': './linda_storage/processed_books_extracted_tablesv4/gastrointestinal-imaging-the-requisites-fourth-edition/tables/17_3_gastrointestinal-imaging-the-requisites-fourth-edition.csv'},
 {'book_title': 'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages',
  'page_num': 19,
  'position': (42.0, 58.0, 244.8881072998047, 70.0),
  'text': 'Table 1-3 Submucosal Benign 

# Loading book directories

In [15]:
# Specify the directory path
directory_path = './pdf_data/LIRADS_books_processed_v2'
file_paths = glob.glob(directory_path + '/**/*_mainpages.pdf', recursive=True)

# Initialize empty lists to store data
document_type = []
book_names = []  # For the modified filename (book_name)
full_paths = []

for file_path in file_paths:
    dirname, filename = os.path.split(file_path)
    
    # Set 'dirname' to 'book' for all entries
    document_type.append('Book')
    
    # Extract the book name by removing '_mainpages.pdf'
    book_name = filename
    # book_name = filename.replace('_mainpages.pdf', '')
    book_names.append(book_name)
    
    full_paths.append(file_path)

# Create a DataFrame
books = pd.DataFrame({
    'document_type': document_type,  # 'dirname' becomes 'book'
    'book_name': book_names,  # 'filename' becomes 'book_name'
    'full_path': full_paths
})

# Convert full_path to list if needed
dirname_list = books['full_path'].tolist()

# Print the DataFrame
print(books)

  document_type                                          book_name  \
0          Book  Diagnostic Imaging_ Gastrointestinal ( PDFDriv...   
1          Book  Liver imaging _ MRI with CT correlation ( PDFD...   
2          Book  Mayo Clinic Gastrointestinal Imaging Review ( ...   
3          Book  Radiology Illustrated_ Hepatobiliary and Pancr...   
4          Book  CT and MRI of the Whole Body, 2-Volume Set, 6e...   
5          Book  gastrointestinal-imaging-the-requisites-fourth...   

                                           full_path  
0  ./pdf_data/LIRADS_books_processed_v2/Diagnosti...  
1  ./pdf_data/LIRADS_books_processed_v2/Liver ima...  
2  ./pdf_data/LIRADS_books_processed_v2/Mayo Clin...  
3  ./pdf_data/LIRADS_books_processed_v2/Radiology...  
4  ./pdf_data/LIRADS_books_processed_v2/CT and MR...  
5  ./pdf_data/LIRADS_books_processed_v2/gastroint...  


In [156]:
pdfs_folder='pdf_data/LIRADS_books_processed/'
pdf_list= books['book_name']
print(pdf_list)

0    Diagnostic Imaging_ Gastrointestinal ( PDFDriv...
1    Liver imaging _ MRI with CT correlation ( PDFD...
2    Mayo Clinic Gastrointestinal Imaging Review ( ...
3    Radiology Illustrated_ Hepatobiliary and Pancr...
4    CT and MRI of the Whole Body, 2-Volume Set, 6e...
5    gastrointestinal-imaging-the-requisites-fourth...
Name: book_name, dtype: object


# Function for deleting folders if files are wrongly generated

In [154]:
# import os
# import shutil

# # Specify the directory path
# directory_path = './linda_storage/processed_books_extracted_tablesv4/'

# # Loop through each item in the directory
# for item in os.listdir(directory_path):
#     item_path = os.path.join(directory_path, item)

#     # Check if the item is a directory, if so, delete it
#     if os.path.isdir(item_path):
#         shutil.rmtree(item_path)  # Deletes the directory and its contents

 # Old Process Books code

In [23]:
# import os
# import time
# import pandas as pd

# # Initialize the variables to track time
# _total_detect_time = 0
# _total_detect_num = 0
# _total_format_time = 0
# _total_format_num = 0
# table_details = []  # List to store table details (page no, index, coordinates)

# # Process each PDF in the list
# for paper in pdf_list:
    
#     results = []
#     images = []
#     dfs = []
    
#     start = time.time()
    
#     # Ingest the PDF file and extract tables
#     tables, doc = ingest_pdf('./pdf_data/LIRADS_books_processed/' + paper)
#     num_pages = len(doc)
#     end_detect = time.time()

#     formatted_tables = []

#     # Create a directory for each PDF book
#     book_folder = f'./linda_storage/processed_books_extracted_tablesv4/{paper[:-14]}'
#     images_folder = os.path.join(book_folder, 'images')
#     tables_folder = os.path.join(book_folder, 'tables')
    
#     # Ensure directories for images and tables exist
#     os.makedirs(images_folder, exist_ok=True)
#     os.makedirs(tables_folder, exist_ok=True)
    
#     # Loop over the extracted tables
#     for i, table in enumerate(tables):
#         ft = formatter.extract(table)  # Process the table using the formatter
        
#         # Extract the table's information in dictionary format
#         table_info = ft.to_dict()
        
#         # Get the page number from the dictionary
#         page_number = table_info.get('page_no', 'unknown') + 1  # As pg no starts at 0
#         z = i + 1  # As i starts at 0

#         # Get table coordinates (bounding box)
#         bbox = table_info.get('bbox', (0, 0, 0, 0))  # Ensure bbox exists, default to (0, 0, 0, 0)

#         # Save the table details (page number, table index, and coordinates)
#         table_details.append({
#             'pdf_name': paper,
#             'page_number': page_number,
#             'table_index': z,
#             'x0': bbox[0],
#             'y0': bbox[1],
#             'pos': (bbox[0], bbox[1])
#             # 'x1': bbox[2],
#             # 'y1': bbox[3]
#         })
        
#         # Try to append the dataframe and handle any exceptions
#         try:
#             df = ft.df()
#             dfs.append(df)

#             # Save the dataframe to CSV in the tables folder
#             table_filename = f"{page_number}_{z}_{paper[:-14]}.csv"
#             table_path = os.path.join(tables_folder, table_filename)
#             df.to_csv(table_path, index=False)
#         except Exception as e:
#             print(e)
#             dfs.append(None)
        
#         formatted_tables.append(ft)
        
#         # Save the image corresponding to the table in the images folder
#         image = ft.image()
#         images.append(image)
        
#         # Generate the image filename with page number and table index
#         image_filename = f"{page_number}_{z}_{paper[:-14]}.png"
#         image_path = os.path.join(images_folder, image_filename)
        
#         # Save the image
#         image.save(image_path)
    
#     end_format = time.time()

#     # Close the document after processing
#     doc.close()
    
#     # Store results and print time details
#     results += formatted_tables
#     print(f"Paper: {paper}\nDetect time: {end_detect - start:.3f}s for {num_pages} pages")
#     print(f"Format time: {end_format - end_detect:.3f}s for {len(tables)} tables\n")
    
#     _total_detect_time += end_detect - start
#     _total_detect_num += num_pages
#     _total_format_time += end_format - end_detect
#     _total_format_num += len(tables)

#     # Save table details (coordinates) as a DataFrame for this PDF
#     details_df = pd.DataFrame(table_details)
#     details_df.to_csv(f'{book_folder}/table_positions_{paper[:-14]}.csv', index=False)

# # Print macro statistics for time tracking
# print(f"Macro: {_total_detect_time/_total_detect_num:.3f} s/page and {_total_format_time/_total_format_num:.3f} s/table")


The identified boxes have significant overlap: 27.20% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 30.26% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 43.74% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 36.53% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 25.85% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 33.29% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 21.79% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 34.45% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 29.11% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 92.40% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 80.82% of area is overl

# Updated Book code 
- to include table img path, table df path and pos

In [155]:
import os
import time
import pandas as pd

# Initialize the variables to track time
_total_detect_time = 0
_total_detect_num = 0
_total_format_time = 0
_total_format_num = 0
table_details = []  # List to store table details (page no, index, coordinates, paths)

# Process each PDF in the list
for paper in pdf_list:
    
    results = []
    images = []
    dfs = []
    
    start = time.time()
    
    # Ingest the PDF file and extract tables
    tables, doc = ingest_pdf(f'./pdf_data/LIRADS_books_processed_v2/{paper}')
    num_pages = len(doc)
    end_detect = time.time()

    formatted_tables = []

    # Create a directory for each PDF book
    book_folder = f'./linda_storage/processed_books_extracted_tablesv4/{paper[:-14]}'
    images_folder = os.path.join(book_folder, 'images')
    tables_folder = os.path.join(book_folder, 'tables')
    
    # Ensure directories for images and tables exist
    os.makedirs(images_folder, exist_ok=True)
    os.makedirs(tables_folder, exist_ok=True)
    
    # Loop over the extracted tables
    for i, table in enumerate(tables):
        ft = formatter.extract(table)  # Process the table using the formatter
        
        # Extract the table's information in dictionary format
        table_info = ft.to_dict()
        
        # Get the page number from the dictionary
        page_number = table_info.get('page_no', 'unknown') + 1  # As pg no starts at 0
        z = i + 1  # As i starts at 0

        # Get table coordinates (bounding box)
        bbox = table_info.get('bbox', (0, 0, 0, 0))  # Ensure bbox exists, default to (0, 0, 0, 0)

        # Generate filenames for the image and CSV
        image_filename = f"{page_number}_{z}_{paper[:-14]}.png"
        image_path = os.path.join(images_folder, image_filename)

        table_filename = f"{page_number}_{z}_{paper[:-14]}.csv"
        table_path = os.path.join(tables_folder, table_filename)
        
        # Try to append the dataframe and handle any exceptions
        try:
            df = ft.df()
            dfs.append(df)
            
            # Save the dataframe to CSV in the tables folder
            df.to_csv(table_path, index=False)
        except Exception as e:
            print(e)
            dfs.append(None)
        
        # Save the image corresponding to the table in the images folder
        image = ft.image()
        images.append(image)
        image.save(image_path)

        # Save the table details including paths for the image and CSV
        table_details.append({
            'pdf_name': paper,
            'page_number': page_number,
            'table_index': z,
            'x0': bbox[0],
            'y0': bbox[1],
            'pos': (bbox[0], bbox[1]),
            'image_path': image_path,
            'csv_path': table_path
        })

        formatted_tables.append(ft)
    
    end_format = time.time()

    # Close the document after processing
    doc.close()
    
    # Store results and print time details
    results += formatted_tables
    print(f"Paper: {paper}\nDetect time: {end_detect - start:.3f}s for {num_pages} pages")
    print(f"Format time: {end_format - end_detect:.3f}s for {len(tables)} tables\n")
    
    _total_detect_time += end_detect - start
    _total_detect_num += num_pages
    _total_format_time += end_format - end_detect
    _total_format_num += len(tables)

    # Save table details (coordinates and paths) as a DataFrame for this PDF
    details_df = pd.DataFrame(table_details)
    details_df.to_csv(f'{book_folder}/table_positions_{paper[:-14]}.csv', index=False)

# Print macro statistics for time tracking
print(f"Macro: {_total_detect_time/_total_detect_num:.3f} s/page and {_total_format_time/_total_format_num:.3f} s/table")


The identified boxes have significant overlap: 41.64% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 40.02% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 38.08% of area is overlapping (Max is 20.00%)
Invoking large table row guess! set TATRFormatConfig.force_large_table_assumption to False to disable this.
The identified boxes have significant overlap: 42.20% of area is overlapping (Max is 20.00%)
Invoking large table row guess! set TATRFormatConfig.force_large_table_assumption to False to disable this.
The identified boxes have significant overlap: 40.77% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 35.30% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 36.94% of area is overlapping (Max is 20.00%)
The identified boxes have significant overlap: 38.56% of area is overlapping (Max is 20.00%)
The identified boxes have significant ov

In [164]:
path = "./linda_storage/processed_books_extracted_tablesv4/gastrointestinal-imaging-the-requisites-fourth-edition/table_positions_gastrointestinal-imaging-the-requisites-fourth-edition.csv"

In [165]:
path

'./linda_storage/processed_books_extracted_tablesv4/gastrointestinal-imaging-the-requisites-fourth-edition/table_positions_gastrointestinal-imaging-the-requisites-fourth-edition.csv'

In [172]:
table_pos['pdf_name'][487]

'gastrointestinal-imaging-the-requisites-fourth-edition_mainpages.pdf'

In [166]:
table_pos = pd.read_csv(path)
table_pos

Unnamed: 0,pdf_name,page_number,table_index,x0,y0,pos,image_path,csv_path
0,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,4,1,74.617561,72.363022,"(74.61756134033203, 72.36302185058594)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
1,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,19,2,55.261497,69.506111,"(55.261497497558594, 69.50611114501953)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
2,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,23,3,54.445557,72.900429,"(54.445556640625, 72.90042877197266)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
3,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,29,4,55.111008,73.805336,"(55.11100769042969, 73.80533599853516)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
4,Diagnostic Imaging_ Gastrointestinal ( PDFDriv...,39,5,55.771336,74.529961,"(55.77133560180664, 74.52996063232422)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
...,...,...,...,...,...,...,...,...
484,gastrointestinal-imaging-the-requisites-fourth...,364,84,46.618702,81.059090,"(46.61870193481445, 81.05908966064453)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
485,gastrointestinal-imaging-the-requisites-fourth...,371,85,69.490257,79.867249,"(69.4902572631836, 79.86724853515625)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
486,gastrointestinal-imaging-the-requisites-fourth...,371,86,327.342987,81.736221,"(327.3429870605469, 81.73622131347656)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
487,gastrointestinal-imaging-the-requisites-fourth...,377,87,71.842720,252.906708,"(71.84272003173828, 252.90670776367188)",./linda_storage/processed_books_extracted_tabl...,./linda_storage/processed_books_extracted_tabl...
