In [17]:
import os
import numpy as np
import fitz
import pandas as pd
from tqdm import tqdm
fol_path = r'\\isco-pipe.local\root\Common\EPC\Crowder Industrial\Crowder Santee Cooper Winyah ELG\Isometrics'

In [18]:
def extract_text_pymupdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def do_extraction(full_pdf_path):
    # Extract text using PyMuPDF
    pymupdf_text = extract_text_pymupdf(full_pdf_path).split("\n")
    bill_of_materials = []
    start_collecting = False
    for line in pymupdf_text:
        if line=='N':
            break
        if line=='BILL OF MATERIALS':
            start_collecting = True
        bill_of_materials.append(line)
    return bill_of_materials

def make_table(bill_of_materials):
    new_table = []
    second_table = []
    unallowed = ['BILL OF MATERIALS','PIPE','FITTINGS','OLETS','FLANGES','VALVES','PIPE SUPPORTS']
    columns = ['ID','QTY','ND','DESCRIPTION','SUPPORT DETAIL']
    columns_seen = [0,0,0,0,0]
    table_to_input = new_table
    second_table_used = False
    for t_element in bill_of_materials:
        if not t_element.upper().strip() in unallowed:
            try:
                col_i = columns.index(t_element.upper().strip())
            except:
                col_i = -1
            if col_i!=-1:
                if columns_seen[col_i]==0:
                    if t_element=='SUPPORT DETAIL':
                        new_table = table_to_input
                        table_to_input = second_table
                        table_to_input.extend(columns[:-1])
                        second_table_used = True
                    columns_seen[col_i]=1
                elif columns_seen[col_i]==1:
                    continue
            table_to_input.append(t_element)
    if second_table_used:
        second_table = table_to_input
    else:
        new_table = table_to_input
    new_table = np.array(new_table).reshape(-1,4)
    if len(second_table)>0:
        second_table = np.array(second_table).reshape(-1,5)
    return new_table,second_table

def combine_tables(new_table,second_table,pdf_path):
    ndf = pd.DataFrame(new_table)
    ndf.columns = ndf.iloc[0]
    ndf = ndf.drop(0)
    if isinstance(second_table,np.ndarray):
        sdf = pd.DataFrame(second_table)
        sdf.columns = sdf.iloc[0]
        sdf = sdf.drop(0)
        combined = pd.concat([ndf, sdf])#.fillna(None)
        final = combined
    else:
        final = ndf
    final['FILE_PATH'] = os.path.split(pdf_path)[1]
    return final

def extract_bill_of_materials(pdf_path):
    bill_of_materials = do_extraction(pdf_path)
    t1,t2 = make_table(bill_of_materials)
    t = combine_tables(t1,t2,pdf_path)
    return t

def extract_bills_of_materials(fol_path):
    bills_of_materials = []
    errors = []
    for pdf in tqdm(os.listdir(fol_path)):
        pdf_path = os.path.join(fol_path,pdf)
        try:
            bills_of_materials.append(extract_bill_of_materials(pdf_path))
        except Exception as e:
            print((pdf_path,e))
            errors.append((pdf_path,e))
            
    return bills_of_materials,errors

In [20]:
bills_of_materials,errors = extract_bills_of_materials(fol_path)

 18%|██████████████▍                                                                  | 50/280 [00:46<03:21,  1.14it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\18-0-528-40603-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 24%|███████████████████▋                                                             | 68/280 [01:02<03:01,  1.17it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401A-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 25%|███████████████████▉                                                             | 69/280 [01:03<02:59,  1.18it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401B-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 25%|████████████████████▎                                                            | 70/280 [01:04<02:58,  1.18it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401C-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 25%|████████████████████▌                                                            | 71/280 [01:05<02:56,  1.19it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401D-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 26%|████████████████████▊                                                            | 72/280 [01:05<02:53,  1.20it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401E-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 26%|█████████████████████                                                            | 73/280 [01:06<02:53,  1.19it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401F-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 26%|█████████████████████▍                                                           | 74/280 [01:07<02:52,  1.20it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401G-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 27%|█████████████████████▋                                                           | 75/280 [01:08<02:51,  1.20it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401H-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 27%|█████████████████████▉                                                           | 76/280 [01:09<02:49,  1.20it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401J-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 28%|██████████████████████▎                                                          | 77/280 [01:10<02:48,  1.20it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401K-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 28%|██████████████████████▌                                                          | 78/280 [01:10<02:47,  1.21it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401L-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 28%|██████████████████████▊                                                          | 79/280 [01:11<02:24,  1.39it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401M-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 29%|███████████████████████▏                                                         | 80/280 [01:11<02:12,  1.50it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401N-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 29%|███████████████████████▍                                                         | 81/280 [01:12<02:23,  1.39it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\3-0-528-40401P-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 39 into shape (4)'))


 53%|██████████████████████████████████████████▎                                     | 148/280 [02:09<01:48,  1.21it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\4-0-528-40655-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 14 into shape (5)'))


 55%|████████████████████████████████████████████                                    | 154/280 [02:15<02:07,  1.01s/it]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\6-0-528-40007-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 9 into shape (5)'))


 79%|███████████████████████████████████████████████████████████████▍                | 222/280 [03:16<01:12,  1.25s/it]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\6-0-528-40657-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 14 into shape (5)'))


 87%|█████████████████████████████████████████████████████████████████████▋          | 244/280 [03:36<00:33,  1.09it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\8-0-528-40197-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 19 into shape (5)'))


100%|████████████████████████████████████████████████████████████████████████████████| 280/280 [04:07<00:00,  1.13it/s]

('\\\\isco-pipe.local\\root\\Common\\EPC\\Crowder Industrial\\Crowder Santee Cooper Winyah ELG\\Isometrics\\8-0-528-40659-PE01 - Isometric.pdf', ValueError('cannot reshape array of size 14 into shape (5)'))





In [23]:
df = pd.concat(bills_of_materials)
len(df['FILE_PATH'].unique())

260

In [24]:
df.to_excel(os.path.join(fol_path,'extracted_boms.xlsx'))

In [25]:
with open(os.path.join(fol_path,'extracted_boms_errors.txt'),'w') as error_txt:
    for error in errors:
        error_txt.write(str(error)+'\n')