In [6]:
import time
import os

In [4]:
INPUT_FOLDER = "F:/Environmental Baseline Data/Version 3/Data/HTML/"
OUTPUT_FOLDER = "F:/Environmental Baseline Data/Version 3/Data/HTML Images and Tables/"

In [7]:
# initialize some variables for profiling how long the code takes
start_time = time.time()
documents_processed = 0

# list of filenames which were unable to open or errored somewhere in this
# code
bad_files = []


for file_no, FILENAME in enumerate(os.listdir(INPUT_FOLDER)):
    # if the filename does not end with .html, skip the file
    if len(FILENAME.split('.')) < 2 or FILENAME.split('.')[-1] != 'html': continue
    try:
        with open(INPUT_FOLDER + FILENAME, 'r', encoding='utf-8') as f:
            print(file_no, FILENAME)
            """
                Setup document
            """
            html_doc = html_document.html_document(f, extract_images=True)
            
            """
                Setup folder for the document images, tables, and captions
                to be dumped into
            """
            FILENAME = ''.join(FILENAME.split('.html')[:-1]).strip()
            if not os.path.exists(OUTPUT_FOLDER + FILENAME):
                os.makedirs(OUTPUT_FOLDER + FILENAME)
            
            # create containers for the output data to be stored in
            output_toc = {}
            output_labels = []
            
            """
                Get table of contents and the captions within it
            """
            toc_labels = {}
            toc_text = []
            toc_pages, toc_page_numbers = html_analyzer.get_toc_pages(html_doc)
            for toc_page in toc_pages:
                # add the extracted labels and text from this table of contents page
                # to the existing result
                toc_labels = {**toc_labels, **html_analyzer.extract_toc_labels(toc_page)}
                toc_text += list(map(lambda text_line: text_line + '\n', toc_page.text_lines))
            
            """
                Write what was extracted from the table of contents into file
            """
            with open(OUTPUT_FOLDER + FILENAME + "/table-of-contents.txt", "w+", encoding="utf-8") as toc_file:
                output_toc['toc_labels'] = toc_labels
                output_toc['raw_text'] = toc_text
                toc_file.write(json.dumps(output_toc, indent=2))
            
            
            """
                Attempt to tag images
            """
            # store the result of tagging images from the previous page, these
            # results are needed to detect tables that span multiple pages
            previous_page_tagged_images = None
            for page_number, page in enumerate(html_doc.pages):         
                labels = html_analyzer.extract_labels(page)
                
                # only use the table of contents captions if the page is after the
                # table of contents
                if len(toc_pages) == 0 or page_number < toc_page_numbers[0]:
                    tagged_images = html_analyzer.tag_images(page, page.images, labels, {}, previous_page_tagged_images=previous_page_tagged_images)
                else:
                    tagged_images = html_analyzer.tag_images(page, page.images, labels, toc_labels, previous_page_tagged_images=previous_page_tagged_images)
                
                previous_page_tagged_images = tagged_images
                
                for image_number, image_tagged in enumerate(tagged_images):
                    if image_tagged.image_data.is_table:
                        img_type = 'table'
                    else:
                        img_type = 'image'
                    
                    """
                        Save image to a .png file and record the label associated with it
                    """
                    cv2.imwrite(OUTPUT_FOLDER + FILENAME + "/{}_page-{}_{}-{}.png".format(img_type, page_number, img_type, image_number), image_tagged.image_data.image)
                    output_labels.append({
                            'file': "{}_page-{}_{}-{}.png".format(img_type, page_number, img_type, image_number),
                            'type': img_type,
                            'page-number': page_number,
                            'image-number': image_number,
                            'page-label': image_tagged.label.text if image_tagged.label else None,
                            'toc-label': image_tagged.toc_label
                        })
            
            """
                Write labels to file
            """
            with open(OUTPUT_FOLDER + FILENAME + "/label_log.txt", "w+", encoding="utf-8") as image_log_file:
                image_log_file.write(json.dumps(output_labels, indent=2))
                
    
        documents_processed += 1
    except:
        bad_files.append(FILENAME)

# print files that failed to open or errored when running the code
print(bad_files)
# save these bad files in a text log in the output folder
with open(OUTPUT_FOLDER + '/bad-files.txt', 'w+', encoding='utf-8') as bad_files_file:
    bad_files_file.write('\n'.join(bad_files))

# print the time taken for the analysis and the average time per document
time_taken = time.time() - start_time
print(float(time_taken) / (documents_processed+1))


0 A0H8C0.html
1 A0H8C1.html
2 A0H8C2.html
3 A0H8C3.html
4 A0U3G1.html
5 A0U3G2.html
6 A0Y1T8.html
7 A0Y1T9.html
8 A0Y1U0.html
9 A0Y1U1.html
10 A0Y1U2.html
11 A0Y1U3.html
12 A0Y1U4.html
13 A0Y1U5.html
14 A0Y1U6.html
15 A0Y1U7.html
16 A0Y1U8.html
17 A0Y1U9.html
18 A0Y1V0.html
19 A0Y1V1.html
20 A0Y1V2.html
21 A0Y1V3.html
22 A0Y1V4.html
23 A0Y1V5.html
24 A0Y1V6.html
25 A0Y1V7.html
26 A0Y1V8.html
27 A0Y1V9.html
28 A0Y1W0.html
29 A0Y1W1.html
30 A0Z0Y6.html
31 A0Z0Y7.html
32 A0Z0Y8.html
33 A0Z0Y9.html
34 A0Z0Z0.html
35 A0Z0Z1.html
36 A0Z0Z2.html
37 A0Z0Z3.html
38 A0Z0Z4.html
39 A0Z0Z5.html
40 A0Z0Z6.html
41 A0Z0Z7.html
42 A0Z0Z8.html
43 A0Z0Z9.html
44 A0Z1A0.html
45 A0Z1A2.html
46 A0Z1A3.html
47 A0Z1A4.html
48 A0Z1A5.html
49 A0Z1A6.html
50 A0Z1A7.html
51 A0Z1A8.html
52 A0Z1A9.html
53 A0Z1C0.html
54 A0Z1C1.html
55 A0Z1C2.html
56 A0Z1C3.html
57 A0Z1C4.html
58 A0Z1C5.html
59 A0Z4I1.html
60 A0Z4I4.html
61 A1C2E2.html
62 A1C2E5.html
63 A1C2E6.html
64 A1C2E7.html
65 A1C2E9.html
66 A1C2F0.html
67 A1

519 A3S2K5.html
520 A3S2K6.html
521 A3S2K7.html
522 A3S2K8.html
523 A3S2K9.html
524 A3S2L0.html
525 A3S2L1.html
526 A3S2L2.html
527 A3S2L5.html
528 A3S2L6.html
529 A3S2L7.html
530 A3S2L8.html
531 A3S2L9.html
532 A3S2Q0.html
533 A3S2Q2.html
534 A3S2Q3.html
535 A3S2Q4.html
536 A3S2Q6.html
537 A3S2Q8.html
538 A3S2R0.html
539 A3S2R3.html
540 A3S2R4.html
541 A3S2R5.html
542 A3S2R6.html
543 A3S2R7.html
544 A3S2R8.html
545 A3V4K8.html
546 A3V4K9.html
547 A3V4L0.html
548 A3V4L1.html
549 A3V4L2.html
550 A3V4L3.html
551 A3V4L4.html
552 A3V4L5.html
553 A3V4L6.html
554 A3V4L7.html
555 A3V4L8.html
556 A3V4L9.html
557 A3V4Q0.html
558 A3V4Q1.html
559 A3V4Q2.html
560 A3V4Q3.html
561 A3V4Q4.html
562 A3V4Q5.html
563 A3V4Q6.html
564 A3V4Q7.html
565 A3V4Q8.html
566 A3V4Q9.html
567 A3V4R0.html
568 A3V4R1.html
569 A3V4R2.html
570 A3V4R3.html
571 A3V4R4.html
572 A3V4R5.html
573 A4D9H6.html
574 A4D9H7.html
575 A4D9H8.html
576 A4D9H9.html
577 A4D9I3.html
578 A4D9I5.html
579 A4D9I6.html
580 A4D9I7.html
581 A4D9

1030 A5A9E9.html
1031 A5A9F1.html
1032 A5A9F2.html
1033 A5A9F3.html
1034 A5A9F4.html
1035 A5A9F6.html
1036 A5A9F7.html
1037 A5A9F8.html
1038 A5A9G4.html
1039 A5A9G5.html
1040 A5A9G6.html
1041 A5A9G7.html
1042 A5A9G8.html
1043 A5A9R4.html
1044 A5A9R5.html
1045 A5A9R6.html
1046 A5A9R9.html
1047 A5A9S4.html
1048 A5A9S5.html
1049 A5A9S6.html
1050 A5A9S7.html
1051 A5A9S8.html
1052 A5A9S9.html
1053 A5A9T0.html
1054 A5A9T1.html
1055 A5A9T2.html
1056 A5A9T3.html
1057 A5A9T4.html
1058 A5A9T5.html
1059 A5A9T6.html
1060 A5A9T7.html
1061 A5A9T8.html
1062 A5A9T9.html
1063 A5A9U0.html
1064 A5A9U1.html
1065 A5A9U2.html
1066 A5A9U3.html
1067 A5A9U4.html
1068 A5A9U5.html
1069 A5A9U6.html
1070 A5A9U8.html
1071 A5G2X0.html
1072 A5G2X1.html
1073 A5G2X3.html
1074 A5G2X4.html
1075 A5G2X5.html
1076 A5G2X6.html
1077 A5G2X7.html
1078 A5G2X8.html
1079 A5G2X9.html
1080 A5G2Y0.html
1081 A5G2Y1.html
1082 A5G2Y3.html
1083 A5V8D5.html
1084 A5V8D6.html
1085 A5V8D7.html
1086 A5V8D8.html
1087 A5V8D9.html
1088 A5V8E0.ht

In [8]:
len(bad_files)

1139