In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
import unstructured
from unstructured.partition.pdf import partition_pdf
import base64
from tqdm import tqdm
import ollama

class Extractor:
    def __init__(self, vision_model = 'llama3.2-vision',
                llm_model = 'llama3.2:1b'):
        self.vision_model = vision_model
        self.llm_model = llm_model
    

    def parse_input_file(self, input_file):
        chunks = partition_pdf(
            filename= input_file, 
            infer_table_structure=True, 
            strategy= 'hi_res', 
            extract_image_block_types= ['Image'],
            extract_image_block_to_payload=True,
            chunking_strategy='by_title',
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000
          )
        return chunks

    def classify_chunks(self, chunks):
        texts, tables, images = [], [], []

        for chunk in chunks:
            if isinstance(chunk, unstructured.documents.elements.Table):
                tables.append(chunk)
            if isinstance(chunk, unstructured.documents.elements.CompositeElement):
                texts.append(chunk)
                chunk_elements = chunk.metadata.orig_elements
        
                for element in chunk_elements:
                    if isinstance(element, unstructured.documents.elements.Image):
                        images.append(element.metadata.image_base64)
        return texts, tables, images

    def write_images_to_disk(self, images, images_dir):
        for idx, image in enumerate(images):
            image_data = base64.b64decode(image)
            file_path = f'image_{idx}.jpeg'

            with open(file_path, 'wb') as f:
                f.write(image_data)

    def get_image_summary(self, file_path):
        response = ollama.chat(
            model = self.vision_model,
            messages = [
                {'role': 'user', 'content': 'Summarize the image', 'images': [file_path]}
            ]
        )
        return response.message.content

    def get_text_summary(self, text):
        response = ollama.chat(
            model = self.llm_model,
            messages = [
                {'role': 'user', 'content': f'summarize this text: {text}'}
            ]
        )
        return response.message.content

    def get_table_summary(self, table):
        response = ollama.chat(
            model = self.llm_model,
            messages = [
                {'role': 'user', 'content': f'summarize this table: {table}'}
            ]
        )
        return response.message.content

In [3]:
file_path = 'attention.pdf'
image_path = '/images'

extractor = Extractor()

In [4]:
chunks = extractor.parse_input_file(file_path)

In [5]:
texts, tables, images = extractor.classify_chunks(chunks)

In [6]:
texts, images = texts[:2], images[:2]

In [7]:
extractor.write_images_to_disk(images, image_path)

In [8]:
image_summaries = [extractor.get_image_summary(f"image_{i}.jpeg") for i in tqdm(range(len(images)))]

100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [06:38<00:00, 199.17s/it]


In [9]:
text_summaries = [extractor.get_text_summary(text[i].text for i in tqdm(range(len(texts))))]

  0%|                                                                                        | 0/2 [00:02<?, ?it/s]


In [None]:
table_summaries = [extractor.get_table_summary(tables[i].metadata.text_as_html)
                    for i in tqdm(range(len(tables)))]