## OCR with UNSTRUCTURED

In [None]:
# Install the Unstructured Python SDK (`unstructured-client`); no version is pinned here.
%pip install unstructured-client

Las peticiones se hacen a `http://localhost:8000`.

Con el siguiente comando se puede correr el contenedor de UNSTRUCTURED:

```bash
docker run -p 8000:8000 -d --rm --name unstructured-api downloads.unstructured.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0
```

In [2]:
import os, json

import unstructured_client
from unstructured_client.models import operations, shared

# Client for the Unstructured API served locally (see the docker command above).
client = unstructured_client.UnstructuredClient(
    server_url="http://localhost:8000",
)

# Read the whole PDF into memory; the raw bytes are sent in the request.
filename = "file.pdf"
with open(filename, "rb") as f:
    data = f.read()

# NOTE(review): only 'eng' is requested here although the sample output is
# Spanish; the OCR class further down requests ['eng', 'spa'] — confirm intent.
req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=data,
            file_name=filename,
        ),
        strategy=shared.Strategy.HI_RES,
        languages=['eng'],
        split_pdf_page=True,            # If True, splits the PDF file into smaller chunks of pages.
        split_pdf_allow_failed=True,    # If True, the partitioning continues even if some pages fail.
        split_pdf_concurrency_level=15  # Number of concurrent requests used for the split chunks.
    ),
)

try:
    res = client.general.partition(request=req)
    element_dicts = list(res.elements)
    json_elements = json.dumps(element_dicts, indent=2)

    # Print the processed data.
    print(json_elements)

    # Write the processed data to a local file. The encoding is explicit so the
    # file does not depend on the platform default; it is read back as UTF-8 later.
    with open("salida.txt", "w", encoding="utf-8") as file:
        file.write(json_elements)
except Exception as e:
    # Best-effort demo cell: report the failure instead of stopping the notebook.
    print(e)

INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 1
INFO: Concurrency level set to 15
INFO: Splitting pages 1 to 2 (2 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (2) to be split efficiently. Partitioning without split.
INFO: Successfully partitioned the document.


[
  {
    "type": "NarrativeText",
    "element_id": "084bcfca09086336d78c8ba5c6103a13",
    "text": "18. Cuando hablamos de capacitaci\u00f3n, tenemos como uno de los beneficios principales, la mejora de la calidad del Software \u00bfA qu\u00e9 nos referimos con esto?",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "file.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "f666bfbe7368828ec28fc42ca8724ce3",
    "text": "a. A un c\u00f3digo bien documentado e indentado. b. A un c\u00f3digo con menos errores, mayor confiabilidad y que genera satisfacci\u00f3n al cliente. c. A un c\u00f3digo que sustituye \u201cifs\u201d por \u201cswitch\u201d para darle mejor presentaci\u00f3n. d. A un c\u00f3digo que utiliza patrones de dise\u00f1o para optimizarse a s\u00ed mismo. 19. \u00bfCu\u00e1l de las siguientes opciones abarca actividades relacionadas con la capacitaci\u00f3n en el cont

## Clase OCR

In [3]:
import json
from pathlib import Path
from typing import Dict, List, Optional

from unstructured_client import UnstructuredClient
from unstructured_client.models import operations, shared


class OCR:
    """Produce lists of element dicts from documents.

    ``get_ocr`` sends a file to an Unstructured API server for partitioning;
    ``get_dev_ocr`` builds a similar single-element list from a local text
    file without contacting the server.
    """

    def __init__(self, server_url: str = "http://localhost:8000"):
        """Create the API client.

        Args:
            server_url: Base URL of the Unstructured API server. Defaults to
                the local instance at ``http://localhost:8000``.
        """
        self.unstructured_client = UnstructuredClient(server_url=server_url)


    def get_ocr(self, file_path: str) -> Optional[List[Dict]]:
        """Partition a file through the Unstructured API.

        The JSON-serialised elements are also printed to stdout.

        Args:
            file_path: Path of the file to upload; it is read fully into memory
                and also sent as the upload's file name.

        Returns:
            The elements returned by the API, or ``None`` when the request or
            the serialisation fails (the exception is printed, not raised).

        Raises:
            OSError: If ``file_path`` cannot be opened or read; this happens
                before the guarded API call.
        """
        with open(file_path, "rb") as f:
            data = f.read()

        req = operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=data,
                    file_name=file_path,
                ),
                strategy=shared.Strategy.AUTO,
                languages=['eng', 'spa'],
                split_pdf_page=True,            # If True, splits the PDF file into smaller chunks of pages.
                split_pdf_allow_failed=True,    # If True, the partitioning continues even if some pages fail.
                split_pdf_concurrency_level=15  # Number of concurrent requests used for the split chunks.
            ),
        )

        try:
            res = self.unstructured_client.general.partition(request=req)
            element_dicts = list(res.elements)
            json_elements = json.dumps(element_dicts, indent=2)

            # Print the processed data.
            print(json_elements)
            return element_dicts

        except Exception as e:
            # Best effort: report the failure and hand back an explicit None
            # so callers can tell that no elements were produced.
            print(e)
            return None


    def get_dev_ocr(self, file_path: str) -> List[Dict]:
        """Wrap a local UTF-8 text file as a one-element list, without any API call.

        Args:
            file_path: Path of the text file to read.

        Returns:
            A list holding one dict with ``'metadata'`` (``filetype`` built as
            ``text/<extension without the dot>`` and ``filename`` as the base
            name) and ``'text'`` (the full file content).
        """
        file = Path(file_path)
        metadata = {"filetype": f'text/{file.suffix[1:]}', "filename": file.name}
        text = file.read_text(encoding='utf-8')
        return [{'metadata': metadata, 'text': text}]


In [4]:
# Build the OCR helper defined above; it targets the API at http://localhost:8000.
ocr = OCR()

# Partition the PDF through the Unstructured API (this call also prints the JSON result).
pdf = ocr.get_ocr("file.pdf")
# Wrap the local text file "salida.txt" as a single element, without calling the API.
dev = ocr.get_dev_ocr("salida.txt")

INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 1
INFO: Concurrency level set to 15
INFO: Splitting pages 1 to 2 (2 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (2) to be split efficiently. Partitioning without split.
INFO: Successfully partitioned the document.


[
  {
    "type": "NarrativeText",
    "element_id": "084bcfca09086336d78c8ba5c6103a13",
    "text": "18. Cuando hablamos de capacitaci\u00f3n, tenemos como uno de los beneficios principales, la mejora de la calidad del Software \u00bfA qu\u00e9 nos referimos con esto?",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng",
        "spa"
      ],
      "page_number": 1,
      "filename": "file.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "f666bfbe7368828ec28fc42ca8724ce3",
    "text": "a. A un c\u00f3digo bien documentado e indentado. b. A un c\u00f3digo con menos errores, mayor confiabilidad y que genera satisfacci\u00f3n al cliente. c. A un c\u00f3digo que sustituye \u201cifs\u201d por \u201cswitch\u201d para darle mejor presentaci\u00f3n. d. A un c\u00f3digo que utiliza patrones de dise\u00f1o para optimizarse a s\u00ed mismo. 19. \u00bfCu\u00e1l de las siguientes opciones abarca actividades relacionadas con la capacitaci\u0

In [6]:
from pprint import pprint

# Show the API result and the local stand-in result, separated by a rule of 80 dashes.
pprint(pdf)
print("-" * 80)
pprint(dev)

[{'element_id': '084bcfca09086336d78c8ba5c6103a13',
  'metadata': {'filename': 'file.pdf',
               'filetype': 'application/pdf',
               'languages': ['eng', 'spa'],
               'page_number': 1},
  'text': '18. Cuando hablamos de capacitación, tenemos como uno de los '
          'beneficios principales, la mejora de la calidad del Software ¿A qué '
          'nos referimos con esto?',
  'type': 'NarrativeText'},
 {'element_id': 'f666bfbe7368828ec28fc42ca8724ce3',
  'metadata': {'filename': 'file.pdf',
               'filetype': 'application/pdf',
               'languages': ['eng', 'spa'],
               'page_number': 1},
  'text': 'a. A un código bien documentado e indentado. b. A un código con '
          'menos errores, mayor confiabilidad y que genera satisfacción al '
          'cliente. c. A un código que sustituye “ifs” por “switch” para darle '
          'mejor presentación. d. A un código que utiliza patrones de diseño '
          'para optimizarse a sí mis