In [1]:
import pymupdf.layout 
from markdown_it import MarkdownIt
import pymupdf4llm
import pymupdf
import unicodedata
import re
import json

In [2]:
def clean_md(md_text):
    """Split Markdown text into lines and keep only the ones with content.

    A line is discarded when it is an empty string, when it contains the
    substring "picture", or when it contains the page-footer marker literal
    tested below. Whitespace-only lines are kept.

    Returns:
        list[str]: the surviving lines, in their original order.
    """
    def keep(candidate):
        if candidate == '':
            return False
        # NOTE(review): the footer literal looks like a mis-decoded "Página " —
        # confirm it matches what the extractor actually emits.
        return not ("picture" in candidate or 'PÃ¡gina ' in candidate)

    return list(filter(keep, md_text.splitlines()))

def normalizar_string(text: str) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Steps, in order: decompose to NFD and drop combining marks (category
    ``Mn``), lowercase, delete every character that is not ``a-z``, ``0-9``
    or whitespace, collapse whitespace runs to one space, trim, and finally
    replace the remaining spaces with hyphens.
    """
    decomposed = unicodedata.normalize("NFD", text)
    unaccented = "".join(
        filter(lambda ch: unicodedata.category(ch) != "Mn", decomposed)
    )
    alphanumeric = re.sub(r"[^a-z0-9\s]", "", unaccented.lower())
    single_spaced = re.sub(r"\s+", " ", alphanumeric).strip()
    return single_spaced.replace(" ", "-")

In [3]:
# Relative location of the bulletin PDF; later cells pass this to
# get_text / get_images and to pymupdf / pymupdf4llm directly.
path = 'boletim/BHA_PT_20260128.pdf'

In [45]:
def _emphasis(parts, index):
    """Return the text of ``parts[index]`` up to the closing ``</strong></em>`` tag."""
    return parts[index].split("</strong></em>")[0]


def _cover_metadata(path):
    """Read DOI, ISSN, title, volume, number and date from the first PDF page.

    Every field is located purely by line/token position in the page's plain
    text, so a cover with a different layout raises ``IndexError``.
    """
    # The context manager closes the document even when parsing fails.
    with pymupdf.open(path) as doc:
        cover_lines = doc.load_page(0).get_text().splitlines()
    issue_tokens = cover_lines[-1].split()
    return {
        # maxsplit=1 keeps a DOI that itself contains ':' (e.g. a URL) intact;
        # strip() drops the blank that may follow the label's colon.
        'doi': cover_lines[0].split(":", 1)[1].strip(),
        'issn': cover_lines[2].split(": ")[1],
        'volume': issue_tokens[1].removesuffix(","),
        'number': issue_tokens[3],
        'date': " ".join(issue_tokens[5:]),
        'title': " ".join(cover_lines[3:-1]).replace("  ", " "),
    }


def _current_conditions(path):
    """Build the 'current_conditions' entry from page index 3."""
    text_md = pymupdf4llm.to_markdown(path, pages=[3])
    # The summary is taken from the fifth Markdown line, minus emphasis markers.
    text = text_md.splitlines()[4].replace('_**', '').removesuffix('**_ ')
    return {
        'text': text,
        "map_current_conditions": "map_current_conditions.png",
        "table_current_conditions": "table_current_conditions.png"
    }


def _basin_analysis(path):
    """Build the per-basin 'analysis' list from page indexes 4 to 14.

    After the first cleaned line is discarded, the remaining lines are read as
    alternating (heading, paragraph) pairs. Each paragraph is rendered to HTML
    and its bold-italic spans are picked by position.
    """
    text_lines = clean_md(pymupdf4llm.to_markdown(path, pages=list(range(4, 15))))
    text_lines.pop(0)

    md = MarkdownIt()
    analysis = []
    # Pairs are walked directly rather than through a dict so that a repeated
    # heading cannot silently overwrite an earlier basin.
    for heading, paragraph in zip(text_lines[::2], text_lines[1::2]):
        name = heading.removeprefix("## _**").removeprefix("_**").removesuffix("**_ ")
        html = md.render(paragraph)
        slug = normalizar_string(name)
        strong = html.split('<em><strong>')
        analysis.append({
            "id": slug,
            "name": name,
            "text": html,
            "climatologia": _emphasis(strong, 1),
            "observados": _emphasis(strong, 3),
            "anomalia": _emphasis(strong, 4).removesuffix(","),
            "classification": _emphasis(strong, 5).removesuffix("."),
            "prognostico": _emphasis(strong, -1).removesuffix("."),
            "charts": {
                "acc": f"{slug}-acc.png",
                "ano": f"{slug}-ano.png"
            }
        })
    return analysis


def _multimodel(path):
    """Build the 'multimodel' entry from page indexes 15 and 16."""
    text_lines = clean_md(pymupdf4llm.to_markdown(path, pages=[15, 16]))
    return {
        'title': text_lines[0].removeprefix("_**").removesuffix("**_ "),
        'text': text_lines[1],
        'seven_days': text_lines[2].replace("acima, ", ""),
        'img_seven_days': "seven_days.png",
        'fourteen_days': text_lines[3].replace("acima, ", ""),
        'img_fourteen_days': "fourteen_days.png"
    }


def _reference(path):
    """Build the 'reference' entry from page index 17."""
    text_lines = clean_md(pymupdf4llm.to_markdown(path, pages=[17]))
    return {
        'text': text_lines[1],
        # The two legends intentionally keep their original, slightly different
        # replacement strings ("" vs " ") for the "** _**" marker.
        'legend_table': text_lines[2].removeprefix("**").removesuffix("** ").replace("** _**", "").replace("**_ **", ""),
        'legend_climatology': text_lines[3].removeprefix("**").removesuffix("** ").replace("** _**", " ").replace("**_ **", ""),
        'img_reference': "reference.png"
    }


def _anomaly_category(path):
    """Build the 'anomaly_category' entry from page index 18."""
    text_lines = clean_md(pymupdf4llm.to_markdown(path, pages=[18]))
    return {
        'text': text_lines[1],
        'tables': "anomaly_table.png",
    }


def get_text(path):
    """Extract the textual content of a bulletin PDF into one dictionary.

    All page numbers used here are 0-based indexes and all fields are found by
    fixed line positions, so the function only works for PDFs that follow the
    layout it was written against.

    Args:
        path: location of the bulletin PDF.

    Returns:
        dict with the cover fields ('doi', 'issn', 'volume', 'number', 'date',
        'title') followed by 'current_conditions', 'analysis', 'multimodel',
        'reference', 'anomaly_category' and 'anomaly_behavior'.

    Raises:
        IndexError: when a page yields fewer lines or emphasis spans than the
            positional lookups expect.
    """
    d_boletim = _cover_metadata(path)
    d_boletim['current_conditions'] = _current_conditions(path)
    d_boletim['analysis'] = _basin_analysis(path)
    d_boletim['multimodel'] = _multimodel(path)
    d_boletim['reference'] = _reference(path)
    d_boletim['anomaly_category'] = _anomaly_category(path)
    # Fixed list of file names bacia_1.png ... bacia_32.png.
    d_boletim['anomaly_behavior'] = [f'bacia_{i}.png' for i in range(1, 33)]
    return d_boletim
    
# Parse the bulletin once; the resulting dict is reused by the JSON export
# and by get_images in later cells.
d_boletim = get_text(path)

OCR disabled because Tesseract language data not found.
OCR disabled because Tesseract language data not found.
OCR disabled because Tesseract language data not found.
OCR disabled because Tesseract language data not found.
OCR disabled because Tesseract language data not found.


In [46]:
# Persist the parsed bulletin as pretty-printed JSON.
# The encoding is explicit because ensure_ascii=False writes accented
# characters verbatim, and the platform default encoding may not be UTF-8.
# The `with` block already closes the file, so no explicit close() is needed.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
with open('/home/inacio/clima-amazonia/data/boletim/pt.json', 'w', encoding='utf-8') as f:
    json.dump(d_boletim, f, indent=4, ensure_ascii=False)

In [None]:
# Scratch check: repeat the page-index-18 extraction that get_text performs
# and display the line it stores as the 'anomaly_category' text.
text_md = pymupdf4llm.to_markdown(path, pages=18)
text_lines = clean_md(text_md)
text_lines[1]

OCR disabled because Tesseract language data not found.


# Get Images

In [None]:
from pydoc import doc


def _save_region(page, bbox, filename, zoom=3):
    """Render the rectangle ``bbox`` of ``page`` scaled by ``zoom`` and save it to ``filename``."""
    pix = page.get_pixmap(
        matrix=pymupdf.Matrix(zoom, zoom),
        clip=pymupdf.Rect(*bbox),
        alpha=False,
    )
    pix.save(filename)


def _image_block_pixmaps(doc, page_numbers, skip_bbox):
    """Yield a 200-dpi pixmap for each block of type 1 on the given pages.

    Blocks whose bounding box equals ``skip_bbox`` exactly are left out.
    Pixmaps are produced in page order, then in block order within a page.
    """
    for page_number in page_numbers:
        page = doc.load_page(page_number)
        for block in page.get_text("dict").get("blocks", []):
            bbox = block.get("bbox", None)
            if block.get("type", None) == 1 and bbox != skip_bbox:
                yield page.get_pixmap(clip=pymupdf.Rect(bbox), dpi=200, alpha=False)


def get_images(path, d_boletim):
    """Export the bulletin's figures and tables as PNG files.

    Fixed rectangles are clipped from page indexes 3, 15, 16 and 18. Every
    type-1 block on page indexes 4-14 and 19-22 is exported as its own file.
    The basin charts on pages 4-14 are named, in extraction order, after the
    ids in ``d_boletim['analysis']`` (``<id>-acc.png`` then ``<id>-ano.png``).

    Args:
        path: location of the bulletin PDF.
        d_boletim: dict holding an 'analysis' list whose items have an 'id'.

    Raises:
        IndexError: when pages 4-14 contain more exported blocks than two per
            analysis entry.

    NOTE(review): only the map, the table and the basin charts go to
    'current_files/'; the remaining files are written to the working
    directory. Kept as in the original — confirm this split is intended.
    """
    import os  # local import: only needed to guarantee the output folder exists

    # Saving into a missing directory fails, so create it up front.
    os.makedirs('current_files', exist_ok=True)

    # Bounding box that is never exported (named `logo` in the original;
    # presumably the logo repeated on every page — confirm).
    logo = (175.0500030517578, 776.2003173828125, 457.0, 832.2003173828125)

    # Expected chart file names, two per basin, in document order.
    images = []
    for item in d_boletim['analysis']:
        basin_id = item['id']
        images.append(f'{basin_id}-acc.png')
        images.append(f'{basin_id}-ano.png')

    # One handle for the whole export, closed even if a step raises.
    with pymupdf.open(path) as doc:
        # Page index 3: map and table of the current conditions.
        page = doc.load_page(3)
        _save_region(
            page,
            (136.3000030517578, 414.7003479003906, 483.6500244140625, 682.5503540039062),
            'current_files/map_current_conditions.png',
        )
        _save_region(page, (100, 680, 515, 765), 'current_files/table_current_conditions.png')

        # Page indexes 4-14: basin charts. Indexing (not zip) keeps the
        # IndexError when more blocks are found than names were prepared.
        for index, pix in enumerate(_image_block_pixmaps(doc, range(4, 15), logo)):
            img = images[index]
            pix.save(f'current_files/{img}')
            print(img)

        # Page indexes 15 and 16: multimodel figures.
        _save_region(doc.load_page(15), (70, 200, 515, 620), 'seven_days.png')
        _save_region(doc.load_page(16), (70, 70, 515, 500), 'fourteen_days.png')

        # Page index 18: anomaly category table.
        _save_region(doc.load_page(18), (80, 415, 530, 740), 'anomaly_table.png')

        # Page indexes 19-22: anomaly behaviour figures, numbered from 1.
        for number, pix in enumerate(_image_block_pixmaps(doc, range(19, 23), logo), start=1):
            pix.save(f'bacia_{number}.png')

# Export every figure; requires d_boletim from the get_text cell because the
# basin chart file names are derived from its 'analysis' ids.
get_images(path, d_boletim)

In [33]:
# Scratch: reopen the bulletin and load page index 19 for interactive inspection.
# These module-level `doc` / `page` names are read by the cells that follow.
doc = pymupdf.open(path)
page = doc.load_page(19)

In [34]:
# Scratch: fetch the dict-form text extraction of the current `page` and pull
# out its list of blocks (empty list when the key is missing).
page_dict = page.get_text("dict") 
blocks = page_dict.get("blocks", [])

In [44]:
# Scratch: stand-alone copy of the last loop in get_images. It exports every
# type-1 block on page indexes 19-22, except the one whose bbox equals `logo`,
# as bacia_<c>.png in the working directory.
# Relies on the module-level `doc` opened in an earlier cell, and leaves `page`
# pointing at the last page visited (index 22).
c = 1
logo = (175.0500030517578, 776.2003173828125, 457.0, 832.2003173828125)
for i in range(19, 23):
    page = doc.load_page(i)
    page_dict = page.get_text("dict") 
    blocks = page_dict.get("blocks", [])
    for b in blocks:
        btype = b.get("type", None)  
        bbox = b.get("bbox", None)
        if btype == 1 and bbox != logo:
            rect = pymupdf.Rect(bbox)
            pix = page.get_pixmap(clip=rect, dpi=200, alpha=False)
            pix.save(f'bacia_{c}.png')
            c += 1
            # Printed after the increment, so each value is one more than the
            # number of the file just written.
            print(c)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


In [56]:
# Re-render fourteen_days.png on its own.
# The page is loaded explicitly (index 16, the page get_images clips with this
# same rectangle) instead of reusing whatever `page` an earlier cell left in
# the kernel. On a top-to-bottom run that leftover is page index 22, which
# would overwrite the file with the wrong content.
# Still relies on the module-level `doc` opened in an earlier cell.
page = doc.load_page(16)
x0, y0, x1, y1 = 70, 70, 515, 500
rect = pymupdf.Rect(x0, y0, x1, y1)
zoom = 3
mat = pymupdf.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat, clip=rect, alpha=False)
pix.save('fourteen_days.png')