# Heuristic rhetorical detection (fast)
# Outputs: artifacts/reports/rhetorical_sample_10000.md and artifacts/reports/rhetorical_sample_10000.csv
# Figures: artifacts/figures/rhetorical_presence_sample_10000.png


In [None]:
import csv
import re
from pathlib import Path

# Input manifest: one row per sampled document. The code below reads the
# 'doc_id' and 'path' columns of each row.
SAMPLE_CSV = Path('artifacts/samples/sample_10000.csv')
# Report outputs: per-document label matrix (CSV) and Markdown summary.
OUT_CSV = Path('artifacts/reports/rhetorical_sample_10000.csv')
OUT_MD = Path('artifacts/reports/rhetorical_sample_10000.md')
# Directory that receives the generated bar chart.
FIG_DIR = Path('artifacts/figures')

# Rhetorical label -> keywords whose presence flags that label.
# Keywords are written lowercase and without accents because they are
# matched as plain substrings of the lowercased, accent-stripped text.
LABELS = {
    'INTRO': ['introduccion'],
    'BACK': ['trabajos relacionados', 'estado del arte', 'antecedentes'],
    'METH': ['metodologia', 'metodos', 'materiales y metodos'],
    'RES': ['resultados'],
    'DISC': ['discusion'],
    'CONTR': ['contribuciones', 'contribucion'],
    'LIM': ['limitaciones', 'limitacion'],
    'CONC': ['conclusiones', 'conclusion'],
}

# Load the manifest eagerly. The context manager closes the file handle
# deterministically, and newline='' lets the csv module do its own newline
# handling (needed for quoted fields that contain line breaks).
with SAMPLE_CSV.open('r', encoding='utf-8', newline='') as sample_file:
    SAMPLE = list(csv.DictReader(sample_file))


import struct
import zlib
import unicodedata

def _png_chunk(tag, data):
    """Serialize one PNG chunk: length, type tag, payload, CRC-32.

    The length field counts only ``data``; the CRC covers ``tag + data``.
    Both integer fields are written as 4 big-endian bytes.
    """
    body = tag + data
    length_field = len(data).to_bytes(4, 'big')
    crc_field = zlib.crc32(body).to_bytes(4, 'big')
    return length_field + body + crc_field


def save_png(path, width, height, rgb_bytes):
    """Write a packed RGB buffer to ``path`` as an 8-bit truecolour PNG.

    ``rgb_bytes`` is read as ``height`` consecutive rows of ``width * 3``
    bytes each. Missing parent directories of ``path`` are created.
    """
    stride = width * 3
    scanlines = bytearray()
    for row in range(height):
        offset = row * stride
        # Every PNG scanline is prefixed with a filter-type byte; 0 = none.
        scanlines.append(0)
        scanlines.extend(rgb_bytes[offset:offset + stride])
    payload = zlib.compress(bytes(scanlines), level=9)
    # IHDR: width, height, bit depth 8, colour type 2 (RGB), then
    # compression, filter and interlace methods all 0.
    header = struct.pack('>IIBBBBB', width, height, 8, 2, 0, 0, 0)
    chunks = [
        _png_chunk(b'IHDR', header),
        _png_chunk(b'IDAT', payload),
        _png_chunk(b'IEND', b''),
    ]
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b'\x89PNG\r\n\x1a\n' + b''.join(chunks))


def _blank_canvas(width, height, color=(255, 255, 255)):
    """Return a mutable RGB buffer of ``width * height`` pixels.

    Args:
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        color: ``(r, g, b)`` fill colour; each component must fit in a byte.

    Returns:
        A ``bytearray`` of ``width * height * 3`` bytes, 3 bytes per pixel.

    Raises:
        ValueError: If ``color`` does not have exactly three components or a
            component is outside ``range(256)``.
    """
    r, g, b = color
    # Repeat the 3-byte pixel at the bytes level instead of first building a
    # Python list with one int object per channel of every pixel.
    pixel = bytes((r, g, b))
    return bytearray(pixel * width * height)


def _set_pixel(buf, width, x, y, color):
    """Store ``color`` at pixel (x, y) of a packed RGB buffer.

    Coordinates that are negative, at or beyond ``width``, or whose bytes
    would fall past the end of ``buf`` are ignored without error.
    """
    if x < 0 or y < 0 or x >= width:
        return
    offset = (y * width + x) * 3
    # No height is passed in, so the lower edge is enforced by requiring the
    # pixel's last byte to lie inside the buffer.
    if offset + 2 < len(buf):
        buf[offset:offset + 3] = bytes(color)


def draw_rect(buf, width, height, x, y, w, h, color):
    """Fill the rectangle with top-left (x, y) and size w x h with ``color``.

    Edges are truncated to integers and clipped to the canvas; a rectangle
    with no area after clipping draws nothing.
    """
    cols = range(max(0, int(x)), min(width, int(x + w)))
    rows = range(max(0, int(y)), min(height, int(y + h)))
    for row in rows:
        for col in cols:
            _set_pixel(buf, width, col, row, color)


def draw_line(buf, width, height, x1, y1, x2, y2, color):
    """Draw a one-pixel-wide line from (x1, y1) to (x2, y2), ends included.

    Uses integer error-accumulation stepping (Bresenham-style): each
    iteration plots one pixel and advances in x, in y, or in both. Pixels
    outside the ``width`` x ``height`` canvas are skipped; the walk itself
    still runs over the full segment.

    Args:
        buf: Packed RGB buffer, modified in place.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        x1, y1: Start point; truncated to integers.
        x2, y2: End point; truncated to integers.
        color: Colour passed through to ``_set_pixel``.
    """
    # Work on integer pixel coordinates; fractional inputs are truncated.
    x1 = int(x1); y1 = int(y1); x2 = int(x2); y2 = int(y2)
    dx = abs(x2 - x1)
    # dy is stored negated so one accumulator (err = dx + dy) can be compared
    # against both deltas in the loop below.
    dy = -abs(y2 - y1)
    # Step direction along each axis.
    sx = 1 if x1 < x2 else -1
    sy = 1 if y1 < y2 else -1
    err = dx + dy
    while True:
        # Plot only when the current point lies on the canvas.
        if 0 <= x1 < width and 0 <= y1 < height:
            _set_pixel(buf, width, x1, y1, color)
        # The end point is plotted above before the loop stops.
        if x1 == x2 and y1 == y2:
            break
        # Both tests use the error value from before either update.
        e2 = 2 * err
        if e2 >= dy:
            err += dy
            x1 += sx
        if e2 <= dx:
            err += dx
            y1 += sy


def save_bar_png(labels, values, path, width=900, height=450):
    """Render ``values`` as a plain bar chart and save it to ``path`` as PNG.

    The chart has two black axis lines and one orange bar per value; no text
    is drawn. ``labels`` is not rendered: only its length is used, to decide
    how wide each bar slot is.

    Args:
        labels: Sequence with one entry per bar slot.
        values: Bar heights. Bars are scaled so the largest positive value
            fills the plot area; zero and negative values draw no bar.
        path: Destination file path.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        None. Nothing is written when ``labels`` or ``values`` is empty.
    """
    if not labels or not values:
        return

    # Layout, in pixels.
    axis_x = 60                # x position of the vertical axis
    baseline = height - 50     # y position of the horizontal axis
    plot_height = height - 90  # height of the tallest bar
    plot_width = width - 90    # horizontal space shared by all bar slots
    black = (0, 0, 0)
    orange = (241, 143, 1)

    # Scale against the largest positive value. Falling back to 1 keeps an
    # all-zero series from dividing by zero and an all-negative series from
    # flipping the sign of every bar height.
    max_val = max(max(values), 0) or 1

    buf = _blank_canvas(width, height)
    draw_line(buf, width, height, axis_x, baseline, width - 20, baseline, black)
    draw_line(buf, width, height, axis_x, 30, axis_x, baseline, black)

    bar_w = plot_width / len(labels)
    for i, val in enumerate(values):
        h = (val / max_val) * plot_height
        x = axis_x + i * bar_w
        y = baseline - h
        # Leave a 2-pixel gap between bars, but never go below 1 pixel wide.
        draw_rect(buf, width, height, x, y, max(1, bar_w - 2), h, orange)

    save_png(path, width, height, buf)


def strip_accents(s: str) -> str:
    """Return ``s`` without its nonspacing combining marks.

    The text is decomposed with NFD so that an accented letter becomes a base
    letter followed by its mark; characters in Unicode category ``Mn`` are
    then dropped. The result is left in decomposed form.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)



def normalize(s: str) -> str:
    """Prepare text for keyword matching.

    Lowercases ``s``, removes accents, and collapses every run of whitespace
    (including newlines) into a single space. Leading and trailing
    whitespace is collapsed too, not removed.
    """
    lowered = s.lower()
    unaccented = strip_accents(lowered)
    return re.sub(r'\s+', ' ', unaccented)

# For every sampled document, record a 0/1 flag per rhetorical label: 1 when
# at least one of the label's keywords occurs in the normalized text.
rows = []
for item in SAMPLE:
    raw_text = Path(item['path']).read_text(encoding='utf-8', errors='ignore')
    doc_text = normalize(raw_text)
    row = {'doc_id': item['doc_id']}
    row.update({
        label: int(any(key in doc_text for key in keys))
        for label, keys in LABELS.items()
    })
    rows.append(row)

# Persist the per-document label matrix. The header is derived from LABELS
# instead of rows[0], so an empty sample produces a header-only CSV rather
# than an IndexError; the column order (doc_id, then the labels in LABELS
# order) matches the key order of every row.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with OUT_CSV.open('w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=['doc_id', *LABELS])
    w.writeheader()
    w.writerows(rows)

# Count, for each label, how many documents were flagged with it.

totals = {k: sum(int(r[k]) for r in rows) for k in LABELS}

n_docs = len(rows)

# Assemble the whole Markdown report in memory, then write it in one call.
report_lines = [
    '# Distribucion Retorica (heuristica rapida, muestra)\n\n',
    f'Documentos analizados: **{n_docs}**\n\n',
]
for k in sorted(totals.keys()):
    # max(..., 1) keeps the percentage defined when no documents were read.
    pct = (totals[k] / max(n_docs, 1)) * 100
    report_lines.append(f'- {k}: {totals[k]} documentos ({pct:.1f}%)\n')
report_lines.append('\nNota: deteccion por palabras clave (heuristica rapida).\n')

OUT_MD.parent.mkdir(parents=True, exist_ok=True)
with OUT_MD.open('w', encoding='utf-8') as f:
    f.writelines(report_lines)

print('Wrote', OUT_CSV, 'and', OUT_MD)

# Bar chart of per-label document counts, bars ordered alphabetically by
# label name.
FIG_DIR.mkdir(parents=True, exist_ok=True)
labels = sorted(totals)
values = [totals[name] for name in labels]
fig_path = FIG_DIR / 'rhetorical_presence_sample_10000.png'
save_bar_png(labels, values, fig_path)
print('Saved', fig_path)
