# Contribution phrase patterns (expanded)
# Outputs: artifacts/reports/contributions_sample_10000.md, .csv
# Figures: artifacts/figures/contribution_phrases_sample_10000.png


In [None]:
import csv
import re
from pathlib import Path

# Input sample listing and output locations, all relative to the working
# directory the script is run from.
SAMPLE_CSV = Path('artifacts/samples/sample_10000.csv')
OUT_CSV = Path('artifacts/reports/contributions_sample_10000.csv')
OUT_MD = Path('artifacts/reports/contributions_sample_10000.md')
FIG_DIR = Path('artifacts/figures')

# Phrases counted in each document. They are written lowercase and without
# accents because they are compared against text that has been lowercased
# and accent-stripped before counting.
PHRASES = [
    'este trabajo propone',
    'en este articulo se presenta',
    'se propone un nuevo enfoque',
    'este estudio introduce',
    'la principal contribucion',
    'se desarrolla un metodo',
    'los resultados obtenidos demuestran',
    'este trabajo aporta evidencia',
    'se presenta una metodologia novedosa',
    'a diferencia de trabajos previos',
    'en este trabajo presentamos',
    'en este trabajo se presenta',
    'presentamos un metodo',
    'proponemos un metodo',
    'proponemos un enfoque',
    'se presenta un metodo',
    'se presenta un enfoque',
    'se propone una metodologia',
    'este articulo presenta',
    'este articulo propone',
    'este trabajo introduce',
    'en el presente trabajo',
    'nuestra contribucion',
    'nuestras contribuciones',
    'este trabajo contribuye',
    'este estudio aporta',
]

# Load the whole sample listing into memory as a list of dict rows.
# The `with` block closes the file handle deterministically, and newline=''
# lets the csv module do its own newline handling (needed for quoted fields
# that contain line breaks).
with SAMPLE_CSV.open('r', encoding='utf-8', newline='') as sample_file:
    SAMPLE = list(csv.DictReader(sample_file))


import struct
import zlib
import unicodedata

def _png_chunk(tag, data):
    return (len(data)).to_bytes(4, 'big') + tag + data + zlib.crc32(tag + data).to_bytes(4, 'big')


def save_png(path, width, height, rgb_bytes):
    """Write a packed RGB buffer to ``path`` as a PNG file.

    ``rgb_bytes`` is read as ``height`` consecutive rows of ``width * 3``
    bytes each. Parent directories of ``path`` are created when missing.
    """
    stride = width * 3
    scanlines = bytearray()
    for row in range(height):
        offset = row * stride
        # Each scanline is prefixed with a filter-type byte; 0 = no filter.
        scanlines.append(0)
        scanlines.extend(rgb_bytes[offset:offset + stride])
    idat_payload = zlib.compress(bytes(scanlines), level=9)
    # IHDR: width, height, bit depth 8, colour type 2, then compression,
    # filter and interlace methods all 0.
    header = struct.pack('>IIBBBBB', width, height, 8, 2, 0, 0, 0)
    pieces = [
        b'\x89PNG\r\n\x1a\n',
        _png_chunk(b'IHDR', header),
        _png_chunk(b'IDAT', idat_payload),
        _png_chunk(b'IEND', b''),
    ]
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b''.join(pieces))


def _blank_canvas(width, height, color=(255, 255, 255)):
    r, g, b = color
    return bytearray([r, g, b] * width * height)


def _set_pixel(buf, width, x, y, color):
    if x < 0 or y < 0:
        return
    if x >= width:
        return
    idx = (y * width + x) * 3
    if idx + 2 >= len(buf):
        return
    buf[idx:idx+3] = bytes(color)


def draw_rect(buf, width, height, x, y, w, h, color):
    """Fill an axis-aligned rectangle with ``color``.

    ``(x, y)`` is the top-left corner and ``w`` / ``h`` the extent; values
    are truncated to integers and the rectangle is clipped to the
    ``width`` x ``height`` canvas before any pixel is written.
    """
    left = max(0, int(x))
    top = max(0, int(y))
    right = min(width, int(x + w))
    bottom = min(height, int(y + h))
    for row in range(top, bottom):
        for col in range(left, right):
            _set_pixel(buf, width, col, row, color)


def draw_line(buf, width, height, x1, y1, x2, y2, color):
    """Draw a one-pixel-wide line from ``(x1, y1)`` to ``(x2, y2)``.

    Endpoints are truncated to integers and the line is walked with an
    integer error-accumulation (Bresenham-style) loop. Points outside the
    ``width`` x ``height`` canvas are stepped over without being drawn.
    """
    cur_x, cur_y = int(x1), int(y1)
    end_x, end_y = int(x2), int(y2)
    span_x = abs(end_x - cur_x)
    span_y = -abs(end_y - cur_y)  # kept negative so one error term serves both axes
    step_x = 1 if cur_x < end_x else -1
    step_y = 1 if cur_y < end_y else -1
    error = span_x + span_y
    while True:
        if 0 <= cur_x < width and 0 <= cur_y < height:
            _set_pixel(buf, width, cur_x, cur_y, color)
        if (cur_x, cur_y) == (end_x, end_y):
            return
        doubled = 2 * error
        if doubled >= span_y:
            error += span_y
            cur_x += step_x
        if doubled <= span_x:
            error += span_x
            cur_y += step_y


def save_barh_png(labels, values, path, width=900, height=450):
    """Render ``values`` as a horizontal bar chart and save it as a PNG.

    ``labels`` only determines the number of bar slots; no text is drawn.
    Bar lengths are scaled so the largest value spans the plot width.
    Nothing is written when ``labels`` is empty.
    """
    if not labels:
        return
    # Guard against dividing by zero when every value is 0.
    max_val = max(values) or 1
    buf = _blank_canvas(width, height)

    black = (0, 0, 0)
    bar_color = (76, 175, 80)
    axis_x = 160
    axis_y = height - 50
    # Horizontal axis along the bottom, then vertical axis on the left.
    draw_line(buf, width, height, axis_x, axis_y, width - 20, axis_y, black)
    draw_line(buf, width, height, axis_x, 30, axis_x, axis_y, black)

    slot_h = (height - 90) / len(labels)
    thickness = max(1, slot_h - 4)  # 4px gap between bars, never thinner than 1px
    for index, val in enumerate(values):
        bar_len = (val / max_val) * (width - 220)
        bar_top = 40 + index * slot_h
        draw_rect(buf, width, height, axis_x, bar_top, bar_len, thickness, bar_color)

    save_png(path, width, height, buf)


def strip_accents(s: str) -> str:
    """Return ``s`` with combining marks removed.

    The string is decomposed to NFD and every character in Unicode category
    ``Mn`` (nonspacing mark) is dropped, so ``'ó'`` becomes ``'o'`` and
    ``'ñ'`` becomes ``'n'``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


# Normalize for matching

def normalize(s: str) -> str:
    """Lowercase ``s``, strip accents, and collapse each whitespace run to one space."""
    unaccented = strip_accents(s.lower())
    return re.sub(r'\s+', ' ', unaccented)

# Count every phrase in every sampled document, keeping one row per document
# plus running totals per phrase.
rows = []
phrase_totals = {p: 0 for p in PHRASES}

for item in SAMPLE:
    path = Path(item['path'])
    # errors='ignore' drops undecodable bytes instead of failing the run.
    text = path.read_text(encoding='utf-8', errors='ignore')
    t = normalize(text)
    # Non-overlapping substring occurrences of each phrase in the normalized text.
    counts = {phrase: t.count(phrase) for phrase in PHRASES}
    for phrase, n in counts.items():
        phrase_totals[phrase] += n
    total_hits = sum(counts.values())
    rows.append({'doc_id': item['doc_id'], 'total_hits': total_hits, **counts})

# Column order is fixed up front instead of being read from rows[0], so an
# empty sample produces a header-only CSV rather than raising IndexError.
# For a non-empty sample this is the same order the row dicts are built in.
fieldnames = ['doc_id', 'total_hits', *PHRASES]

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with OUT_CSV.open('w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    w.writerows(rows)

# Markdown summary: document count, then phrases ordered by total frequency.
OUT_MD.parent.mkdir(parents=True, exist_ok=True)
with OUT_MD.open('w', encoding='utf-8') as f:
    f.write('# Patrones de Contribuciones (muestra)\n\n')
    f.write(f'Documentos analizados: **{len(rows)}**\n\n')
    f.write('## Frecuencia de frases\n')
    for phrase, total in sorted(phrase_totals.items(), key=lambda x: x[1], reverse=True):
        f.write(f'- "{phrase}": {total}\n')

print('Wrote', OUT_CSV, 'and', OUT_MD)

# Bar chart of per-phrase totals, one bar per phrase in PHRASES order.
FIG_DIR.mkdir(parents=True, exist_ok=True)
fig_path = FIG_DIR / 'contribution_phrases_sample_10000.png'
labels = list(phrase_totals.keys())
values = [phrase_totals[p] for p in labels]
save_barh_png(labels, values, fig_path)
print('Saved', fig_path)
