# Descriptive stats for sample_10000
# Outputs: artifacts/reports/descriptive_sample_10000.md, .csv
# Figures: artifacts/figures/hist_words_sample_10000.png


In [None]:
import csv
import statistics as stats
from pathlib import Path

# Input manifest and output locations, all relative to the working directory.
SAMPLE_CSV = Path('artifacts/samples/sample_10000.csv')
OUT_CSV = Path('artifacts/reports/descriptive_sample_10000.csv')
OUT_MD = Path('artifacts/reports/descriptive_sample_10000.md')
FIG_DIR = Path('artifacts/figures')

# Load the whole manifest into memory as a list of dicts keyed by the CSV
# header. The file is opened in a ``with`` block so the handle is closed as
# soon as the rows are read, and with newline='' as the csv module requires
# for correct handling of newlines embedded in quoted fields.
with SAMPLE_CSV.open('r', newline='', encoding='utf-8') as _sample_file:
    SAMPLE = list(csv.DictReader(_sample_file))


import struct
import zlib
import unicodedata

def _png_chunk(tag, data):
    """Serialize a single PNG chunk.

    The result is the 4-byte big-endian length of ``data``, followed by the
    chunk type ``tag``, the payload itself, and the 4-byte big-endian CRC-32
    computed over ``tag + data``.
    """
    payload = tag + data
    size_field = len(data).to_bytes(4, 'big')
    crc_field = zlib.crc32(payload).to_bytes(4, 'big')
    return b''.join((size_field, payload, crc_field))


def save_png(path, width, height, rgb_bytes):
    """Write raw RGB pixel data to ``path`` as a PNG file.

    Args:
        path: Destination file; missing parent directories are created.
        width: Image width in pixels.
        height: Image height in pixels.
        rgb_bytes: Bytes-like object of length ``width * height * 3`` holding
            the pixels row by row, three bytes (R, G, B) per pixel.
    """
    stride = width * 3

    # Each scanline in the image data is prefixed with a filter-type byte;
    # 0 means the row is stored unfiltered.
    scanlines = bytearray()
    for row in range(height):
        offset = row * stride
        scanlines.append(0)
        scanlines.extend(rgb_bytes[offset:offset + stride])

    image_data = zlib.compress(bytes(scanlines), level=9)
    # IHDR fields after the dimensions: bit depth 8, colour type 2 (RGB),
    # then compression, filter and interlace methods all 0.
    header = struct.pack('>IIBBBBB', width, height, 8, 2, 0, 0, 0)

    png = b''.join((
        b'\x89PNG\r\n\x1a\n',
        _png_chunk(b'IHDR', header),
        _png_chunk(b'IDAT', image_data),
        _png_chunk(b'IEND', b''),
    ))

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(png)


def _blank_canvas(width, height, color=(255, 255, 255)):
    """Return a new RGB pixel buffer filled with ``color``.

    Args:
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        color: ``(r, g, b)`` triple of byte values; defaults to white.

    Returns:
        A ``bytearray`` of ``width * height * 3`` bytes, rows stored top to
        bottom with three bytes per pixel.
    """
    r, g, b = color
    # Repeat a 3-byte pattern directly instead of first building a Python
    # list with one int object reference per byte of the image.
    return bytearray((r, g, b)) * width * height


def _set_pixel(buf, width, x, y, color):
    """Write ``color`` at pixel ``(x, y)`` of an RGB buffer, if it is inside.

    ``buf`` holds three bytes per pixel in rows of ``width`` pixels. The call
    does nothing when ``x`` or ``y`` is negative, when ``x`` is not below
    ``width``, or when the pixel's three bytes would extend past the end of
    ``buf``.
    """
    if x < 0 or y < 0 or x >= width:
        return
    offset = (y * width + x) * 3
    last_byte = offset + 2
    if last_byte >= len(buf):
        return
    buf[offset:offset + 3] = bytes(color)


def draw_rect(buf, width, height, x, y, w, h, color):
    """Fill an axis-aligned rectangle in an RGB pixel buffer.

    The rectangle covers pixel columns ``int(x)`` up to (not including)
    ``int(x + w)`` and rows ``int(y)`` up to (not including) ``int(y + h)``,
    clipped to the ``width`` x ``height`` canvas. An empty or fully
    off-canvas rectangle leaves ``buf`` untouched.

    Args:
        buf: Mutable buffer with three bytes per pixel in rows of ``width``
            pixels; expected to hold ``width * height`` pixels.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        x, y: Top-left corner; may be fractional.
        w, h: Rectangle size; may be fractional.
        color: ``(r, g, b)`` triple of byte values.
    """
    x0 = max(0, int(x))
    y0 = max(0, int(y))
    x1 = min(width, int(x + w))
    y1 = min(height, int(y + h))
    if x1 <= x0 or y1 <= y0:
        return

    # Build one row of the fill once and blit it per scanline with a slice
    # assignment, instead of making a Python call for every pixel.
    row = bytes(color) * (x1 - x0)
    for yy in range(y0, y1):
        start = (yy * width + x0) * 3
        end = start + len(row)
        if end > len(buf):
            # Never grow an undersized buffer; later rows cannot fit either.
            break
        buf[start:end] = row


def draw_line(buf, width, height, x1, y1, x2, y2, color):
    """Draw a one-pixel-wide line from ``(x1, y1)`` to ``(x2, y2)``.

    Coordinates are truncated with ``int()``. Both end points are included,
    and points outside the ``width`` x ``height`` canvas are skipped rather
    than drawn. The stepping uses an integer error term (Bresenham-style), so
    no floating-point arithmetic is involved after the truncation.
    """
    cur_x, cur_y = int(x1), int(y1)
    end_x, end_y = int(x2), int(y2)

    span_x = abs(end_x - cur_x)
    span_y = abs(end_y - cur_y)
    step_x = 1 if cur_x < end_x else -1
    step_y = 1 if cur_y < end_y else -1

    # Running error: positive contributions from x extent, negative from y.
    error = span_x - span_y
    while True:
        if 0 <= cur_x < width and 0 <= cur_y < height:
            _set_pixel(buf, width, cur_x, cur_y, color)
        if (cur_x, cur_y) == (end_x, end_y):
            return
        # Both tests use the error value from before either adjustment.
        doubled = 2 * error
        if doubled >= -span_y:
            error -= span_y
            cur_x += step_x
        if doubled <= span_x:
            error += span_x
            cur_y += step_y


def save_hist_png(values, bins, path, width=900, height=450, title=None):
    """Render a histogram of ``values`` and save it as a PNG at ``path``.

    The range ``[min(values), max(values)]`` is split into ``bins``
    equal-width bins, with the maximum value counted in the last bin. Bar
    heights are scaled so the fullest bin spans the plot height. Nothing is
    written when ``values`` is empty.

    Args:
        values: Sequence of numbers.
        bins: Number of bins; must be a positive integer.
        path: Output file path.
        width: Image width in pixels.
        height: Image height in pixels.
        title: Accepted for interface compatibility; not rendered.
    """
    if not values:
        return
    vmin = min(values)
    vmax = max(values)
    if vmin == vmax:
        # All values equal: widen the range so the bin width is non-zero.
        vmax = vmin + 1
    step = (vmax - vmin) / bins
    counts = [0] * bins
    last_bin = bins - 1
    for v in values:
        # v == vmax maps to index ``bins``; clamp it into the last bin.
        counts[min(int((v - vmin) / step), last_bin)] += 1
    max_count = max(counts) or 1

    buf = _blank_canvas(width, height)

    bar_w = (width - 90) / bins
    for i, c in enumerate(counts):
        h = (c / max_count) * (height - 90)
        x = 60 + i * bar_w
        y = (height - 50) - h
        draw_rect(buf, width, height, x, y, max(1, bar_w-1), h, (46,134,171))

    # Axes are drawn after the bars: the first bar starts in the same pixel
    # column as the y-axis (x = 60) and would otherwise paint over it.
    draw_line(buf, width, height, 60, height-50, width-20, height-50, (0,0,0))
    draw_line(buf, width, height, 60, 30, 60, height-50, (0,0,0))

    save_png(path, width, height, buf)


def save_bar_png(labels, values, path, width=900, height=450):
    """Render a vertical bar chart of ``values`` and save it as a PNG.

    One bar is drawn per entry of ``values``; bar width is derived from
    ``len(labels)``, so both sequences are expected to have the same length.
    Bar heights are scaled relative to the largest value. The labels
    themselves are not rendered. Nothing is written when ``labels`` or
    ``values`` is empty.

    Args:
        labels: Sequence with one entry per bar (used only for its length).
        values: Sequence of bar values.
        path: Output file path.
        width: Image width in pixels.
        height: Image height in pixels.
    """
    # Empty values would make max() raise; treat it like empty labels.
    if not labels or len(values) == 0:
        return
    max_val = max(values) or 1
    buf = _blank_canvas(width, height)

    n = len(labels)
    bar_w = (width - 90) / n
    for i, val in enumerate(values):
        h = (val / max_val) * (height - 90)
        x = 60 + i * bar_w
        y = (height - 50) - h
        draw_rect(buf, width, height, x, y, max(1, bar_w-2), h, (241,143,1))

    # Axes are drawn after the bars: the first bar starts in the same pixel
    # column as the y-axis (x = 60) and would otherwise paint over it.
    draw_line(buf, width, height, 60, height-50, width-20, height-50, (0,0,0))
    draw_line(buf, width, height, 60, 30, 60, height-50, (0,0,0))

    save_png(path, width, height, buf)


def save_barh_png(labels, values, path, width=900, height=450):
    """Render a horizontal bar chart of ``values`` and save it as a PNG.

    One bar is drawn per entry of ``values``, stacked top to bottom; bar
    thickness is derived from ``len(labels)``, so both sequences are expected
    to have the same length. Bar lengths are scaled relative to the largest
    value. The labels themselves are not rendered. Nothing is written when
    ``labels`` or ``values`` is empty.

    Args:
        labels: Sequence with one entry per bar (used only for its length).
        values: Sequence of bar values.
        path: Output file path.
        width: Image width in pixels.
        height: Image height in pixels.
    """
    # Empty values would make max() raise; treat it like empty labels.
    if not labels or len(values) == 0:
        return
    max_val = max(values) or 1
    buf = _blank_canvas(width, height)

    n = len(labels)
    bar_h = (height - 90) / n
    for i, val in enumerate(values):
        w = (val / max_val) * (width - 220)
        y = 40 + i * bar_h
        draw_rect(buf, width, height, 160, y, w, max(1, bar_h-4), (76,175,80))

    # Axes are drawn after the bars: every bar starts in the same pixel
    # column as the vertical axis (x = 160) and would otherwise paint over it.
    draw_line(buf, width, height, 160, height-50, width-20, height-50, (0,0,0))
    draw_line(buf, width, height, 160, 30, 160, height-50, (0,0,0))

    save_png(path, width, height, buf)


def strip_accents(s: str) -> str:
    """Return ``s`` without its non-spacing combining marks.

    The string is decomposed with Unicode NFD normalization and every
    character whose general category is ``Mn`` is dropped; all other
    characters are kept in their decomposed form.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)



def text_stats(text: str):
    """Compute size and character-class metrics for one document.

    Returns a dict with these keys, in this order:

    - ``chars``: number of characters in ``text``.
    - ``words``: number of whitespace-separated tokens.
    - ``lines``: number of lines as split by ``str.splitlines``.
    - ``avg_line_len``: mean line length (0 when there are no lines).
    - ``nonascii_ratio``, ``digit_ratio``, ``alpha_ratio``, ``space_ratio``:
      share of characters with code point above 127, and for which
      ``isdigit`` / ``isalpha`` / ``isspace`` is true, respectively.
    - ``empty_line_ratio``: share of lines that are empty or whitespace-only.

    Ratios use a denominator of at least 1, so empty input yields zeros.
    """
    rows = text.splitlines()
    n_chars = len(text)
    n_lines = len(rows)

    # An empty document still gets one zero-length entry so the mean is 0.
    lengths = [len(row) for row in rows] if rows else [0]
    mean_len = sum(lengths) / len(lengths)

    # Classify every character in a single pass.
    n_nonascii = n_digit = n_alpha = n_space = 0
    for ch in text:
        if ord(ch) > 127:
            n_nonascii += 1
        if ch.isdigit():
            n_digit += 1
        if ch.isalpha():
            n_alpha += 1
        if ch.isspace():
            n_space += 1

    n_blank = sum(not row.strip() for row in rows)
    char_denom = max(n_chars, 1)
    line_denom = max(n_lines, 1)

    result = {}
    result['chars'] = n_chars
    result['words'] = len(text.split())
    result['lines'] = n_lines
    result['avg_line_len'] = mean_len
    result['nonascii_ratio'] = n_nonascii / char_denom
    result['digit_ratio'] = n_digit / char_denom
    result['alpha_ratio'] = n_alpha / char_denom
    result['space_ratio'] = n_space / char_denom
    result['empty_line_ratio'] = n_blank / line_denom
    return result


def safe_stats(values):
    """Summarize a non-empty sequence of numbers.

    Returns a dict with ``min``, ``p25``, ``median``, ``p75``, ``max`` and
    ``mean``. With fewer than four values the quartiles are not computed;
    ``p25`` falls back to the minimum and ``p75`` to the maximum.

    Raises:
        ValueError: If ``values`` is empty.
    """
    lowest = min(values)
    highest = max(values)
    if len(values) >= 4:
        # Compute the quartile cut points once; each call sorts the data.
        p25, _, p75 = stats.quantiles(values, n=4)
    else:
        p25, p75 = lowest, highest
    return {
        'min': lowest,
        'p25': p25,
        'median': stats.median(values),
        'p75': p75,
        'max': highest,
        'mean': stats.mean(values),
    }


# Per-document metrics: read every file listed in the sample manifest
# (undecodable bytes are dropped) and attach its text statistics to its id.
rows = []
for entry in SAMPLE:
    doc_text = Path(entry['path']).read_text(encoding='utf-8', errors='ignore')
    metrics = text_stats(doc_text)
    record = {'doc_id': entry['doc_id']}
    record.update(metrics)
    rows.append(record)

# Dump the per-document table; the column order follows the first record.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with OUT_CSV.open('w', newline='', encoding='utf-8') as csv_file:
    columns = list(rows[0])
    writer = csv.DictWriter(csv_file, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

# Pull out one series per length metric across all documents.
words, chars, lines = (
    [row[metric] for row in rows] for metric in ('words', 'chars', 'lines')
)

# Distribution summary (min / quartiles / max / mean) for each series.
summary = {
    name: safe_stats(series)
    for name, series in (('words', words), ('chars', chars), ('lines', lines))
}


# Markdown report: document count, length summary, the ten longest documents
# by word count, and a pointer to the histogram figure.
OUT_MD.parent.mkdir(parents=True, exist_ok=True)
with OUT_MD.open('w', encoding='utf-8') as report:
    report.write('# Analisis Descriptivo (muestra)\n\n')
    report.write(f'Documentos analizados: **{len(rows)}**\n\n')

    # One bullet per metric in the summary.
    report.write('## Longitud\n')
    for metric, st in summary.items():
        report.write(
            f"- {metric}: min={st['min']}, p25={st['p25']:.1f}, "
            f"median={st['median']:.1f}, p75={st['p75']:.1f}, "
            f"max={st['max']}, mean={st['mean']:.1f}\n"
        )

    report.write('\n## Documentos mas largos (por palabras)\n')
    ranked = sorted(rows, key=lambda row: row['words'], reverse=True)
    for doc in ranked[:10]:
        report.write(f"- {doc['doc_id']}: {doc['words']} palabras\n")

    report.write('\n## Graficas\n')
    report.write('- Histograma de palabras: artifacts/figures/hist_words_sample_10000.png\n')

print(f'Wrote {OUT_CSV} and {OUT_MD}')

# Histogram of word counts, 20 bins.
FIG_DIR.mkdir(parents=True, exist_ok=True)
fig_path = FIG_DIR / 'hist_words_sample_10000.png'
save_hist_png(words, path=fig_path, bins=20)
print(f'Saved {fig_path}')
