# RPM Build Log Analyzer
Этот ноутбук содержит функции для быстрого извлечения ключевых ошибок из длинных логов сборки (RPM и др.).  
Укажите в переменной `log_path` путь к файлу и выполните ячейки: вы получите краткую сводку, «ужатый» лог и при необходимости нормализованный текст для LLM.

In [1]:

import collections
import os
import re
import textwrap
from pathlib import Path
from typing import List, Tuple, Sequence, Iterable


In [2]:

# ---  Patterns used to spot the important lines of a build log  ----------
# Maps a finding tag to its compiled regex. Every pattern is compiled
# case-insensitively, and the insertion order below is the order in which
# the tags are tried against each line.
KEY_PATTERNS = {
    tag: re.compile(source, re.IGNORECASE)
    for tag, source in (
        ("rpm_error", r"^RPM build errors:"),
        ("bad_exit_status", r"Bad exit status.*\(%\w+\)"),
        ("command_nonzero", r"Command exited with non-zero status"),
        ("cmake_error", r"\bCMake Error\b"),
        ("macro_not_found", r"Macro .* not found"),
        ("generic_error", r"\berror:"),
        ("python_traceback", r"^\s*Traceback "),
        ("segfault", r"Segmentation fault"),
    )
}

# Matched against the start of a line (after optional leading whitespace):
# "#<n>" frame markers, "at <something>" frames, 'File "' entries,
# hexadecimal addresses of six or more digits, and "panic:" prefixes.
STACKTRACE_LINE = re.compile(
    r'^\s*(#\d+|at\s+\S+|File\s+\"|0x[0-9a-f]{6,}|panic:|   at )',
    flags=re.IGNORECASE,
)


In [3]:

def _grab_range(index: int, ctx: int, total: int) -> range:
    return range(max(0, index - ctx), min(total, index + ctx + 1))

def _uniq_keep_order(seq: Iterable[int]):
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]


In [4]:

# --- Text normalization for LLM input ------------------------------------
# (pattern, placeholder) pairs, applied one after another in this order.
# The generic number rule comes after the date and timing rules, which also
# contain digits; whitespace is collapsed last.
NORMALIZERS: Sequence[Tuple[re.Pattern[str], str]] = tuple(
    (re.compile(source), placeholder)
    for source, placeholder in (
        (r"/\S+", "<PATH>"),
        (r"[A-Z][a-z]{2}\s+\d{1,2}\s+\d\d:\d\d:\d\d", "<DATE>"),
        (r"\d+\.\d+(user|system|elapsed)", "<TIME>"),
        (r"\b\d+\b", "<NUM>"),
        (r"\s+", " "),
    )
)


def prepare_for_llm(text: str) -> str:
    """Normalize log text by replacing volatile fragments with placeholders.

    Each rule in ``NORMALIZERS`` is applied to the whole text in turn; the
    result is then lower-cased (placeholders included) and stripped of
    leading and trailing whitespace.
    """
    normalized = text
    for pattern, placeholder in NORMALIZERS:
        normalized = pattern.sub(placeholder, normalized)
    normalized = normalized.lower()
    return normalized.strip()


In [5]:

def analyze_log(lines: List[str], ctx: int = 5):
    """Summarize a build log and cut it down to the lines around failures.

    A line is "interesting" when it matches any pattern in ``KEY_PATTERNS``
    or looks like a stack-trace frame (``STACKTRACE_LINE``). Every
    interesting line is kept together with ``ctx`` lines before and after
    it.

    Args:
        lines: The log as a list of lines; they are joined back verbatim,
            so any line terminators they carry are preserved.
        ctx: Number of context lines kept on each side of a hit.

    Returns:
        A ``(summary, trimmed)`` tuple: ``summary`` is a multi-line text
        report (sizes, per-tag hit counts, failed SRPMs, missing macros)
        and ``trimmed`` is the concatenation of the retained lines in
        their original order.
    """
    total = len(lines)
    keep = set()
    findings = collections.defaultdict(list)

    for idx, raw in enumerate(lines):
        matched_tags = [
            tag for tag, pattern in KEY_PATTERNS.items() if pattern.search(raw)
        ]
        for tag in matched_tags:
            findings[tag].append((idx, raw.rstrip()))
        if matched_tags or STACKTRACE_LINE.match(raw):
            keep.update(_grab_range(idx, ctx, total))

    ordered_indices = _uniq_keep_order(sorted(keep))
    trimmed = "".join(lines[idx] for idx in ordered_indices)

    def _captured(pattern):
        # Sorted, de-duplicated first capture group over all lines.
        values = set()
        for raw in lines:
            hit = pattern.search(raw)
            if hit:
                values.add(hit.group(1))
        return sorted(values)

    retained = len(keep)
    share = retained / max(1, total) * 100
    summary_lines = [
        f"Log length: {total:,} lines",
        f"Retained after trimming: {retained:,} lines ({share:.1f} %)",
    ]
    summary_lines.extend(
        f"{tag:20s}: {len(matches)}" for tag, matches in findings.items()
    )

    failed_pkgs = _captured(re.compile(r"hsh-rebuild: rebuild of `([^']+)", re.I))
    if failed_pkgs:
        summary_lines.append("\nFailed SRPMs:")
        for pkg in failed_pkgs:
            summary_lines.append("  • " + pkg)

    missing_macros = _captured(re.compile(r"Macro (%\S+) not found", re.I))
    if missing_macros:
        summary_lines.append("\nMissing RPM macros:")
        for macro in missing_macros:
            summary_lines.append("  • " + macro)

    return "\n".join(summary_lines), trimmed


In [6]:

# --- Usage example ---------------------------------------------------------
log_path = "/home/user/Projects/hackathon/ml_preset/download_logs/latest/error/alt-docs-apache2-0.2.2-alt1"   # <-- change this to the path of your log file
ctx_lines = 15            # number of context lines kept around each error

if Path(log_path).is_file():
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding; undecodable bytes are substituted
    # (errors="replace") instead of aborting the read.
    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    summary, trimmed = analyze_log(lines, ctx_lines)
    print("="*80)
    print("SUMMARY")
    print("="*80)
    print(summary)
    print("="*80)
    print("TRIMMED LOG")
    print("="*80)
    # Only the first 5000 characters are shown to keep the cell output short.
    print(trimmed[:5000] + ("...\n[truncated]" if len(trimmed) > 5000 else ""))
else:
    print(f"Файл не найден: {log_path}")


SUMMARY
Log length: 124 lines
Retained after trimming: 72 lines (58.1 %)
macro_not_found     : 5
cmake_error         : 1
bad_exit_status     : 2
generic_error       : 1
rpm_error           : 1
command_nonzero     : 2

Failed SRPMs:
  • alt-docs-apache2-0.2.2-alt1.src.rpm

Missing RPM macros:
  • %apache2_extra_available
  • %apache2_extra_start
  • %apache2_mods_start
  • %post_apache2conf
TRIMMED LOG
<86>May 16 00:51:40 userdel[1101603]: delete user 'rooter'
<86>May 16 00:51:40 userdel[1101603]: removed group 'rooter' owned by 'rooter'
<86>May 16 00:51:40 userdel[1101603]: removed shadow group 'rooter' owned by 'rooter'
<86>May 16 00:51:40 groupadd[1101610]: group added to /etc/group: name=rooter, GID=1271
<86>May 16 00:51:40 groupadd[1101610]: group added to /etc/gshadow: name=rooter
<86>May 16 00:51:40 groupadd[1101610]: new group: name=rooter, GID=1271
<86>May 16 00:51:40 useradd[1101618]: new user: name=rooter, UID=1271, GID=1271, home=/root, shell=/bin/bash, from=none
<86>May 16 

In [7]:

# --- Produce the normalized text for an LLM --------------------------------
# Run this after the previous cell when the "clean" text is needed.
normalized = prepare_for_llm(trimmed)
# Show at most 5000 characters, followed by a marker when the text was cut.
print(
    normalized[:5000],
    "...\n[truncated]" if len(normalized) > 5000 else "",
    sep="",
)


