In [1]:
#%pip install torch torchaudio transformers diffusers torchvision --upgrade

In [2]:
#%pip install transformers

In [3]:
#%pip install tqdm

In [4]:
#%pip install peft

In [5]:
import re
from pathlib import Path
import math
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm



In [6]:
# --- Run configuration -------------------------------------------------------
# Input table; the output is written next to it as "<stem>_with_labels.csv".
CSV_PATH = Path("logs.csv")
CSV_OUT = CSV_PATH.with_name(f"{CSV_PATH.stem}_with_labels.csv")

# Checkpoint identifier handed to the `from_pretrained` loaders.
MODEL_NAME = "byviz/bylastic_classification_logs"

# Number of log lines tokenized and scored per forward pass.
BATCH_SIZE = 1024

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [7]:
# Tokenizer and classifier are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)  # move weights to the selected device
model = model.eval()      # switch to evaluation mode for inference

2025-05-16 19:05:09.375308: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Predicted class index -> human-readable label.
# NOTE(review): presumably mirrors the checkpoint's own id-to-label mapping — confirm.
CLASS_NAMES = {
    2: "NORMAL",
    1: "WARNING",
    0: "ERROR",
}

In [9]:
# Literal "[ERROR]" marker, plus any whitespace after it and one optional colon.
TAG_RE = re.compile(r"\[ERROR\]\s*:?")
# Whitespace run at the very start or the very end of a string.
CLEAN_RE = re.compile(r"^\s+|\s+$")

In [10]:
def strip_error_tag(line: str) -> str:
    """Return ``line`` without its first ``[ERROR]`` tag and without edge whitespace.

    Only the first match of ``TAG_RE`` is removed; any later tags in the same
    line are left in place. Leading and trailing whitespace is then removed
    via ``CLEAN_RE``.
    """
    without_tag = TAG_RE.sub("", line, count=1)
    trimmed = CLEAN_RE.sub("", without_tag)
    return trimmed

In [None]:
def classify_texts(texts: list[str]) -> list[str]:
    """Label every text with a class name from ``CLASS_NAMES``.

    Entries that are empty or whitespace-only are never sent to the model and
    keep the default label ``"NORMAL"``. All other entries are tokenized and
    scored in chunks of ``BATCH_SIZE`` (truncated to 512 tokens), and the
    highest-scoring class index is mapped through ``CLASS_NAMES``.

    Args:
        texts: Strings to classify, e.g. the individual lines of one log.

    Returns:
        A list with the same length and order as ``texts`` holding one label
        per entry.

    Raises:
        KeyError: If the model predicts a class index missing from
            ``CLASS_NAMES``.
    """
    preds = ["NORMAL"] * len(texts)
    non_empty = [(i, t) for i, t in enumerate(texts) if t.strip()]
    if not non_empty:
        return preds

    # idxs[k] is the position in `texts` of to_run[k].
    idxs, to_run = zip(*non_empty)

    # range() has a length, so tqdm works out the number of batches itself.
    for start in tqdm(
        range(0, len(to_run), BATCH_SIZE),
        desc="Batch inference",
        leave=False,
    ):
        stop = start + BATCH_SIZE
        batch = tokenizer(
            list(to_run[start:stop]),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(DEVICE)

        with torch.no_grad():
            # Softmax is monotonic, so the arg-max of the raw logits is already
            # the predicted class; skipping it saves work and avoids rounding
            # near-equal logits into a tie.
            batch_pred = model(**batch).logits.argmax(dim=1).cpu().tolist()

        for pos, pred in zip(idxs[start:stop], batch_pred):
            preds[pos] = CLASS_NAMES[pred]

    return preds

# Read the input table; `log_text` is forced to str and every missing cell
# (in any column) is replaced with an empty string.
df = pd.read_csv(CSV_PATH, dtype={"log_text": str}).fillna("")

errors_col, line_nums_col = [], []

for raw_log in tqdm(df["log_text"], desc="Processing logs"):
    lines = raw_log.splitlines()

    # (1-based line number, text) for every line carrying a literal [ERROR] tag.
    explicit = [
        (number, text)
        for number, text in enumerate(lines, start=1)
        if TAG_RE.search(text)
    ]

    if not explicit:
        # No tagged lines in this log: let the classifier decide line by line.
        preds = classify_texts(lines)
        flagged = [
            (number, text)
            for number, (text, label) in enumerate(zip(lines, preds), start=1)
            if label == "ERROR"
        ]
        err_lines = [text for _, text in flagged]
        err_nums = [str(number) for number, _ in flagged]
    else:
        # Tagged lines win outright; the model is not consulted for this log.
        err_lines = [strip_error_tag(text) for _, text in explicit]
        err_nums = [str(number) for number, _ in explicit]

    # One cell per log: error lines joined by newlines, line numbers by commas.
    errors_col.append("\n".join(err_lines))
    line_nums_col.append(",".join(err_nums))

df["errors"] = errors_col
df["errors_lines_number"] = line_nums_col

df.to_csv(CSV_OUT, index=False)
print(f"✔ Итог сохранён в: {CSV_OUT}")

Processing logs:   0%|          | 0/375 [00:00<?, ?it/s]
Batch inference:   0%|          | 0/1 [00:00<?, ?it/s][A
Batch inference: 100%|██████████| 1/1 [00:04<00:00,  4.57s/it][A
Processing logs:   0%|          | 1/375 [00:04<28:36,  4.59s/it]A
Batch inference:   0%|          | 0/1 [00:00<?, ?it/s][A
Batch inference: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it][A
Processing logs:   1%|          | 2/375 [00:06<18:03,  2.90s/it]A
Batch inference:   0%|          | 0/2 [00:00<?, ?it/s][A
Batch inference:  50%|█████     | 1/2 [00:03<00:03,  3.58s/it][A
Batch inference: 100%|██████████| 2/2 [00:06<00:00,  3.27s/it][A
Processing logs:   1%|          | 3/375 [00:12<28:39,  4.62s/it]A
Batch inference:   0%|          | 0/1 [00:00<?, ?it/s][A
Batch inference: 100%|██████████| 1/1 [00:00<00:00,  5.25it/s][A
Processing logs:   1%|          | 4/375 [00:13<17:47,  2.88s/it]A
Batch inference:   0%|          | 0/1 [00:00<?, ?it/s][A
Batch inference: 100%|██████████| 1/1 [00:00<00:00,  2.85it