In [1]:
import pathlib, argparse, json, warnings, math
from typing import List, Dict, Tuple

import numpy as np
import torch
from transformers import (AutoTokenizer, AutoModel,
                          BertTokenizer, BertModel)

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.cluster import (KMeans, AgglomerativeClustering,
                             OPTICS, DBSCAN, Birch, SpectralClustering)
from sklearn.mixture import GaussianMixture
import hdbscan
import umap
import matplotlib.pyplot as plt

import re
from typing import List

# ── предварительно компилируем паттерн ────────────────────────────────────
SYSLOG_PREFIX = re.compile(
    r'^<'          # открывающий символ <
    r'\d{1,3}'     # PRI: 1–3 цифры
    r'>'
    r'[A-Z][a-z]{2}\s+'     # месяц (Jan, Feb, …)
    r'\d{1,2}\s+'           # число месяца (1–31)
    r'\d{2}:\d{2}:\d{2}\s+' # время HH:MM:SS
)

def read_lines(path: str) -> List[str]:
    """
    Читает файл построчно, удаляя syslog-префикс
    <PRI>Mon DD HH:MM:SS  и возвращая ненулевые строки.
    """
    cleaned: List[str] = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            # убираем возможный префикс и пробельный «хвост»
            ln = SYSLOG_PREFIX.sub('', ln).strip()
            if ln:                         # пропускаем пустые
                cleaned.append(ln)
    return cleaned

class TransformerEmbedder:
    """
    Универсальный sentence-transformers / HuggingFace-эмбеддер.
    По умолчанию — all-MiniLM-L6-v2 (384-мерные векторы).
    """
    def __init__(self,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 device: str | None = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = SentenceTransformer(model_name, device=self.device)

    def embed(self, texts: List[str]) -> np.ndarray:
        vector = self.model.encode(texts, normalize_embeddings=True,
                                   batch_size=64, show_progress_bar=False)
        return vector.astype(np.float32)


class TfidfEmbedder:
    """TF-IDF + ℓ2-нормализация."""
    def __init__(self, max_features: int = 10_000):
        self.vectorizer = TfidfVectorizer(max_features=max_features)

    def embed(self, texts: List[str]) -> np.ndarray:
        m = self.vectorizer.fit_transform(texts)
        x = m.astype(np.float32).toarray()
        x /= np.linalg.norm(x, axis=1, keepdims=True) + 1e-10
        return x


class LogBertEmbedder:
    """
    Эмбеддер для журналов/логов на основе модели
    teoogherghi/Log-Analysis-Model-DistilBert.
    """
    def __init__(self,
                 model_name: str = "teoogherghi/Log-Analysis-Model-DistilBert",
                 device: str | None = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval().to(self.device)

    def embed(self, texts: List[str]) -> np.ndarray:
        with torch.no_grad():
            toks = self.tokenizer(texts,
                                  padding=True, truncation=True,
                                  return_tensors="pt").to(self.device)
            outs = self.model(**toks).last_hidden_state[:, 0, :]
            vec = outs.cpu().numpy()
            vec /= np.linalg.norm(vec, axis=1, keepdims=True) + 1e-10
            return vec.astype(np.float32)


def run_kmeans(x: np.ndarray, k: int = 10, seed: int = 42) -> np.ndarray:
    km = KMeans(n_clusters=k, random_state=seed, n_init="auto")
    return km.fit_predict(x)

def run_hdbscan(x: np.ndarray,
                min_cluster_size: int = 10,
                min_samples: int | None = None) -> np.ndarray:
    return hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                           min_samples=min_samples or min_cluster_size//2,
                           metric="euclidean").fit_predict(x)

def run_gmm(x: np.ndarray, k: int = 10, seed: int = 42) -> np.ndarray:
    gmm = GaussianMixture(n_components=k, random_state=seed)
    gmm.fit(x)
    return gmm.predict(x)

ALGORITHMS = {
    "kmeans": run_kmeans,
    "hdbscan": run_hdbscan,
    "gmm":    run_gmm,                        # Gaussian Mixture
    "agglo":  lambda x, k=10: AgglomerativeClustering(
        n_clusters=k).fit_predict(x),
    "optics": lambda x, _: OPTICS(min_samples=10).fit_predict(x),
    "dbscan": lambda x, _: DBSCAN(eps=0.8, min_samples=10).fit_predict(x),
    "spectral": lambda x, k=10: SpectralClustering(
        n_clusters=k, assign_labels="discretize",
        affinity="nearest_neighbors").fit_predict(x),
    "birch": lambda x, k=10: Birch(n_clusters=k).fit_predict(x),
    # Для k-medoids понадобится пакет pyclustering:
    # "kmedoids": lambda x, k=10: kmedoids_wrapper(x, k),
}

def central_indices(x: np.ndarray, labels: np.ndarray,
                    top: int = 10) -> Dict[int, List[int]]:
    """Возвращает индексы `top` ближайших к центроиду точек в каждом кластере."""
    out: Dict[int, List[int]] = {}
    for lbl in np.unique(labels):
        if lbl == -1:    # метка «шум» в HDBSCAN/DBSCAN
            continue
        members = np.where(labels == lbl)[0]
        centroid = x[members].mean(axis=0, keepdims=True)
        dist = pairwise_distances(x[members], centroid)
        order = members[np.argsort(dist[:, 0])]
        out[lbl] = order[:top].tolist()
    return out

def plot_clusters(x: np.ndarray, labels: np.ndarray, title: str):
    reducer = umap.UMAP(n_components=2, random_state=0)
    xy = reducer.fit_transform(x)
    plt.figure(figsize=(8, 6))
    plt.scatter(xy[:, 0], xy[:, 1], c=labels, s=6, cmap="tab20", alpha=0.8)
    plt.title(title)
    plt.tight_layout()
    plt.show()

def pipeline(path: str,
             embedder_name: str = "transformer",
             algorithm: str = "hdbscan",
             k: int = 10):
    lines = read_lines(path)
    if not lines:
        raise ValueError("Файл пустой.")

    # -------- Embed --------
    if embedder_name == "transformer":
        embedder = TransformerEmbedder()
    elif embedder_name == "tfidf":
        embedder = TfidfEmbedder()
    elif embedder_name == "logbert":
        embedder = LogBertEmbedder()
    else:
        raise ValueError(f"Неизвестный embedder {embedder_name}")

    X = embedder.embed(lines)

    # -------- Clustering --------
    if algorithm not in ALGORITHMS:
        raise ValueError(f"Неизвестный алгоритм {algorithm}")
    labels = ALGORITHMS[algorithm](X, k)

    # -------- Central samples --------
    centers = central_indices(X, labels, top=10)
    clusters_txt: Dict[int, List[str]] = {
        c: [lines[i] for i in idxs] for c, idxs in centers.items()
    }

    # -------- Output & plot --------
    print(f"Найдено кластеров (без шума): {len(centers)}")
    for cid, samples in clusters_txt.items():
        print(f"\n=== Кластер {cid} ===")
        for s in samples:
            print("•", s[:180])

    plot_clusters(X, labels, f"{algorithm} / {embedder_name}")


In [2]:
from itertools import product
from pathlib import Path
import ipywidgets as w
from IPython.display import display, Markdown

In [3]:
# --- настройте путь к файлу и параметры по умолчанию ---
FILE_PATH = "/home/user/Projects/hackathon/ml_preset/download_logs/latest/error/qca-qt5-2.3.8-alt1"        # <-- измените при необходимости
N_CLUSTERS = 10                  # для алгоритмов, которым нужен k
MIN_CLUSTER_SIZE = 10            # для HDBSCAN / OPTICS / DBSCAN, etc.
TOP_N = 30                       # сколько «центральных» строк печатать

# --- словарь эмбеддеров ---
EMBEDDERS = {
    # "transformer": TransformerEmbedder(),
    # "tfidf":       TfidfEmbedder(),
    "logbert":     LogBertEmbedder(),
}

# --- словарь кластеризаторов с явным указанием kwargs ---
CLUSTERERS = {
    "kmeans":   (run_kmeans,   dict(k=N_CLUSTERS)),
    "hdbscan":  (run_hdbscan,  dict(min_cluster_size=MIN_CLUSTER_SIZE)),
    "gmm":      (run_gmm,      dict(k=N_CLUSTERS)),
    "agglo":    (ALGORITHMS["agglo"],  dict(k=N_CLUSTERS)),
    "spectral": (ALGORITHMS["spectral"],dict(k=N_CLUSTERS)),
    "optics":   (ALGORITHMS["optics"], dict(_=None)),  # параметр-заглушка
    "dbscan":   (ALGORITHMS["dbscan"], dict(_=None)),
    "birch":    (ALGORITHMS["birch"],  dict(k=N_CLUSTERS)),
}

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at teoogherghi/Log-Analysis-Model-DistilBert and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attentio

In [None]:
from itertools import product
from pathlib import Path            # Path нужен для чтения файла
from IPython.display import Markdown, display

# === читаем строки ===
lines = [ln.strip() for ln in Path(FILE_PATH).read_text(encoding="utf-8").splitlines() if ln.strip()]
if not lines:
    raise ValueError("Файл пуст или не найден")

# === главный цикл: все комбинации эмбеддер × алгоритм ===
for (emb_name, embedder), (algo_name, (func, kwargs)) in product(EMBEDDERS.items(),
                                                                 CLUSTERERS.items()):
    display(Markdown(f"## **{emb_name.upper()} + {algo_name.upper()}**"))
    
    # 1) эмбеддинги
    X = embedder.embed(lines)
    
    # 2) кластеризация
    labels = func(X, **kwargs)
    
    # 3) центральные строки
    centers = central_indices(X, labels, top=TOP_N)
    display(Markdown(f"Найдено кластеров: **{len(centers)}**"))
    for cid, idxs in centers.items():
        text_block = "\n".join(f"- {lines[i][:180]}" for i in idxs)
        display(Markdown(f"**Кластер {cid}**\n\n{text_block}"))
    
    # 4) визуализация
    plot_clusters(X, labels, f"{emb_name} / {algo_name}")

print("✅ Все комбинации завершены.")


## **LOGBERT + KMEANS**

OutOfMemoryError: CUDA out of memory. Tried to allocate 644.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 600.06 MiB is free. Including non-PyTorch memory, this process has 3.04 GiB memory in use. Of the allocated memory 2.73 GiB is allocated by PyTorch, and 254.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)