In [1]:
#%pip install tiktoken

In [2]:
#%pip install sentencepiece

In [5]:
# %%
import os, json, math, random, pathlib, textwrap, itertools
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import collections
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import (KMeans, DBSCAN, Birch, AgglomerativeClustering,
                             OPTICS, SpectralClustering)

from umap import UMAP  # Исправление импорта для UMAP
import hdbscan
import matplotlib.pyplot as plt

import torch
from transformers import AutoModel, AutoTokenizer

# %%
DATA_PATH = "../logs_with_labels.csv"
df = pd.read_csv(DATA_PATH)
df['id'] = df.index
errors_texts = df['errors'].astype(str).tolist()
print(f"Всего ошибок: {len(errors_texts):,}")
df.head()

# %%
EMB_DIR = pathlib.Path("embeddings"); EMB_DIR.mkdir(exist_ok=True)

MODELS = {
    "tfidf": None,
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "bge-base-en-v1.5": "BAAI/bge-base-en-v1.5",
    "bge-large-en-v1.5": "BAAI/bge-large-en-v1.5",
    "distil-log": "teoogherghi/Log-Analysis-Model-DistilBert",
    "roberta-large": "roberta-large",
    "deberta-v3-large": "microsoft/deberta-v3-large"
}

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH = 32  # Уменьшенный размер батча для больших моделей

# %%
def encode_st(model_name: str, texts, batch_size=BATCH):
    encoder = SentenceTransformer(model_name, device=DEVICE)
    emb = encoder.encode(texts, batch_size=batch_size,
                         show_progress_bar=True,
                         convert_to_numpy=True, normalize_embeddings=True)
    return emb.astype(np.float32)

# %%
tfidf_vec = TfidfVectorizer(max_features=5000,
                            token_pattern=r'\b\w+\b', lowercase=True)
tfidf_mat = tfidf_vec.fit_transform(errors_texts)
tfidf_df = pd.DataFrame(tfidf_mat.toarray(), index=df.id)
tfidf_df.to_csv(EMB_DIR / "tfidf.csv", index_label="id")
print("TF-IDF shape:", tfidf_mat.shape)

# %%
# Установите необходимые зависимости если еще не установлены
# !pip install sentencepiece transformers>=4.40.0

from transformers import (
    AutoTokenizer,
    AutoModel,
    RobertaTokenizer,       # Для RoBERTa
    DebertaV2Tokenizer      # Для DeBERTa-v3
)


for key, hub_id in MODELS.items():
    if key == "tfidf":
        continue
    print(f"⏳ {key}")
    if key in ["distil-log", "bge-base-en-v1.5", "bge-large-en-v1.5", "roberta-large", "deberta-v3-large"]:
        # Явно выбираем токенизатор для проблемных моделей
        if key == "deberta-v3-large":
            tok = DebertaV2Tokenizer.from_pretrained(hub_id)
        elif key == "roberta-large":
            tok = RobertaTokenizer.from_pretrained(hub_id)
        else:
            tok = AutoTokenizer.from_pretrained(hub_id, use_fast=False)
        
        mdl = AutoModel.from_pretrained(hub_id).to(DEVICE).eval()
        
        # Для DeBERTa и RoBERTa добавляем специальные параметры
        tokenizer_kwargs = {
            'truncation': True,
            'max_length': 128,
            'padding': 'max_length' if key == "deberta-v3-large" else True,
            'return_tensors': "pt"
        }
        
        all_vecs = []
        for i in tqdm(range(0, len(errors_texts), BATCH)):
            batch = errors_texts[i:i+BATCH]
            inputs = tok(batch, **tokenizer_kwargs).to(DEVICE)
            with torch.no_grad():
                outs = mdl(**inputs).last_hidden_state.mean(1)
            all_vecs.append(outs.cpu())
        emb = torch.cat(all_vecs, 0).numpy()
        emb = normalize(emb)
    else:
        emb = encode_st(hub_id, errors_texts)
    pd.DataFrame(emb, index=df.id).to_csv(EMB_DIR / f"{key}.csv", index_label="id")

# %%
CLUSTER_DIR = pathlib.Path("clusters"); CLUSTER_DIR.mkdir(exist_ok=True)

ALGORITHMS = {
    "kmeans": {
        "func": lambda X, params: KMeans(**params).fit(X).labels_,
        "params": [
            {"n_clusters": 20, "n_init": "auto"},
            {"n_clusters": 30, "n_init": 10},
            {"n_clusters": 5, "n_init": 10}
        ]
    },
    "agglo": {
        "func": lambda X, params: AgglomerativeClustering(**params).fit_predict(X),
        "params": [
            {"n_clusters": 20, "linkage": "ward"},
            {"n_clusters": 15, "linkage": "complete"}
        ]
    },
    "dbscan": {
        "func": lambda X, params: DBSCAN(**params).fit_predict(X),
        "params": [
            {"eps": 0.3, "min_samples": 10, "metric": "euclidean"},
            {"eps": 0.5, "min_samples": 15, "metric": "cosine"}
        ]
    },
    "hdbscan": {
        "func": lambda X, params: hdbscan.HDBSCAN(**params).fit_predict(X),
        "params": [
            {"min_cluster_size": 5, "metric": "manhattan"},
            {"min_cluster_size": 10, "metric": "euclidean"}
        ]
    }
}

# %%
def cluster_and_report(name, X, algo_key, params):
    algo_func = ALGORITHMS[algo_key]["func"]
    try:
        labels = algo_func(X, params)
        n_clusters = len(set(labels))
        
        stats = {
            "n_clusters": n_clusters,
            "noise_points": np.sum(labels == -1),
            "silhouette": silhouette_score(X, labels) if n_clusters > 1 else np.nan,
            "davies_bouldin": davies_bouldin_score(X, labels) if n_clusters > 1 else np.nan
        }
        
        label_series = pd.Series(labels, index=df.id, name="cluster")
        label_series.to_csv(CLUSTER_DIR / f"{name}_{algo_key}_{param_str(params)}_labels.csv", index_label="id")
        
        return {**stats, "params": params}
    except Exception as e:
        print(f"⚠️ {algo_key} failed: {e}")
        return None

def param_str(params):
    return "_".join(f"{k}={v}" for k,v in params.items())

# %%
results = []

for emb_file in sorted(EMB_DIR.glob("*.csv")):
    name = emb_file.stem
    X = pd.read_csv(emb_file, index_col="id").values
    print(f"\n=== {name} ===")
    
    for algo_key in ALGORITHMS:
        for params in ALGORITHMS[algo_key]["params"]:
            result = cluster_and_report(name, X, algo_key, params)
            if result:
                results.append({
                    "embedding": name,
                    "algorithm": algo_key,
                    **result
                })
                print(f"{algo_key:8s} [params: {params}] → clusters={result['n_clusters']}")

# %%
CENTRAL_DIR = pathlib.Path("cluster_objects"); CENTRAL_DIR.mkdir(exist_ok=True)

def central_extremes(X, labels, top_k=10):
    out = defaultdict(lambda: {"central": [], "extreme": []})
    for c in set(labels):
        idx = np.where(labels == c)[0]
        part = X[idx]
        center = part.mean(0, keepdims=True)
        dists = np.linalg.norm(part - center, axis=1)
        order = np.argsort(dists)
        out[c]["central"] = idx[order[:top_k]].tolist()
        out[c]["extreme"] = idx[order[::-1][:top_k]].tolist()
    return out

# %%
for lab_file in sorted(CLUSTER_DIR.glob("*_labels.csv")):
    parts = lab_file.stem.split("_")
    name, algo = parts[0], "_".join(parts[1:-1])
    labels = pd.read_csv(lab_file, index_col="id").squeeze().values
    X = pd.read_csv(EMB_DIR / f"{name}.csv", index_col="id").values
    ce = central_extremes(X, labels)

    md_path = CENTRAL_DIR / f"{lab_file.stem}.md"
    with open(md_path, "w", encoding="utf-8") as fp:
        fp.write(f"# {name} + {algo}\n\n")
        fp.write(f"## Cluster Statistics\n")
        fp.write(f"- Total clusters: {len(ce)}\n")
        fp.write(f"- Noise points: {np.sum(labels == -1)}\n\n")
        
        for c, data in ce.items():
            fp.write(f"## Cluster {c} (Size: {len(data['central']) + len(data['extreme'])})\n\n")
            fp.write("### Central objects\n")
            for ix in data["central"]:
                row = df.loc[ix]
                fp.write(f"* **id={ix}** — `{row.errors}`\n")  # Полный текст
            fp.write("\n### Extreme objects\n")
            for ix in data["extreme"]:
                row = df.loc[ix]
                fp.write(f"* **id={ix}** — `{row.errors}`\n")  # Полный текст
            fp.write("\n---\n")
    print("✓", md_path)

# %%
VIS_DIR = pathlib.Path("viz"); VIS_DIR.mkdir(exist_ok=True)

def umap_plot(name, algo, X, labels):
    reducer = umap.UMAP(n_components=2, random_state=42, 
                       n_neighbors=15, min_dist=0.1, metric='cosine')
    emb2d = reducer.fit_transform(X)
    plt.figure(figsize=(10,8))
    scatter = plt.scatter(emb2d[:,0], emb2d[:,1], s=10, c=labels, 
                         cmap='tab20', alpha=0.8)
    plt.title(f"{name} + {algo}\nClusters: {len(set(labels))}")
    plt.colorbar(scatter)
    path = VIS_DIR / f"{name}_{algo}.png"
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()
    return path

# %%
res_df = pd.DataFrame(results)
res_df = res_df.explode("params").reset_index(drop=True)
res_df.to_markdown("full_report.md", index=False)
res_df.to_csv("full_report.csv", index=False)
res_df.head(10)

Всего ошибок: 375
TF-IDF shape: (375, 5000)
⏳ all-MiniLM-L6-v2


Batches: 100%|██████████| 12/12 [00:02<00:00,  5.47it/s]


⏳ all-mpnet-base-v2


Batches: 100%|██████████| 12/12 [00:04<00:00,  2.78it/s]


⏳ bge-base-en-v1.5


100%|██████████| 12/12 [00:25<00:00,  2.10s/it]


⏳ bge-large-en-v1.5


100%|██████████| 12/12 [00:26<00:00,  2.23s/it]


⏳ distil-log


100%|██████████| 12/12 [00:24<00:00,  2.08s/it]


⏳ roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 12/12 [00:18<00:00,  1.51s/it]


⏳ deberta-v3-large


100%|██████████| 12/12 [00:09<00:00,  1.29it/s]



=== all-MiniLM-L6-v2 ===
kmeans   [params: {'n_clusters': 20, 'n_init': 'auto'}] → clusters=20
kmeans   [params: {'n_clusters': 30, 'n_init': 10}] → clusters=30
kmeans   [params: {'n_clusters': 5, 'n_init': 10}] → clusters=5
agglo    [params: {'n_clusters': 20, 'linkage': 'ward'}] → clusters=20
agglo    [params: {'n_clusters': 15, 'linkage': 'complete'}] → clusters=15
dbscan   [params: {'eps': 0.3, 'min_samples': 10, 'metric': 'euclidean'}] → clusters=1
dbscan   [params: {'eps': 0.5, 'min_samples': 15, 'metric': 'cosine'}] → clusters=2
hdbscan  [params: {'min_cluster_size': 5, 'metric': 'manhattan'}] → clusters=4
hdbscan  [params: {'min_cluster_size': 10, 'metric': 'euclidean'}] → clusters=3

=== all-mpnet-base-v2 ===
kmeans   [params: {'n_clusters': 20, 'n_init': 'auto'}] → clusters=20
kmeans   [params: {'n_clusters': 30, 'n_init': 10}] → clusters=30
kmeans   [params: {'n_clusters': 5, 'n_init': 10}] → clusters=5
agglo    [params: {'n_clusters': 20, 'linkage': 'ward'}] → clusters=20


Unnamed: 0,embedding,algorithm,n_clusters,noise_points,silhouette,davies_bouldin,params
0,all-MiniLM-L6-v2,kmeans,20,0,0.137729,2.082332,n_clusters
1,all-MiniLM-L6-v2,kmeans,20,0,0.137729,2.082332,n_init
2,all-MiniLM-L6-v2,kmeans,30,0,0.127542,2.079223,n_clusters
3,all-MiniLM-L6-v2,kmeans,30,0,0.127542,2.079223,n_init
4,all-MiniLM-L6-v2,kmeans,5,0,0.146215,2.098053,n_clusters
5,all-MiniLM-L6-v2,kmeans,5,0,0.146215,2.098053,n_init
6,all-MiniLM-L6-v2,agglo,20,0,0.149139,2.194724,n_clusters
7,all-MiniLM-L6-v2,agglo,20,0,0.149139,2.194724,linkage
8,all-MiniLM-L6-v2,agglo,15,0,0.128809,1.97463,n_clusters
9,all-MiniLM-L6-v2,agglo,15,0,0.128809,1.97463,linkage
