In [15]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Загрузка эмбеддингов
df = pd.read_csv('/content/drive/MyDrive/testing_models/distil-log.csv')

# Предположение о структуре данных:
# - Первая колонка содержит тексты логов
# - Остальные колонки содержат числовые эмбеддинги
texts = df.iloc[:, 0].values
embeddings = df.iloc[:, 1:].values

# Проверка формы данных
print(f"Найдено {len(texts)} записей")
print(f"Размерность эмбеддингов: {embeddings.shape[1]}")

# Создание модели кластеризации
cluster_model = AgglomerativeClustering(
    n_clusters=10,
    linkage='average',
    metric='euclidean'
)

# Выполнение кластеризации
clusters = cluster_model.fit_predict(embeddings)

# Создание DataFrame с результатами
result_df = pd.DataFrame({
    'cluster_id': clusters,
    'log_text': texts
})

# Сохранение результатов
output_path = '/content/drive/MyDrive/testing_models/output/clustered_logs.csv'
result_df.to_csv(output_path, index=False)

print(f"Результаты сохранены в: {output_path}")
print("Распределение кластеров:")
print(result_df['cluster_id'].value_counts())

Найдено 375 записей
Размерность эмбеддингов: 768
Результаты сохранены в: /content/drive/MyDrive/testing_models/output/clustered_logs.csv
Распределение кластеров:
cluster_id
0    350
2     10
1      4
9      3
3      2
4      2
5      1
6      1
7      1
8      1
Name: count, dtype: int64


In [16]:
import pandas as pd

# Загрузка и обработка данных
df1 = pd.read_csv('/content/drive/MyDrive/testing_models/a/classified_table.csv')  # Первый файл с ошибками и кластерами
df2 = pd.read_csv('/content/drive/MyDrive/testing_models/output/clustered_logs.csv')  # Второй файл с соответствием кластеров

# Преобразование log_text в числовой тип для корректного объединения
df2['log_text'] = pd.to_numeric(df2['log_text'], errors='coerce')

# Объединение данных по id и log_text
merged_df = df1.merge(
    df2.rename(columns={'log_text': 'id'}),  # Переименовываем log_text в id для совпадения ключей
    on='id',
    how='inner'
)

# Фильтрация нужных колонок и сохранение
merged_df[['cluster_id', 'errors']] \
    .rename(columns={'cluster_id': 'id_cluster'}) \
    .to_csv('/content/drive/MyDrive/testing_models/output/result.csv', index=False)

In [21]:
import csv
import json
from collections import defaultdict

clusters = defaultdict(list)

import csv
import sys
from collections import defaultdict

# Увеличиваем максимальный размер поля до максимума
csv.field_size_limit(sys.maxsize)

clusters = defaultdict(list)

with open('/content/drive/MyDrive/testing_models/output/result.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        cluster_id = row['id_cluster']
        error = row['errors'].strip()
        if error:
            clusters[cluster_id].append(error)

# Формируем список словарей для JSON
result = [
    {
        "id_cluster": cluster_id,
        "errors": errors
    }
    for cluster_id, errors in clusters.items()
]

# Запись в JSON с форматированием
with open('/content/drive/MyDrive/testing_models/output/group_result.json', 'w', encoding='utf-8') as outfile:
    json.dump(result, outfile, ensure_ascii=False, indent=4)

Имена для кластеров

In [1]:
import os, math, json, gc, torch, pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# === Настройки ===
CANDIDATE_MODELS = [
    "sshleifer/distilbart-cnn-12-6",
    "google/pegasus-xsum",
    "t5-base",
]

DEVICE = 0 if torch.cuda.is_available() else -1
print(f"Using device: {DEVICE}")

Using device: -1


In [1]:
import json
import random
import torch
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# ================= Конфигурация =================
CANDIDATE_MODELS = [
    "sshleifer/distilbart-cnn-12-6",
    "google/pegasus-xsum",
    "t5-base"
]

INPUT_JSON = '/content/drive/MyDrive/testing_models/output/group_result.json'
OUTPUT_JSON = '/content/drive/MyDrive/testing_models/output/summarized_results.json'
DEVICE = 0 if torch.cuda.is_available() else -1

# ================ Функции обработки ================
def extract_snippet(text, max_length=1024):
    """Извлечение фрагмента текста с приоритетом на начало после новой строки"""
    newline_positions = [i for i, c in enumerate(text) if c == '\n']

    # Выбираем позиции, где после \n достаточно символов
    valid_positions = [i for i in newline_positions if (i+1+max_length) <= len(text)]

    if valid_positions:
        start = random.choice(valid_positions) + 1
    elif newline_positions:
        start = random.choice(newline_positions) + 1
    else:
        start = 0

    return text[start:start+max_length]

def load_models():
    """Инициализация моделей суммаризации"""
    models = {}
    for model_name in CANDIDATE_MODELS:
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            summarizer = pipeline(
                "summarization",
                model=model,
                tokenizer=tokenizer,
                device=DEVICE
            )
            models[model_name] = summarizer
        except Exception as e:
            print(f"Error loading {model_name}: {str(e)}")
    return models

# ================= Основной поток =================
if __name__ == "__main__":
    # Загрузка данных
    with open(INPUT_JSON, 'r', encoding='utf-8') as f:
        clusters = json.load(f)

    # Извлечение фрагментов
    for cluster in clusters:
        combined_text = '\n'.join(cluster['errors'])
        cluster['snippet'] = extract_snippet(combined_text)

    # Инициализация моделей
    models = load_models()

    # Суммаризация
    for cluster in tqdm(clusters, desc="Processing clusters"):
        text = cluster['snippet']
        cluster['summaries'] = {}

        for model_name, summarizer in models.items():
            try:
                result = summarizer(
                    text,
                    max_length=150,
                    min_length=30,
                    truncation=True
                )
                cluster['summaries'][model_name] = result[0]['summary_text']
            except Exception as e:
                cluster['summaries'][model_name] = f"ERROR: {str(e)}"

    # Сохранение результатов
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(clusters, f, ensure_ascii=False, indent=4)

print("Обработка завершена! Результаты сохранены в:", OUTPUT_JSON)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


Processing clusters:   0%|          | 0/10 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Обработка завершена! Результаты сохранены в: /content/drive/MyDrive/testing_models/output/summarized_results.json


Имена кластеров в csv