In [2]:
import re
import json
import random
from typing import List, Tuple, Dict, Any
from pathlib import Path
from pydantic import BaseModel
import httpx
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

class DeepSeekMessage(BaseModel):
    """A single chat message: the speaker's ``role`` and the message ``content``.

    Instances are serialised with ``model_dump()`` into the ``messages`` list
    of the chat-completions request payload.
    """
    role: str
    content: str

class DeepSeekService:
    def __init__(self):
        self.api_url = "https://api.deepseek.com/v1/chat/completions"
        self.headers = {
            "Authorization": "Bearer sk-caf1d8be739f4e2ea2da177bd37e5c01",
            "Content-Type": "application/json"
        }
        self.timeout = httpx.Timeout(60.0, connect=10.0)

CONTEXT_WINDOW = 2
ERROR_KEYWORDS = re.compile(
    r"\b(error|failed|exception|warning|critical|timeout|denied|refused|crash|panic)\b", 
    re.IGNORECASE
)

def prepare_log_context(log_lines: List[str]) -> str:
    error_indices = []
    for i, line in enumerate(log_lines):
        if ERROR_KEYWORDS.search(line):
            error_indices.append(i)

    context_lines = set()
    
    # Добавляем контекст вокруг ошибок
    for idx in error_indices:
        start = max(0, idx - CONTEXT_WINDOW)
        end = min(len(log_lines), idx + CONTEXT_WINDOW + 1)
        context_lines.update(range(start, end))
    
    # Если нет ошибок - выбираем случайные строки
    if not context_lines and len(log_lines) > 0:
        sample_size = min(3, len(log_lines))
        context_lines = set(random.sample(range(len(log_lines)), sample_size))
    
    # Сортируем и формируем контекст
    sorted_lines = sorted(context_lines)
    selected_lines = [log_lines[i] for i in sorted_lines if i < len(log_lines)]
    
    # Добавляем номера строк
    return "\n".join(
        f"[Line {i+1}] {line}" 
        for i, line in zip(sorted_lines, selected_lines)
    )

class ErrorAnalysisService(DeepSeekService):
    """Asks the DeepSeek chat model to explain the errors in a log excerpt."""

    async def analyze_log_entry(self, log_context: str) -> str:
        """Send one log excerpt to the model and return its raw reply.

        Args:
            log_context: The log lines to analyse, already formatted as text.

        Returns:
            The assistant message content exactly as received. JSON mode is
            requested, so this is a JSON *string* (not a parsed dict); callers
            are expected to ``json.loads`` it themselves.

        Raises:
            httpx.HTTPStatusError: If the API answers with a non-2xx status.
            ValueError: If the response body does not contain
                ``choices[0].message.content``.
        """
        prompt = (
            "Анализируй следующие строки логов. Определи:\n"
            "1. Основные ошибки и их причины\n"
            "2. Возможные решения проблем\n"
            "Ответь строго в JSON формате на русском языке:\n"
            '{"cause": "<причина>", "solutions": "<решения>", "errors_found": <true/false>}'
        )
        
        messages = [
            DeepSeekMessage(
                role="system",
                content="Ты эксперт по анализу логов. Анализируй все возможные ошибки."
            ),
            DeepSeekMessage(role="user", content=f"{prompt}\n\n{log_context}")
        ]
        
        payload = {
            "model": "deepseek-chat",
            "messages": [m.model_dump() for m in messages],
            "temperature": 0.3,
            "response_format": {"type": "json_object"}
        }
        
        # A fresh client per call; it is closed when the block exits.
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(self.api_url, headers=self.headers, json=payload)
            response.raise_for_status()
            body = response.json()

        try:
            return body["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            # A bare KeyError('choices') is useless in the results table;
            # surface the actual body instead.
            raise ValueError(f"Unexpected DeepSeek response shape: {body!r}") from exc

def _parse_analysis_response(raw: str) -> Dict[str, Any]:
    """Decode the model's JSON reply and check that it follows the requested schema.

    Raises:
        ValueError: If the reply is not valid JSON, is not a JSON object, or
            contains none of the keys ``cause`` / ``solutions`` / ``errors_found``.
    """
    parsed = json.loads(raw)
    expected_keys = {"cause", "solutions", "errors_found"}
    if not isinstance(parsed, dict) or not (expected_keys & parsed.keys()):
        # Without this check an off-schema reply would be stored as blank
        # cause/solutions and the model's actual answer would be lost.
        raise ValueError(f"Неожиданный формат ответа: {raw}")
    return parsed


async def _process_dataframe(df: pd.DataFrame, svc: ErrorAnalysisService, test_samples: int) -> pd.DataFrame:
    """Analyse every row of ``df`` with ``svc`` and return one result row per input row.

    Steps:
      1. Build a task per row: split ``errors`` into non-empty stripped lines
         and extract the relevant context with ``prepare_log_context``.
      2. Preview: analyse the first ``test_samples`` tasks, print the raw
         replies and wait for Enter. Skipped entirely when ``test_samples``
         is zero or negative. Previewed rows are analysed again in step 3.
      3. Full pass over all tasks, sequentially, with a progress bar.

    Args:
        df: Input frame with ``id`` and ``errors`` columns.
        svc: Service whose ``analyze_log_entry`` returns the model's JSON string.
        test_samples: Number of leading rows to preview before the full run.

    Returns:
        A frame with columns ``id``, ``original_errors``, ``analyzed_context``,
        ``cause``, ``solutions``, ``errors_found``. Rows whose analysis failed
        (request error, invalid JSON, off-schema reply) have
        ``cause == "Ошибка анализа"`` and the error text in ``solutions``.
        Rows with an empty log are not sent to the API and get blank fields.
    """
    # Build the tasks.
    tasks = []
    for _, row in df.iterrows():
        raw_errors = row["errors"]
        # A missing cell must be treated as an empty log, not as the text "nan".
        is_missing = pd.api.types.is_scalar(raw_errors) and pd.isna(raw_errors)
        text = "" if is_missing else str(raw_errors)
        log_lines = [line.strip() for line in text.split("\n") if line.strip()]
        tasks.append({
            "id": row["id"],
            "context": prepare_log_context(log_lines),
            "original_errors": "\n".join(log_lines)
        })

    # Preview run. Clamp so a negative count cannot slice from the end.
    preview_count = max(0, test_samples)
    if preview_count:
        print(f"\n=== 🔍 Тестовый запуск: {preview_count} записей ===")
        for task in tasks[:preview_count]:
            if not task["context"]:
                print(f"[ID {task['id']}] Пустой лог — пропущено")
                continue
            try:
                res = await svc.analyze_log_entry(task["context"])
                print(
                    f"[ID {task['id']}]\n"
                    f"Контекст:\n{task['context']}\n"
                    f"Результат: {res}\n{'-'*50}"
                )
            except Exception as e:
                print(f"Ошибка анализа ID {task['id']}: {str(e)}")

        input("Нажмите Enter для продолжения...")

    # Full run.
    columns = ["id", "original_errors", "analyzed_context", "cause", "solutions", "errors_found"]
    results = []
    for task in tqdm_asyncio(tasks, desc="Анализ логов"):
        record = {
            "id": task["id"],
            "original_errors": task["original_errors"],
            "analyzed_context": task["context"],
        }
        if not task["context"]:
            # Nothing to analyse: skip the API call.
            record.update(cause="", solutions="", errors_found=False)
        else:
            try:
                analysis = _parse_analysis_response(await svc.analyze_log_entry(task["context"]))
                record.update(
                    cause=analysis.get("cause", ""),
                    solutions=analysis.get("solutions", ""),
                    errors_found=analysis.get("errors_found", False),
                )
            except Exception as e:
                # Best effort: one failed row must not abort the whole batch.
                record.update(cause="Ошибка анализа", solutions=str(e), errors_found=False)
        results.append(record)

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(results, columns=columns)

async def analyze_errors(
    input_path: str | Path,
    output_path: str | Path | None = None,
    test_samples: int = 1,
) -> pd.DataFrame:
    df_in = pd.read_csv(input_path)
    if {"id", "errors"} - set(df_in.columns):
        raise ValueError("CSV должен содержать колонки: id, errors")
    
    service = ErrorAnalysisService()
    df_out = await _process_dataframe(df_in, service, test_samples)
    
    if output_path:
        df_out.to_csv(output_path, index=False)
        print(f"\n✅ Результаты сохранены в: {output_path} (записей: {len(df_out)})")
    
    return df_out

# Run the analysis: read classified_table.csv and write the results to
# classified_table_with_answers.csv. Top-level `await` relies on the
# notebook (IPython) kernel; a plain script would need asyncio.run(...).
await analyze_errors("classified_table.csv", "classified_table_with_answers.csv")


=== 🔍 Тестовый запуск: 1 записей ===
[ID 0]
Контекст:
[Line 9] + umask 022
[Line 10] Problem opening /proc/meminfo
[Line 11] CMake Error at CMakeLists.txt:26 (cmake_policy):
[Line 12] Policy CMP0051 may not be set to OLD behavior because this version of CMake
[Line 13] behavior or use an older version of CMake that still supports the old
[Line 256] -- Performing Test C_SUPPORTS_DEPRECATED_COPY
[Line 257] Problem opening /proc/meminfo
[Line 258] -- Performing Test C_SUPPORTS_DEPRECATED_COPY - Failed
[Line 259] Problem opening /proc/meminfo
[Line 260] Problem opening /proc/meminfo
[Line 261] Problem opening /proc/meminfo
[Line 262] -- Performing Test C_WCOMMENT_ALLOWS_LINE_WRAP - Failed
[Line 263] Problem opening /proc/meminfo
[Line 264] Problem opening /proc/meminfo
[Line 275] -- Could NOT find LibXml2 (missing: LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR)
[Line 276] Problem opening /proc/meminfo
[Line 277] -- Performing Test CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG - Failed
[Line 278] -- Config

Анализ логов: 100%|██████████| 375/375 [2:05:29<00:00, 20.08s/it]  


✅ Результаты сохранены в: classified_table_with_answers.csv (записей: 375)





Unnamed: 0,id,original_errors,analyzed_context,cause,solutions,errors_found
0,0,Building for target x86_64\nBuilding for targe...,[Line 9] + umask 022\n[Line 10] Problem openin...,,,False
1,1,Package bash-completion was not found in the p...,[Line 26] + export LDFLAGS\n[Line 27] Problem ...,,,False
2,2,[ DONE ]\nBuilding for target x86_64\nBuilding...,[Line 145] checking for lf95... no\n[Line 146]...,,,False
3,3,Building for target x86_64\nBuilding for targe...,[Line 13] + export LDFLAGS\n[Line 14] Problem ...,,,False
4,4,Building for target x86_64\nBuilding for targe...,[Line 11] -- Detecting CXX compiler ABI info -...,Несовместимость версии CMake. Проект требует в...,Обновите CMake до версии 3.5 или выше. Это мож...,True
...,...,...,...,...,...,...
370,370,COMPILATION ERROR :\n/usr/src/RPM/BUILD/xmluni...,[Line 1] COMPILATION ERROR :\n[Line 2] /usr/sr...,,,False
371,371,<86>May 16 02:22:27 groupadd[1375817]: group a...,[Line 40] to the PKG_CONFIG_PATH environment v...,,,False
372,372,<86>May 16 01:32:17 userdel[1124655]: removed ...,[Line 21] Policy CMP0153 is not set: The exec_...,,,False
373,373,[ DONE ]\nBuilding for target x86_64\nBuilding...,[Line 10] + ac_cv_lib_intl_gettext=no\n[Line 1...,,,False


In [3]:
# Reload the saved results from disk to inspect what was actually written.
r = pd.read_csv("classified_table_with_answers.csv")

In [4]:
# Count missing values per column; blank strings in the CSV are read back as NaN.
r.isna().sum()

id                    0
original_errors       0
analyzed_context      0
cause               293
solutions           293
errors_found          0
dtype: int64