In [2]:
import re
import json
import random
from typing import List, Tuple, Dict, Any
from pathlib import Path
from pydantic import BaseModel
import httpx
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

class DeepSeekMessage(BaseModel):
    role: str
    content: str

class DeepSeekService:
    def __init__(self):
        self.api_url = "https://api.deepseek.com/v1/chat/completions"
        self.headers = {
            "Authorization": "Bearer sk-caf1d8be739f4e2ea2da177bd37e5c01",
            "Content-Type": "application/json"
        }
        self.timeout = httpx.Timeout(60.0, connect=10.0)

CONTEXT_WINDOW = 2
ERROR_KEYWORDS = re.compile(
    r"\b(error|failed|exception|warning|critical|timeout|denied|refused|crash|panic)\b", 
    re.IGNORECASE
)

def prepare_log_context(log_lines: List[str]) -> str:
    error_indices = []
    for i, line in enumerate(log_lines):
        if ERROR_KEYWORDS.search(line):
            error_indices.append(i)

    context_lines = set()
    
    # –î–æ–±–∞–≤–ª—è–µ–º –∫–æ–Ω—Ç–µ–∫—Å—Ç –≤–æ–∫—Ä—É–≥ –æ—à–∏–±–æ–∫
    for idx in error_indices:
        start = max(0, idx - CONTEXT_WINDOW)
        end = min(len(log_lines), idx + CONTEXT_WINDOW + 1)
        context_lines.update(range(start, end))
    
    # –ï—Å–ª–∏ –Ω–µ—Ç –æ—à–∏–±–æ–∫ - –≤—ã–±–∏—Ä–∞–µ–º —Å–ª—É—á–∞–π–Ω—ã–µ —Å—Ç—Ä–æ–∫–∏
    if not context_lines and len(log_lines) > 0:
        sample_size = min(3, len(log_lines))
        context_lines = set(random.sample(range(len(log_lines)), sample_size))
    
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º –∏ —Ñ–æ—Ä–º–∏—Ä—É–µ–º –∫–æ–Ω—Ç–µ–∫—Å—Ç
    sorted_lines = sorted(context_lines)
    selected_lines = [log_lines[i] for i in sorted_lines if i < len(log_lines)]
    
    # –î–æ–±–∞–≤–ª—è–µ–º –Ω–æ–º–µ—Ä–∞ —Å—Ç—Ä–æ–∫
    return "\n".join(
        f"[Line {i+1}] {line}" 
        for i, line in zip(sorted_lines, selected_lines)
    )

class ErrorAnalysisService(DeepSeekService):
    async def analyze_log_entry(self, log_context: str) -> Dict[str, str]:
        prompt = (
            "–ê–Ω–∞–ª–∏–∑–∏—Ä—É–π —Å–ª–µ–¥—É—é—â–∏–µ —Å—Ç—Ä–æ–∫–∏ –ª–æ–≥–æ–≤. –û–ø—Ä–µ–¥–µ–ª–∏:\n"
            "1. –û—Å–Ω–æ–≤–Ω—ã–µ –æ—à–∏–±–∫–∏ –∏ –∏—Ö –ø—Ä–∏—á–∏–Ω—ã\n"
            "2. –í–æ–∑–º–æ–∂–Ω—ã–µ —Ä–µ—à–µ–Ω–∏—è –ø—Ä–æ–±–ª–µ–º\n"
            "–û—Ç–≤–µ—Ç—å —Å—Ç—Ä–æ–≥–æ –≤ JSON —Ñ–æ—Ä–º–∞—Ç–µ –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ:\n"
            '{"cause": "<–ø—Ä–∏—á–∏–Ω–∞>", "solutions": "<—Ä–µ—à–µ–Ω–∏—è>", "errors_found": <true/false>}'
        )
        
        messages = [
            DeepSeekMessage(
                role="system",
                content="–¢—ã —ç–∫—Å–ø–µ—Ä—Ç –ø–æ –∞–Ω–∞–ª–∏–∑—É –ª–æ–≥–æ–≤. –ê–Ω–∞–ª–∏–∑–∏—Ä—É–π –≤—Å–µ –≤–æ–∑–º–æ–∂–Ω—ã–µ –æ—à–∏–±–∫–∏."
            ),
            DeepSeekMessage(role="user", content=f"{prompt}\n\n{log_context}")
        ]
        
        payload = {
            "model": "deepseek-chat",
            "messages": [m.model_dump() for m in messages],
            "temperature": 0.3,
            "response_format": {"type": "json_object"}
        }
        
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(self.api_url, headers=self.headers, json=payload)
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]

async def _process_dataframe(df: pd.DataFrame, svc: ErrorAnalysisService, test_samples: int) -> pd.DataFrame:
    # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∑–∞–¥–∞—á
    tasks = []
    for _, row in df.iterrows():
        log_lines = [line.strip() for line in str(row["errors"]).split("\n") if line.strip()]
        context = prepare_log_context(log_lines)
        tasks.append({
            "id": row["id"],
            "context": context,
            "original_errors": "\n".join(log_lines)
        })

    # –¢–µ—Å—Ç–æ–≤—ã–π –ø—Ä–æ–≥–æ–Ω
    print(f"\n=== üîç –¢–µ—Å—Ç–æ–≤—ã–π –∑–∞–ø—É—Å–∫: {test_samples} –∑–∞–ø–∏—Å–µ–π ===")
    for task in tasks[:test_samples]:
        try:
            res = await svc.analyze_log_entry(task["context"])
            print(
                f"[ID {task['id']}]\n"
                f"–ö–æ–Ω—Ç–µ–∫—Å—Ç:\n{task['context']}\n"
                f"–†–µ–∑—É–ª—å—Ç–∞—Ç: {res}\n{'-'*50}"
            )
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∞–Ω–∞–ª–∏–∑–∞ ID {task['id']}: {str(e)}")

    input("–ù–∞–∂–º–∏—Ç–µ Enter –¥–ª—è –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏—è...")

    # –ü–æ–ª–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    results = []
    for task in tqdm_asyncio(tasks, desc="–ê–Ω–∞–ª–∏–∑ –ª–æ–≥–æ–≤"):
        try:
            analysis = json.loads(await svc.analyze_log_entry(task["context"]))
            results.append({
                "id": task["id"],
                "original_errors": task["original_errors"],
                "analyzed_context": task["context"],
                "cause": analysis.get("cause", ""),
                "solutions": analysis.get("solutions", ""),
                "errors_found": analysis.get("errors_found", False)
            })
        except Exception as e:
            results.append({
                "id": task["id"],
                "original_errors": task["original_errors"],
                "analyzed_context": task["context"],
                "cause": "–û—à–∏–±–∫–∞ –∞–Ω–∞–ª–∏–∑–∞",
                "solutions": str(e),
                "errors_found": False
            })
    
    return pd.DataFrame(results)

async def analyze_errors(
    input_path: str | Path,
    output_path: str | Path | None = None,
    test_samples: int = 1,
) -> pd.DataFrame:
    df_in = pd.read_csv(input_path)
    if {"id", "errors"} - set(df_in.columns):
        raise ValueError("CSV –¥–æ–ª–∂–µ–Ω —Å–æ–¥–µ—Ä–∂–∞—Ç—å –∫–æ–ª–æ–Ω–∫–∏: id, errors")
    
    service = ErrorAnalysisService()
    df_out = await _process_dataframe(df_in, service, test_samples)
    
    if output_path:
        df_out.to_csv(output_path, index=False)
        print(f"\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {output_path} (–∑–∞–ø–∏—Å–µ–π: {len(df_out)})")
    
    return df_out

# –ó–∞–ø—É—Å–∫ –∞–Ω–∞–ª–∏–∑–∞
await analyze_errors("classified_table.csv", "classified_table_with_answers.csv")


=== üîç –¢–µ—Å—Ç–æ–≤—ã–π –∑–∞–ø—É—Å–∫: 1 –∑–∞–ø–∏—Å–µ–π ===
[ID 0]
–ö–æ–Ω—Ç–µ–∫—Å—Ç:
[Line 9] + umask 022
[Line 10] Problem opening /proc/meminfo
[Line 11] CMake Error at CMakeLists.txt:26 (cmake_policy):
[Line 12] Policy CMP0051 may not be set to OLD behavior because this version of CMake
[Line 13] behavior or use an older version of CMake that still supports the old
[Line 256] -- Performing Test C_SUPPORTS_DEPRECATED_COPY
[Line 257] Problem opening /proc/meminfo
[Line 258] -- Performing Test C_SUPPORTS_DEPRECATED_COPY - Failed
[Line 259] Problem opening /proc/meminfo
[Line 260] Problem opening /proc/meminfo
[Line 261] Problem opening /proc/meminfo
[Line 262] -- Performing Test C_WCOMMENT_ALLOWS_LINE_WRAP - Failed
[Line 263] Problem opening /proc/meminfo
[Line 264] Problem opening /proc/meminfo
[Line 275] -- Could NOT find LibXml2 (missing: LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR)
[Line 276] Problem opening /proc/meminfo
[Line 277] -- Performing Test CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FL

–ê–Ω–∞–ª–∏–∑ –ª–æ–≥–æ–≤: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 375/375 [2:05:29<00:00, 20.08s/it]  


‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: classified_table_with_answers.csv (–∑–∞–ø–∏—Å–µ–π: 375)





Unnamed: 0,id,original_errors,analyzed_context,cause,solutions,errors_found
0,0,Building for target x86_64\nBuilding for targe...,[Line 9] + umask 022\n[Line 10] Problem openin...,,,False
1,1,Package bash-completion was not found in the p...,[Line 26] + export LDFLAGS\n[Line 27] Problem ...,,,False
2,2,[ DONE ]\nBuilding for target x86_64\nBuilding...,[Line 145] checking for lf95... no\n[Line 146]...,,,False
3,3,Building for target x86_64\nBuilding for targe...,[Line 13] + export LDFLAGS\n[Line 14] Problem ...,,,False
4,4,Building for target x86_64\nBuilding for targe...,[Line 11] -- Detecting CXX compiler ABI info -...,–ù–µ—Å–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç—å –≤–µ—Ä—Å–∏–∏ CMake. –ü—Ä–æ–µ–∫—Ç —Ç—Ä–µ–±—É–µ—Ç –≤...,–û–±–Ω–æ–≤–∏—Ç–µ CMake –¥–æ –≤–µ—Ä—Å–∏–∏ 3.5 –∏–ª–∏ –≤—ã—à–µ. –≠—Ç–æ –º–æ–∂...,True
...,...,...,...,...,...,...
370,370,COMPILATION ERROR :\n/usr/src/RPM/BUILD/xmluni...,[Line 1] COMPILATION ERROR :\n[Line 2] /usr/sr...,,,False
371,371,<86>May 16 02:22:27 groupadd[1375817]: group a...,[Line 40] to the PKG_CONFIG_PATH environment v...,,,False
372,372,<86>May 16 01:32:17 userdel[1124655]: removed ...,[Line 21] Policy CMP0153 is not set: The exec_...,,,False
373,373,[ DONE ]\nBuilding for target x86_64\nBuilding...,[Line 10] + ac_cv_lib_intl_gettext=no\n[Line 1...,,,False


In [3]:
r = pd.read_csv("classified_table_with_answers.csv")

In [4]:
r.isna().sum()

id                    0
original_errors       0
analyzed_context      0
cause               293
solutions           293
errors_found          0
dtype: int64