In [3]:
# Add this around line 165 (before creating the model)
print("üîç Checking available models...")
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(f"  - {m.name}")

üîç Checking available models...
  - models/gemini-2.5-pro-preview-03-25
  - models/gemini-2.5-flash
  - models/gemini-2.5-pro-preview-05-06
  - models/gemini-2.5-pro-preview-06-05
  - models/gemini-2.5-pro
  - models/gemini-2.0-flash-exp
  - models/gemini-2.0-flash
  - models/gemini-2.0-flash-001
  - models/gemini-2.0-flash-exp-image-generation
  - models/gemini-2.0-flash-lite-001
  - models/gemini-2.0-flash-lite
  - models/gemini-2.0-flash-lite-preview-02-05
  - models/gemini-2.0-flash-lite-preview
  - models/gemini-2.0-pro-exp
  - models/gemini-2.0-pro-exp-02-05
  - models/gemini-exp-1206
  - models/gemini-2.0-flash-thinking-exp-01-21
  - models/gemini-2.0-flash-thinking-exp
  - models/gemini-2.0-flash-thinking-exp-1219
  - models/gemini-2.5-flash-preview-tts
  - models/gemini-2.5-pro-preview-tts
  - models/learnlm-2.0-flash-experimental
  - models/gemma-3-1b-it
  - models/gemma-3-4b-it
  - models/gemma-3-12b-it
  - models/gemma-3-27b-it
  - models/gemma-3n-e4b-it
  - models/gemma-

In [5]:
import json
import os
import chromadb
import google.generativeai as genai
from statistics import median
from typing import List, Dict

# ==========================================
# 0. CONFIGURATION
# ==========================================
# Export this in your terminal: export GOOGLE_API_KEY="your_key"
API_KEY = os.getenv("GOOGLE_API_KEY") 
DB_PATH = "./spark_rag_db" # Where vectors will be saved on disk

if not API_KEY:
    print("‚ö†Ô∏è  WARNING: GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key="AIzaSyDM2QBkfDldek1Ey52-8ANg4T_1NqLATpo") 
else:
    genai.configure(api_key=API_KEY)

# ==========================================
# 1. PARSING LOGIC (Your Code)
# ==========================================

class SparkLogAnalysis:
    """A container object for all parsed summaries from a Spark event log."""
    def __init__(self, log_file_path):
        self.application_id = "unknown_app"
        self.log_file = os.path.basename(log_file_path)
        self.job_summaries = []
        self.stage_summaries = []
        self.application_summaries = {}

    def set_application_id(self, app_id):
        if app_id:
            self.application_id = app_id

    def add_summary(self, summary_dict):
        if not summary_dict: return
        event_type = summary_dict.get("event_type")
        summary_dict["application_id"] = self.application_id

        if event_type == "JobPerformanceSummary":
            self.job_summaries.append(summary_dict)
        elif event_type == "StagePerformanceSummary":
            self.stage_summaries.append(summary_dict)
        else:
            self.application_summaries[event_type] = summary_dict

    def to_dict(self):
        return {
            "application_id": self.application_id,
            "log_file": self.log_file,
            "job_summaries": self.job_summaries,
            "stage_summaries": self.stage_summaries,
            "application_summaries": self.application_summaries
        }

def analyze_spark_log(log_file_path):
    """Parses a single Spark event log."""
    analysis_object = SparkLogAnalysis(log_file_path)
    stage_data = {}
    job_data = {}
    executor_lifecycle = {}
    disk_related_executor_failures = {}
    blacklisted_executors = []

    try:
        with open(log_file_path, 'r') as f:
            for line in f:
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    continue

                event_type = event.get("Event")
                
                if event_type == "SparkListenerApplicationStart":
                    analysis_object.set_application_id(event.get("App ID"))
                
                elif event_type == "SparkListenerExecutorAdded":
                    executor_id, timestamp = event.get("Executor ID"), event.get("Timestamp")
                    executor_lifecycle.setdefault(executor_id, {})['add_time'] = timestamp

                elif event_type == "SparkListenerExecutorRemoved":
                    executor_id, timestamp = event.get("Executor ID"), event.get("Timestamp")
                    reason = event.get("Removed Reason", "")
                    executor_lifecycle.setdefault(executor_id, {})['remove_time'] = timestamp
                    if "no space left on device" in reason.lower() or "disk" in reason.lower():
                        disk_related_executor_failures.setdefault(reason, {"count": 0, "samples": []})
                        disk_related_executor_failures[reason]["count"] += 1
                        if len(disk_related_executor_failures[reason]["samples"]) < 3:
                            disk_related_executor_failures[reason]["samples"].append({"executor_id": executor_id, "timestamp": timestamp})

                elif event_type == "SparkListenerExecutorBlacklisted":
                    blacklisted_executors.append({"executor_id": event.get("Executor ID"), "timestamp": event.get("Timestamp"), "reason": f"Blacklisted for {event.get('Task ID')} task failures"})

                elif event_type == "SparkListenerJobStart":
                    job_id = event.get("Job ID")
                    job_data[job_id] = {"start_time": event.get("Submission Time")}

                elif event_type == "SparkListenerJobEnd":
                    job_id = event.get("Job ID")
                    if job_id in job_data:
                        duration_ms = event.get("Completion Time") - job_data[job_id]["start_time"]
                        job_result = event.get("Job Result", {}).get("Result")
                        analysis_object.add_summary({ "event_type": "JobPerformanceSummary", "job_id": job_id, "status": job_result, "duration_s": duration_ms / 1000, "summary": f"Job {job_id} finished with status {job_result} in {duration_ms / 1000:.2f} seconds." })
                        del job_data[job_id]

                elif event_type == "SparkListenerTaskEnd":
                    stage_id = event.get("Stage ID")
                    if stage_id is not None:
                        stage_data.setdefault(stage_id, {"tasks": [], "failed_task_count": 0})
                        if event.get("Task End Reason", {}).get("Reason") != "Success": stage_data[stage_id]["failed_task_count"] += 1
                        task_info, task_metrics = event.get("Task Info", {}), event.get("Task Metrics", {})
                        duration_ms = task_info.get("Finish Time", 0) - task_info.get("Launch Time", 0)
                        stage_data[stage_id]["tasks"].append({ "duration_ms": duration_ms, "jvm_gc_time_ms": task_metrics.get("JVM GC Time", 0), "executor_run_time_ms": task_metrics.get("Executor Run Time", 1), "executor_deserialize_time_ms": task_metrics.get("Executor Deserialize Time", 0), "shuffle_fetch_wait_time_ms": task_metrics.get("Shuffle Read Metrics", {}).get("Fetch Wait Time", 0), "memory_spilled_bytes": task_metrics.get("Memory Bytes Spilled", 0), "disk_spilled_bytes": task_metrics.get("Disk Bytes Spilled", 0), })

                elif event_type == "SparkListenerStageCompleted":
                    stage_id = event.get("Stage Info", {}).get("Stage ID")
                    if stage_id in stage_data and len(stage_data[stage_id]["tasks"]) > 0:
                        tasks, failed_task_count = stage_data[stage_id]["tasks"], stage_data[stage_id]["failed_task_count"]
                        task_durations = [t["duration_ms"] for t in tasks]
                        total_duration, total_gc_time, total_runtime = sum(task_durations), sum(t["jvm_gc_time_ms"] for t in tasks), sum(t["executor_run_time_ms"] for t in tasks)
                        total_deserialize_time, total_fetch_wait_time = sum(t["executor_deserialize_time_ms"] for t in tasks), sum(t["shuffle_fetch_wait_time_ms"] for t in tasks)
                        total_spilled_bytes = sum(t["memory_spilled_bytes"] for t in tasks) + sum(t["disk_spilled_bytes"] for t in tasks)
                        summary_parts, potential_issues = [f"Stage {stage_id} completed with {len(tasks)} tasks."], []
                        if failed_task_count > 0: summary_parts.append(f"It experienced {failed_task_count} task failures that required retries."); potential_issues.append("TASK_FAILURES")
                        max_duration, median_duration = max(task_durations), median(task_durations)
                        if max_duration > 3 * median_duration and max_duration > 20000: potential_issues.append("DATA_SKEW"); summary_parts.append(f"Detected potential data skew. Max task duration: {max_duration/1000:.2f}s, median: {median_duration/1000:.2f}s.")
                        if total_runtime > 0 and (total_gc_time / total_runtime) > 0.10: potential_issues.append("HIGH_GC_PRESSURE"); summary_parts.append(f"High JVM GC pressure detected ({ (total_gc_time / total_runtime) * 100:.1f}% of runtime).")
                        if total_spilled_bytes > 0: potential_issues.append("DATA_SPILL"); summary_parts.append(f"Detected data spilling to disk ({total_spilled_bytes / (1024*1024):.2f} MB).")
                        if total_duration > 0 and (total_fetch_wait_time / total_duration) > 0.25: potential_issues.append("SHUFFLE_BOTTLENECK"); summary_parts.append(f"Significant shuffle bottleneck, tasks spent {(total_fetch_wait_time / total_duration) * 100:.1f}% of time waiting for data.")
                        if total_runtime > 0 and (total_deserialize_time / total_runtime) > 0.15: potential_issues.append("DESERIALIZATION_BOTTLENECK"); summary_parts.append(f"High deserialization time ({(total_deserialize_time / total_runtime) * 100:.1f}% of runtime).")
                        analysis_object.add_summary({ "event_type": "StagePerformanceSummary", "stage_id": stage_id, "metrics": { "task_count": len(tasks), "failed_task_count": failed_task_count, "max_task_duration_ms": max_duration, "median_task_duration_ms": median_duration, "total_spilled_mb": total_spilled_bytes / (1024*1024) }, "summary": " ".join(summary_parts), "potential_issues": potential_issues if potential_issues else ["NONE"] })

        if disk_related_executor_failures:
            total_failures = sum(v['count'] for v in disk_related_executor_failures.values())
            analysis_object.add_summary({ "event_type": "ExecutorDiskFailureSummary", "total_failures": total_failures, "summary": f"A critical error occurred where {total_failures} executors failed to launch due to running out of disk space on the nodes.", "failures_by_reason": disk_related_executor_failures })
        
        short_lived_executors = []
        churn_threshold_ms = 60000 
        for exec_id, times in executor_lifecycle.items():
            if 'add_time' in times and 'remove_time' in times and (times['remove_time'] - times['add_time']) < churn_threshold_ms:
                short_lived_executors.append({"executor_id": exec_id, "lifespan_s": (times['remove_time'] - times['add_time']) / 1000})
        if short_lived_executors:
            avg_lifespan_s = sum(e['lifespan_s'] for e in short_lived_executors) / len(short_lived_executors)
            analysis_object.add_summary({ "event_type": "ExecutorChurnSummary", "total_churn_events": len(short_lived_executors), "summary": f"Detected {len(short_lived_executors)} executors with an average lifespan of just {avg_lifespan_s:.2f}s, indicating severe instability.", "average_lifespan_s": round(avg_lifespan_s, 2), "sample_churned_executors": short_lived_executors[:3] })

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    return analysis_object

# ==========================================
# 2. VECTORIZATION (ChromaDB Integration)
# ==========================================

def get_chroma_collection():
    """Returns a persistent ChromaDB collection."""
    # Using PersistentClient to save data to disk so we don't re-index every run
    client = chromadb.PersistentClient(path=DB_PATH)
    # Using the default embedding model (all-MiniLM-L6-v2)
    return client.get_or_create_collection(name="spark_logs")

def ingest_logs(log_directory):
    """Parses logs and ingests them into ChromaDB."""
    collection = get_chroma_collection()
    
    # Check if we already have data (optional optimization)
    if collection.count() > 0:
        print(f"üìö Vector DB already contains {collection.count()} documents. Skipping ingestion.")
        print("To force re-ingestion, delete the 'spark_rag_db' folder.")
        return

    print(f"üìÇ Processing logs in: {log_directory}")
    all_analyses = []

    for filename in os.listdir(log_directory):
        full_path = os.path.join(log_directory, filename)
        if os.path.isfile(full_path):
            print(f"   - Parsing {filename}...")
            analysis = analyze_spark_log(full_path)
            if analysis:
                all_analyses.append(analysis)

    print("üß© Vectorizing and storing data...")
    documents, metadatas, ids = [], [], []

    for analysis in all_analyses:
        # Flatten the object
        summaries = (analysis.job_summaries + 
                     analysis.stage_summaries + 
                     list(analysis.application_summaries.values()))

        for i, summary in enumerate(summaries):
            # 1. Text to Embed
            documents.append(summary["summary"])
            
            # 2. Metadata Cleaning (Chroma requires simple types)
            meta = summary.copy()
            del meta["summary"]
            
            # Convert list/dict fields to strings for metadata storage
            for key, val in meta.items():
                if isinstance(val, (list, dict)):
                    meta[key] = str(val)
            
            metadatas.append(meta)
            ids.append(f"{analysis.application_id}_{i}")

    if documents:
        collection.add(documents=documents, metadatas=metadatas, ids=ids)
        print(f"‚úÖ Successfully ingested {len(documents)} log events into ChromaDB.")
    else:
        print("‚ö†Ô∏è No valid log events found to ingest.")

# ==========================================
# 3. LLM ORCHESTRATOR (The RAG Logic)
# ==========================================

def get_gemini_response(user_query, app_id, collection):
    """
    1. Retrieval: Search ChromaDB filtered by app_id.
    2. Augmentation: Build a prompt with the search results.
    3. Generation: Call Gemini API.
    """
    print(f"üîç Searching logs for App: {app_id}...")
    
    # 1. RETRIEVAL
    # We query for top 5 most relevant log summaries for this specific app
    results = collection.query(
        query_texts=[user_query],
        n_results=5,
        where={"application_id": app_id} 
    )

    retrieved_docs = results['documents'][0]
    retrieved_meta = results['metadatas'][0]

    if not retrieved_docs:
        return "I couldn't find any specific log events matching your question for this application ID."

    # 2. AUGMENTATION (Context Construction)
    context_parts = []
    for doc, meta in zip(retrieved_docs, retrieved_meta):
        # We combine the text summary with key metadata tags for the LLM
        # e.g. "Event: JobPerformanceSummary | Issues: DATA_SKEW | Text: Job 2 finished..."
        info_tag = f"[{meta.get('event_type', 'Event')}]"
        issue_tag = f"[Issues: {meta.get('potential_issues', 'None')}]"
        context_parts.append(f"{info_tag} {issue_tag}\nLog Summary: {doc}")

    context_str = "\n---\n".join(context_parts)

    system_prompt = f"""
    You are an expert Apache Spark Log Diagnostician. 
    You are analyzing performance logs for Spark Application: {app_id}.

    Here are the most relevant log summaries retrieved from the system:
    
    {context_str}

    USER QUESTION: "{user_query}"

    INSTRUCTIONS:
    1. specific technical root causes based ONLY on the provided log summaries.
    2. If the logs mention 'Data Skew', 'GC Pressure', or 'Shuffle', explain what that means.
    3. Suggest a concrete configuration change (e.g., 'Increase spark.executor.memory', 'Adjust spark.sql.shuffle.partitions').
    4. Keep the tone professional and concise.
    """

    # 3. GENERATION
    try:
        model = genai.GenerativeModel('gemini-2.5-flash')
        response = model.generate_content(system_prompt)
        return response.text
    except Exception as e:
        return f"‚ùå Error communicating with Gemini API: {e}"

# ==========================================
# 4. MAIN EXECUTION
# ==========================================

if __name__ == "__main__":
    # A. Setup paths
    # CHANGE THIS to your actual log folder path
    LOG_DIR = r"C:\Users\ddev\Documents\projects\spark-event-logs\test-logs"
    
    if not os.path.exists(LOG_DIR):
        print(f"‚ùå Error: Log directory '{LOG_DIR}' does not exist.")
        exit()

    # B. Ingest Logs (only runs if DB is empty)
    ingest_logs(LOG_DIR)
    
    # C. Get the collection for querying
    collection = get_chroma_collection()
    
    # D. Interactive Chat Loop
    print("\n" + "="*50)
    print("ü§ñ SPARK DIAGNOSTIC CHATBOT READY")
    print("="*50)
    
    # Simulating a user selecting an App ID (In a real UI, this is a dropdown)
    # For now, let's grab the first App ID available in the DB for testing
    existing_ids = collection.get(limit=1)['metadatas']
    if existing_ids:
        default_app = existing_ids[0].get('application_id')
        print(f"Detected Application ID in DB: {default_app}")
        target_app_id = input(f"Enter Application ID to analyze (Press Enter for '{default_app}'): ") or default_app
    else:
        print("No data in DB. Exiting.")
        exit()

    print(f"\nüí¨ Context set to: {target_app_id}")
    print("Type 'quit' to exit.\n")

    while True:
        user_input = input("You: ")
        if user_input.lower() in ['quit', 'exit']:
            break
        
        answer = get_gemini_response(user_input, target_app_id, collection)
        print(f"\nGemini:\n{answer}\n")

üìö Vector DB already contains 789 documents. Skipping ingestion.
To force re-ingestion, delete the 'spark_rag_db' folder.

ü§ñ SPARK DIAGNOSTIC CHATBOT READY
Detected Application ID in DB: app-20211011150934-6406


Enter Application ID to analyze (Press Enter for 'app-20211011150934-6406'):  



üí¨ Context set to: app-20211011150934-6406
Type 'quit' to exit.



You:  how long did my job run for ? 


üîç Searching logs for App: app-20211011150934-6406...

Gemini:
The provided logs indicate the following durations for individual jobs:

*   **Job 0:** 2.66 seconds
*   **Job 1:** 2.27 seconds
*   **Job 2:** 0.77 seconds

The longest individual job ran for **2.66 seconds**. Without further context on how these jobs are executed within the application (sequentially or in parallel), it is not possible to provide a single "total job run time" for the entire application.

**Identified Issues and Root Causes:**

The most significant issue identified is **severe executor instability**, as highlighted by the `ExecutorChurnSummary`: "Detected 54 executors with an average lifespan of just 0.53s, indicating severe instability." This extreme rate of executor termination indicates a critical underlying problem that can lead to poor performance, task failures, and wasted resources, even if the observed jobs completed successfully.

**Specific Technical Root Causes:**

1.  **Out-Of-Memory (OOM) Err

You:  how many other application ID's are you aware of ? 


üîç Searching logs for App: app-20211011150934-6406...

Gemini:
Based on the provided log summaries:

I am aware of **0 other application IDs** besides `app-20211011150934-6406`.

---

**Diagnosis for Spark Application: app-20211011150934-6406**

The performance logs indicate two critical issues impacting the application:

1.  **Severe Executor Instability / Churn:**
    *   **Root Cause:** The logs report 54 executors with an average lifespan of just 0.53 seconds, which is a strong indicator of repeated executor failures, most commonly due to OutOfMemoryError (OOM) exceptions. This leads to wasted resources, re-computation, and overall application instability.
    *   **Suggested Configuration Change:**
        *   Increase `spark.executor.memory` to provide more heap space for executors (e.g., `spark.executor.memory 20g`).
        *   Consider increasing `spark.executor.memoryOverhead` to account for off-heap memory usage by the JVM and external libraries (e.g., `spark.executor.memo

You:  what was the most prominent issue in my jobs ? 


üîç Searching logs for App: app-20211011150934-6406...

Gemini:
The most prominent issue in your Spark application is the **critical failure of executors to launch due to insufficient disk space on the nodes**. This directly led to severe instability, evidenced by 54 executors failing and having an average lifespan of just 0.53 seconds.

**Technical Root Cause:**
Executors are failing at startup because the underlying cluster nodes where they attempt to launch do not have adequate local disk space available. This prevents them from properly initializing their working directories, temporary files, or spill space, leading to immediate termination.

**Configuration Change Suggestion:**
To address this, you should adjust the `spark.local.dir` configuration. This property specifies the directories where Spark should use local disk.

**Concrete Configuration Change:**
*   **`spark.local.dir`**: Configure this property to point to a path on the cluster nodes that has significantly more avail

KeyboardInterrupt: Interrupted by user