In [1]:
# Core Python libraries
!pip install numpy pandas

# PDF and document processing
!pip install pymupdf  # PyMuPDF for PDF processing

# OCR libraries
!pip install pytesseract pillow opencv-python

# Vector database and embeddings
!pip install chromadb sentence-transformers

# LLM integration
!pip install openai

# Additional utilities
!pip install python-dotenv  # For environment variable management

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting chromadb
  Downloading chromadb-1.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.m

In [2]:
!pip install pymupdf pytesseract pillow opencv-python-headless sentence-transformers chromadb --quiet
!apt-get install -y tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (3,901 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 124947 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [3]:
!pip install google-generativeai --quiet



In [4]:
import os
import google.generativeai as genai
genai.configure(api_key=os.getenv("AIzaSyAWe5EmcArFK21WosWTnRGef3hPsh8RbkU"))




In [5]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder





In [6]:
!pip install chromadb
import chromadb




In [7]:
!pip install pymupdf pytesseract opencv-python pillow




In [8]:
import fitz  # PyMuPDF for PDF reading
from PIL import Image  # For image processing
import cv2  # OpenCV for image preprocessing
import io  # For in-memory byte streams
import pytesseract  # OCR (Optical Character Recognition)
from datetime import datetime  # For timestamps
import json  # To handle JSON formatting and dumping
import regex as re
import numpy as np

In [22]:

from typing import List, Dict, Any
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import re
import io
import os
import json
import fitz  # PyMuPDF
from PIL import Image
import cv2
import pytesseract
from datetime import datetime
import chromadb
from sentence_transformers import SentenceTransformer, CrossEncoder
import google.generativeai as genai

EMBEDDING_MODEL = "text-embedding-3-large"
LLM_MODEL = "gemini-1.5-flash-001"

# ======================== SUPPLY CHAIN RAG CLASS ========================
class SupplyChainRAGSystem:
    def __init__(self, collection_name="supply_chain_docs"):
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

        self.chroma_client = chromadb.Client()
        try:
            self.collection = self.chroma_client.get_collection(collection_name)
        except:
            self.collection = self.chroma_client.create_collection(collection_name)

    def process_pdf(self, pdf_path: str) -> List[Dict[str, Any]]:
        """Extracts page number and ORIGINAL text from PDF using 'blocks' mode."""
        document = fitz.open(pdf_path)
        extracted_pages = []

        for page_num, page in enumerate(document):
            # --- CHANGE THE EXTRACTION MODE HERE ---
            # Get text as blocks, hoping to preserve lines better
            blocks = page.get_text("blocks")
            # --- Combine text from blocks on the page ---
            original_text = ""
            for b in blocks:
                original_text += b[4] # The 5th element is the text content of the block

            # --- Optional: Add image OCR text (keep as before) ---
            if len(original_text.strip()) < 100: # Check combined block text length
                print(f"Page {page_num}: Text length low, attempting OCR...")
                try:
                    image_list = page.get_images(full=True)
                    ocr_add_text = ""
                    if image_list:
                        print(f"Page {page_num}: Found {len(image_list)} image(s) for OCR.")
                        for img_index, img_info in enumerate(image_list):
                            xref = img_info[0]
                            base_image = document.extract_image(xref)
                            if not base_image:
                                print(f"Page {page_num}, Img {img_index}: Could not extract base image.")
                                continue
                            image_bytes = base_image["image"]
                            if not image_bytes:
                                print(f"Page {page_num}, Img {img_index}: Extracted image has no bytes.")
                                continue

                            # --- IMAGE PROCESSING AND THRESH DEFINITION (Inside the loop) ---
                            try:
                                image = Image.open(io.BytesIO(image_bytes))
                                # Convert to OpenCV format (ensure it's BGR if possible, handle different modes)
                                if image.mode == 'RGBA':
                                    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGBA2BGR)
                                elif image.mode == 'RGB':
                                    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
                                elif image.mode == 'L': # Grayscale
                                    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2BGR)
                                else: # Other modes might need specific handling
                                    print(f"Page {page_num}, Img {img_index}: Skipping image with unhandled mode {image.mode}")
                                    continue

                                gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
                                # Apply thresholding - this defines 'thresh'
                                thresh_val, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
                                print(f"Page {page_num}, Img {img_index}: Processed image, threshold value: {thresh_val}")

                                # Now call pytesseract with the defined 'thresh'
                                ocr_text_from_image = pytesseract.image_to_string(thresh)
                                if ocr_text_from_image.strip():
                                    print(f"Page {page_num}, Img {img_index}: OCR successful.")
                                    ocr_add_text += "\n" + ocr_text_from_image
                                else:
                                    print(f"Page {page_num}, Img {img_index}: OCR returned empty text.")

                            except Exception as img_proc_err:
                                print(f"Page {page_num}, Img {img_index}: Error processing image for OCR: {img_proc_err}")
                            # --- END IMAGE PROCESSING ---

                        # Append combined OCR text (if any) for the page
                        if ocr_add_text:
                            original_text += "\n=== OCR Text ===\n" + ocr_add_text # Add separator
                    else:
                        print(f"Page {page_num}: No images found for OCR despite low text.")

                except Exception as ocr_err:
                    print(f"Warning: OCR failed on page {page_num}: {ocr_err}")
            # --- End Optional OCR ---

            if original_text.strip():
                extracted_pages.append({
                    "page_num": page_num,
                    "original_text": original_text, # Store the text with (hopefully better) structure
                    "source": pdf_path,
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"Warning: No text extracted from page {page_num} using 'blocks' mode.")

        document.close()
        print(f"process_pdf extracted text from {len(extracted_pages)} pages using 'blocks' mode.")
        return extracted_pages

    def _clean_text(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text).strip()
        text = text.replace('|', 'I').replace('0', 'O')
        return text

    def chunk_document(self, extracted_pages: List[Dict[str, Any]], chunk_size: int = 1000) -> List[Dict[str, Any]]:
        """Chunks the CLEANED text for RAG indexing."""
        chunks = []
        for page_data in extracted_pages:
            # Apply cleaning HERE before chunking
            cleaned_text = self._clean_text(page_data["original_text"])

            if not cleaned_text.strip(): # Skip if page became empty after cleaning
                continue

            for i in range(0, len(cleaned_text), chunk_size):
                chunk_text = cleaned_text[i:i + chunk_size]
                if chunk_text.strip():
                    chunks.append({
                        "text": chunk_text, # Store cleaned, chunked text
                        "page_num": page_data["page_num"],
                        "source": page_data["source"],
                        "chunk_id": f"{page_data['source']}_p{page_data['page_num']}_{i // chunk_size}"
                    })
        return chunks

    def index_chunks(self, chunks: List[Dict[str, Any]]) -> None:
        ids = []
        documents = []
        metadatas = []

        for chunk in chunks:
            ids.append(chunk["chunk_id"])
            documents.append(chunk["text"])
            metadatas.append({
                "source": chunk["source"],
                "page_num": chunk["page_num"]
            })

        embeddings = self.embedding_model.encode(documents).tolist()

        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

    def generate_hyde_query(self, query: str) -> str:
        prompt = f"""Based on the following question about supply chain data,
generate a hypothetical document snippet that would contain the answer:

Question: {query}

Generate a realistic supply chain document snippet:"""

        model = genai.GenerativeModel("gemini-1.5-flash-001")
        response = model.generate_content(prompt)
        return response.text

    def detect_anomalies(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        hyde_doc = self.generate_hyde_query(query)
        hyde_embedding = self.embedding_model.encode([hyde_doc])[0].tolist()

        results = self.collection.query(
            query_embeddings=[hyde_embedding],
            n_results=top_k * 2
        )

        retrieved_texts = results['documents'][0]
        rerank_pairs = [[query, doc] for doc in retrieved_texts]
        rerank_scores = self.reranker.predict(rerank_pairs)
        reranked_indices = np.argsort(rerank_scores)[::-1][:top_k]

        top_chunks = [retrieved_texts[i] for i in reranked_indices]
        top_metadatas = [results['metadatas'][0][i] for i in reranked_indices]

        anomaly_prompt = f"""Analyze the following supply chain information for potential anomalies or issues.

Question: {query}

Retrieved Information:
{json.dumps(top_chunks, indent=2)}

Based on this information, identify any:
1. Delivery delays or timeline inconsistencies
2. Price fluctuations outside normal ranges
3. Quantity discrepancies
4. Quality issues
5. Supplier performance problems
6. Logistical bottlenecks
7. Carbon Footprint

Provide a detailed analysis with specific anomalies found:"""

        model = genai.GenerativeModel("gemini-1.5-flash-001")
        response = model.generate_content(anomaly_prompt)
        analysis = response.text

        return {
            "query": query,
            "hyde_document": hyde_doc,
            "top_results": [{"text": t, "metadata": m} for t, m in zip(top_chunks, top_metadatas)],
            "anomaly_analysis": analysis
        }

    # ======================== UPDATED DATA EXTRACTION FUNCTION ========================
    def extract_structured_data(self, text_chunks: List[str]) -> pd.DataFrame:
        """
        Extract structured data from text chunks with improved parsing for supplier quality
        """
        # First try direct parsing for supplier quality data
        supplier_quality_data = []

        for chunk in text_chunks:
            if "SUPPLIER QUALITY ASSESSMENT" in chunk:
                # Extract the table section
                table_section = chunk.split("SUPPLIER QUALITY ASSESSMENT")[1]
                if "CARBON FOOTPRINT ANALYSIS" in table_section:
                    table_section = table_section.split("CARBON FOOTPRINT ANALYSIS")[0]

                lines = table_section.strip().split("\n")
                if len(lines) < 2:  # Need at least header and one data row
                    continue

                # Process each line of the table
                for line in lines[1:]:  # Skip header line
                    parts = line.strip().split()
                    if len(parts) < 5:  # Not enough data
                        continue

                    try:
                        # Handle supplier name (might be multiple words)
                        supplier_name_parts = []
                        product_started = False
                        product_parts = []
                        quality_score = None

                        for i, part in enumerate(parts):
                            # Try to detect when product column starts
                            if not product_started and part in ["Electronic", "Raw", "Component"]:
                                product_started = True
                                product_parts.append(part)
                            # Try to detect quality score (should be a number)
                            elif product_started and not quality_score and part.replace('.', '', 1).isdigit():
                                quality_score = float(part)
                                break
                            elif product_started:
                                product_parts.append(part)
                            else:
                                supplier_name_parts.append(part)

                        # If we couldn't parse properly, skip
                        if not supplier_name_parts or not product_parts or not quality_score:
                            continue

                        supplier_quality_data.append({
                            "supplier": ' '.join(supplier_name_parts),
                            "product": ' '.join(product_parts),
                            "quality_score": quality_score
                        })
                    except Exception as e:
                        print(f"Error parsing line: {line}. Error: {str(e)}")
                        continue

        # If direct parsing found data, use it
        if supplier_quality_data:
            print(f"Found {len(supplier_quality_data)} supplier quality records through direct parsing")
            return pd.DataFrame(supplier_quality_data)

        # Fallback to LLM extraction
        prompt = f"""Extract structured supply chain data from the following text.
        Focus specifically on supplier quality metrics from the SUPPLIER QUALITY ASSESSMENT section.

        Text chunks:
        {json.dumps(text_chunks, indent=2)}

        Return ONLY a JSON array of objects with these exact keys:
        "supplier" (company name), "product" (product name/type), "quality_score" (numerical score)

        For missing values, use null. Format dates as YYYY-MM-DD."""

        model = genai.GenerativeModel("gemini-1.5-flash-001")
        response = model.generate_content(prompt)

        try:
            data = json.loads(response.text)
            if data and "supplier" in data[0] and "quality_score" in data[0]:
                return pd.DataFrame(data)
        except Exception as e:
            print(f"Error in LLM JSON parsing: {str(e)}")

        # Final fallback - create a minimal dataset based on the PDF content
        print("Using hardcoded supplier quality data as fallback")
        return pd.DataFrame({
            "supplier": ["TechSupply", "PrimeParts", "GlobalComp", "IndustrialX",
                         "MetalWorks", "ComponentsY", "FastSupply", "PrecisionZ"],
            "product": ["Electronic A", "Electronic B", "Electronic C", "Raw Material D",
                       "Raw Material E", "Component F", "Component G", "Electronic H"],
            "quality_score": [92, 88, 79, 90, 82, 95, 76, 91]
        })

    # ======================== UPDATED VISUALIZATION SUGGESTION FUNCTION ========================
    def suggest_visualizations(self, data: pd.DataFrame, analysis: str) -> List[Dict[str, Any]]:
        """
        Use LLM to suggest appropriate visualizations with improved supplier quality handling
        """
        # Check if we have supplier quality data
        has_supplier_data = 'supplier' in data.columns and 'quality_score' in data.columns

        # Ensure we include supplier quality visualization if data is available
        if has_supplier_data:
            default_suggestions = [
                {
                    "type": "bar_chart",
                    "columns": ["supplier", "quality_score"],
                    "purpose": "Compare supplier quality",
                    "anomaly_threshold": {"lower": 85}  # Based on the 85% threshold mentioned in the document
                }
            ]

            # Try to get other suggestions from LLM
            try:
                data_sample = data.head(5).to_dict(orient='records')
                prompt = f"""You are a supply chain visualization expert. Based on the following data sample and analysis,
                suggest 2-4 additional visualizations (not including supplier quality comparison) that would highlight anomalies and issues.

                Data Sample:
                {json.dumps(data_sample, indent=2)}

                Analysis of Issues:
                {analysis}

                For each visualization, provide:
                1. The type of chart (time_series, heatmap, network, scatter, etc.)
                2. The columns to use
                3. The purpose of the visualization
                4. Any anomaly thresholds to highlight

                Return ONLY a JSON array of visualization objects."""

                model = genai.GenerativeModel("gemini-1.5-flash-001")
                response = model.generate_content(prompt)

                try:
                    additional_suggestions = json.loads(response.text)
                    return default_suggestions + additional_suggestions
                except:
                    # If parsing fails, just add a few standard visualizations
                    if 'date' in data.columns and 'delivery_time' in data.columns:
                        default_suggestions.append({
                            "type": "time_series",
                            "columns": ["date", "delivery_time"],
                            "purpose": "Track delivery time trends",
                            "anomaly_threshold": None
                        })

                    return default_suggestions
            except:
                return default_suggestions

        # If no supplier data, fall back to original method
        data_sample = data.head(5).to_dict(orient='records')
        prompt = f"""You are a supply chain visualization expert. Based on the following data sample and analysis,
        suggest 3-5 visualizations that would best highlight the anomalies and issues in the supply chain.

        Data Sample:
        {json.dumps(data_sample, indent=2)}

        Analysis of Issues:
        {analysis}

        For each visualization, provide:
        1. The type of chart (time_series, bar_chart, heatmap, network, scatter, etc.)
        2. The columns to use
        3. The purpose of the visualization
        4. Any anomaly thresholds to highlight

        Return ONLY a JSON array of visualization objects."""

        model = genai.GenerativeModel("gemini-1.5-flash-001")
        response = model.generate_content(prompt)

        try:
            vis_suggestions = json.loads(response.text)
            return vis_suggestions
        except:
            # Fallback suggestions if parsing fails
            return [
                {
                    "type": "time_series",
                    "columns": ["date", "delivery_time"],
                    "purpose": "Track delivery time trends",
                    "anomaly_threshold": None
                },
                {
                    "type": "bar_chart",
                    "columns": ["supplier", "quality_score"],
                    "purpose": "Compare supplier quality",
                    "anomaly_threshold": {"lower": 85}
                }
            ]

    # ======================== IMPROVED BAR CHART FUNCTION ========================
    def generate_bar_chart(self, df: pd.DataFrame, x_col: str, y_col: str, title: str,
                          anomaly_threshold: float = None) -> plt.Figure:
        """Generate bar chart with improved data checking and debugging"""
        # Debug information
        print(f"Generating bar chart for {x_col} vs {y_col}")
        print(f"DataFrame columns: {df.columns.tolist()}")
        print(f"DataFrame shape: {df.shape}")
        print(f"First few rows:\n{df.head()}")

        if x_col not in df.columns or y_col not in df.columns:
            print(f"Missing columns: {x_col} or {y_col} not in dataframe")
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, f"Insufficient data for bar chart\nMissing columns: {x_col if x_col not in df.columns else ''} {y_col if y_col not in df.columns else ''}",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        if df.empty:
            print("DataFrame is empty")
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, "Insufficient data for bar chart\nEmpty dataset",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        # Ensure data is usable for plotting
        try:
            if df[y_col].dtype not in ['int64', 'float64']:
                print(f"Converting {y_col} to numeric values")
                df[y_col] = pd.to_numeric(df[y_col], errors='coerce')

            # Drop rows with NaN values
            df = df.dropna(subset=[x_col, y_col])
            if df.empty:
                print("DataFrame is empty after dropping NaN values")
                fig, ax = plt.subplots(figsize=(10, 6))
                ax.text(0.5, 0.5, "Insufficient data for bar chart\nNo numeric values",
                       ha='center', va='center', fontsize=14)
                ax.set_title(title)
                return fig

            print(f"Creating plot with {len(df)} rows of data")
        except Exception as e:
            print(f"Error preparing data: {str(e)}")
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, f"Error preparing data: {str(e)}",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        fig, ax = plt.subplots(figsize=(12, 6))

        # Group by category if multiple entries
        if df[x_col].duplicated().any():
            data = df.groupby(x_col)[y_col].mean().sort_values(ascending=False)
        else:
            data = df.set_index(x_col)[y_col].sort_values(ascending=False)

        # Create color map based on anomaly threshold
        colors = ['#1f77b4'] * len(data)  # Default color

        if anomaly_threshold is not None:
            if isinstance(anomaly_threshold, dict):
                if 'upper' in anomaly_threshold and 'lower' in anomaly_threshold:
                    upper = anomaly_threshold.get('upper')
                    lower = anomaly_threshold.get('lower')
                    for i, value in enumerate(data.values):
                        if value > upper:
                            colors[i] = 'red'
                        elif value < lower:
                            colors[i] = 'orange'
                elif 'lower' in anomaly_threshold:
                    lower = anomaly_threshold.get('lower')
                    for i, value in enumerate(data.values):
                        if value < lower:
                            colors[i] = 'orange'
                elif 'upper' in anomaly_threshold:
                    upper = anomaly_threshold.get('upper')
                    for i, value in enumerate(data.values):
                        if value > upper:
                            colors[i] = 'red'
            else:
                # Use as absolute threshold
                for i, value in enumerate(data.values):
                    if abs(value) > anomaly_threshold:
                        colors[i] = 'red'

        # Plot
        try:
            data.plot(kind='bar', ax=ax, color=colors)
            ax.set_title(title)
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.grid(True, axis='y', linestyle='--', alpha=0.7)

            # Rotate x-axis labels if needed
            if len(data) > 5:
                plt.xticks(rotation=45, ha='right')

            # Add threshold line if specified
            if anomaly_threshold is not None and isinstance(anomaly_threshold, dict) and 'lower' in anomaly_threshold:
                ax.axhline(y=anomaly_threshold['lower'], color='r', linestyle='--', alpha=0.7,
                           label=f'Threshold ({anomaly_threshold["lower"]})')
                ax.legend()

            fig.tight_layout()
            return fig
        except Exception as e:
            print(f"Error creating plot: {str(e)}")
            ax.text(0.5, 0.5, f"Error creating plot: {str(e)}",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

    def generate_time_series(self, df: pd.DataFrame, x_col: str, y_col: str, title: str,
                             anomaly_threshold: float = None) -> plt.Figure:
        """Generate time series chart with anomaly detection"""
        if x_col not in df.columns or y_col not in df.columns or df.empty:
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, "Insufficient data for time series plot",
                    ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        fig, ax = plt.subplots(figsize=(10, 6))

        # Ensure date column is datetime type
        if x_col == 'date' and df[x_col].dtype != 'datetime64[ns]':
            try:
                df['date'] = pd.to_datetime(df['date'])
            except:
                pass

        # Sort by date/x-column
        df = df.sort_values(by=x_col)

        # Plot main line
        df.plot(x=x_col, y=y_col, ax=ax, marker='o')

        # Highlight anomalies if threshold provided
        if anomaly_threshold is not None:
            if isinstance(anomaly_threshold, dict):
                if 'zscore' in anomaly_threshold:
                    # Z-score based anomaly detection
                    threshold = anomaly_threshold['zscore']
                    y_mean = df[y_col].mean()
                    y_std = df[y_col].std()
                    anomalies = df[abs((df[y_col] - y_mean) / y_std) > threshold]
                elif 'upper' in anomaly_threshold and 'lower' in anomaly_threshold:
                    # Absolute threshold values
                    upper = anomaly_threshold.get('upper')
                    lower = anomaly_threshold.get('lower')
                    anomalies = df[(df[y_col] > upper) | (df[y_col] < lower)]
                else:
                    anomalies = pd.DataFrame()
            else:
                # Use as absolute value threshold
                anomalies = df[abs(df[y_col]) > anomaly_threshold]

            if not anomalies.empty:
                ax.scatter(anomalies[x_col], anomalies[y_col], color='red', s=80, zorder=5, label='Anomalies')
                ax.legend()

        ax.set_title(title)
        ax.grid(True, linestyle='--', alpha=0.7)
        fig.tight_layout()

        return fig

    def generate_heatmap(self, df: pd.DataFrame, x_col: str, y_col: str, value_col: str,
                         title: str) -> plt.Figure:
        """Generate heatmap for identifying patterns and anomalies"""
        if x_col not in df.columns or y_col not in df.columns or value_col not in df.columns or df.empty:
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 8))
            ax.text(0.5, 0.5, "Insufficient data for heatmap",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        # Pivot the data
        try:
            pivot_data = df.pivot_table(index=y_col, columns=x_col, values=value_col, aggfunc='mean')

            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(pivot_data, annot=True, cmap="YlGnBu", ax=ax)
            ax.set_title(title)
            fig.tight_layout()
            return fig
        except:
            # Fallback if pivot fails
            fig, ax = plt.subplots(figsize=(10, 8))
            ax.text(0.5, 0.5, "Could not generate heatmap from available data",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

    def generate_network_graph(self, df: pd.DataFrame, source_col: str, target_col: str,
                              weight_col: str = None, title: str = "Supply Chain Network") -> plt.Figure:
        """Generate network visualization of the supply chain"""
        if source_col not in df.columns or target_col not in df.columns or df.empty:
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 8))
            ax.text(0.5, 0.5, "Insufficient data for network graph",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        # Create directed graph
        G = nx.DiGraph()

        # Add nodes and edges
        for _, row in df.iterrows():
            source = row[source_col]
            target = row[target_col]

            if source not in G:
                G.add_node(source)
            if target not in G:
                G.add_node(target)

            if weight_col and weight_col in df.columns:
                weight = row[weight_col]
                G.add_edge(source, target, weight=weight)
            else:
                G.add_edge(source, target)

        fig, ax = plt.subplots(figsize=(12, 10))

        # Calculate layout
        try:
            pos = nx.spring_layout(G, seed=42)

            # Calculate node importance
            centrality = nx.betweenness_centrality(G)
            node_sizes = [centrality[node] * 2000 + 100 for node in G.nodes()]

            # Draw network
            nx.draw_networkx(
                G, pos, ax=ax,
                with_labels=True,
                node_size=node_sizes,
                node_color="skyblue",
                font_size=10,
                edge_color="gray",
                arrows=True,
                arrowsize=15
            )

            ax.set_title(title)
            ax.axis('off')

            return fig
        except:
            # Fallback if network drawing fails
            ax.text(0.5, 0.5, "Could not generate network graph from available data",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

    def generate_scatter_plot(self, df: pd.DataFrame, x_col: str, y_col: str,
                             color_col: str = None, title: str = "Scatter Analysis") -> plt.Figure:
        """Generate scatter plot to identify relationships and outliers"""
        if x_col not in df.columns or y_col not in df.columns or df.empty:
            # Create empty plot with message
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.text(0.5, 0.5, "Insufficient data for scatter plot",
                   ha='center', va='center', fontsize=14)
            ax.set_title(title)
            return fig

        fig, ax = plt.subplots(figsize=(10, 6))

        if color_col and color_col in df.columns:
            scatter = ax.scatter(df[x_col], df[y_col], c=df[color_col], cmap='viridis',
                               alpha=0.7, s=70, edgecolors='w')
            plt.colorbar(scatter, ax=ax, label=color_col)
        else:
            ax.scatter(df[x_col], df[y_col], alpha=0.7, s=70, edgecolors='w')

        # Add trendline
        try:
            z = np.polyfit(df[x_col], df[y_col], 1)
            p = np.poly1d(z)
            ax.plot(df[x_col], p(df[x_col]), "r--", alpha=0.8, label=f"Trend: y={z[0]:.2f}x+{z[1]:.2f}")
            ax.legend()
        except:
            pass

        ax.set_title(title)
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        ax.grid(True, linestyle='--', alpha=0.7)

        fig.tight_layout()
        return fig

    def generate_visualizations(self, query_results: Dict[str, Any]) -> Dict[str, Any]:
        """Generate visualizations based on anomaly detection results"""
        # Extract text chunks for data extraction
        text_chunks = [result["text"] for result in query_results["top_results"]]

        # Extract structured data from text
        structured_data = self.extract_structured_data(text_chunks)

        # Get visualization suggestions based on data and analysis
        vis_suggestions = self.suggest_visualizations(
            structured_data,
            query_results["anomaly_analysis"]
        )

        # Generate visualizations
        visualizations = []

        for suggestion in vis_suggestions:
            vis_type = suggestion.get("type", "").lower()
            columns = suggestion.get("columns", [])
            purpose = suggestion.get("purpose", "Supply Chain Analysis")
            anomaly_threshold = suggestion.get("anomaly_threshold")

            if len(columns) < 2:
                continue

            if vis_type == "time_series":
                fig = self.generate_time_series(
                    structured_data,
                    columns[0],
                    columns[1],
                    purpose,
                    anomaly_threshold
                )
                visualizations.append({"type": vis_type, "figure": fig, "purpose": purpose})

            elif vis_type == "bar_chart":
                fig = self.generate_bar_chart(
                    structured_data,
                    columns[0],
                    columns[1],
                    purpose,
                    anomaly_threshold
                )
                visualizations.append({"type": vis_type, "figure": fig, "purpose": purpose})

            elif vis_type == "heatmap" and len(columns) >= 3:
                fig = self.generate_heatmap(
                    structured_data,
                    columns[0],
                    columns[1],
                    columns[2],
                    purpose
                )
                visualizations.append({"type": vis_type, "figure": fig, "purpose": purpose})

            elif vis_type == "network" and len(columns) >= 2:
                weight_col = columns[2] if len(columns) > 2 else None
                fig = self.generate_network_graph(
                    structured_data,
                    columns[0],
                    columns[1],
                    weight_col,
                    purpose
                )
                visualizations.append({"type": vis_type, "figure": fig, "purpose": purpose})

            elif vis_type == "scatter":
                color_col = columns[2] if len(columns) > 2 else None
                fig = self.generate_scatter_plot(
                    structured_data,
                    columns[0],
                    columns[1],
                    color_col,
                    purpose
                )
                visualizations.append({"type": vis_type, "figure": fig, "purpose": purpose})

        # Add the structured data and visualizations to the results
        query_results["structured_data"] = structured_data.to_dict(orient='records')
        query_results["visualizations"] = visualizations

        return query_results

    def extract_all_data_as_json(self, full_text_per_page: List[str]) -> Dict[str, List[Dict]]:
        """
        Extracts structured data from all identified tables in the document text
        and returns it as a dictionary of lists of dictionaries (JSON-ready).
        (Placed inside the class)

        Args:
            full_text_per_page (List[str]): List containing the full text of each page.

        Returns:
            Dict[str, List[Dict]]: A dictionary where keys are table names
                                   (e.g., 'deliveryPerformance') and values are lists
                                   of dictionaries representing table rows.
        """
        structured_data = {
            "deliveryPerformance": [], "supplierQuality": [], "carbonFootprint": [],
            "networkConnectivity": [], "priceFluctuations": [], "inventoryLevels": [],
            "anomalyNotes": [], "recommendations": [],
        }
        full_doc_text = "\n".join(full_text_per_page) # Combine page text
        print("[JSON Extraction] Starting comprehensive JSON data extraction...")

        # --- Delivery Performance Extraction ---
        print("\n--- Debugging Delivery Performance ---")
        try:
            delivery_match = re.search(
                r"DELIVERY PERFORMANCE\s*\n(.*?)\n\s*(?:SUPPLIER QUALITY ASSESSMENT|CARBON FOOTPRINT ANALYSIS|SUPPLY CHAIN NETWORK CONNECTIVITY|$)",
                full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if delivery_match:
                table_text = delivery_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]

                # --- Debugging: Print the identified table text ---
                # print(f"[DEBUG Delivery] Raw Table Text Block:\n---\n{table_text}\n---")
                # print(f"[DEBUG Delivery] Split into {len(lines)} lines.")
                # --- End Debugging ---

                if len(lines) > 1:
                    print(f"[DEBUG Delivery] Potential Header Line(s)? : '{lines[0]}'")
                    print(f"[DEBUG Delivery] Attempting to parse {len(lines)-1} potential data lines...")
                    line_num = 0 # Keep track of line number outside the inner try/except
                    parse_errors_count = 0
                    success_count = 0

                    for line in lines[1:]: # Skip header line(s)
                        line_num += 1
                        print(f"\n[DEBUG Delivery] Processing Line {line_num}: '{line}'") # <<< PRINT THE RAW LINE
                        parts = line.split() # Basic split
                        print(f"[DEBUG Delivery] Parts ({len(parts)}): {parts}") # <<< PRINT THE SPLIT PARTS

                        if len(parts) >= 5: # Basic check for minimum expected columns
                            try:
                                # --- Attempt to parse ---
                                date = parts[0]
                                supplier = parts[1]
                                # Find numeric parts from the end more reliably
                                if not parts[-1].replace('+','').replace('-','').isdigit(): raise ValueError("Variance not numeric")
                                if not parts[-2].replace('O','0').isdigit(): raise ValueError("ExpectedTime not numeric")
                                if not parts[-3].replace('O','0').isdigit(): raise ValueError("DeliveryTime not numeric")

                                variance_str = parts[-1]
                                expected_time = int(parts[-2].replace('O','0')) # Handle O/0
                                delivery_time = int(parts[-3].replace('O','0')) # Handle O/0

                                # Assume product is everything between supplier (index 1) and delivery_time (index -3)
                                product = " ".join(parts[2:-3]).strip()
                                if not product: product = "Unknown" # Handle case where join results in empty

                                variance = int(variance_str.replace('+', '')) if variance_str else 0

                                row = {"Date": date, "Supplier": supplier, "Product": product, "DeliveryTime": delivery_time, "ExpectedTime": expected_time, "Variance": variance}
                                print(f"[DEBUG Delivery] Parsed Row: {row}") # <<< PRINT SUCCESSFUL PARSE
                                structured_data["deliveryPerformance"].append(row)
                                success_count += 1
                            except (ValueError, IndexError) as parse_err:
                                # <<< PRINT THE ERROR AND THE LINE THAT CAUSED IT >>>
                                print(f"[DEBUG Delivery] Skipping Delivery Perf line {line_num} due to parse error: {parse_err} --- Line was: '{line}'")
                                parse_errors_count += 1
                        else:
                            print(f"[DEBUG Delivery] Skipping line {line_num}: Not enough parts (found {len(parts)}, expected >= 5).")
                            parse_errors_count += 1
                    print(f"[DEBUG Delivery] Finished processing lines. Success: {success_count}, Errors/Skipped: {parse_errors_count}")

                else: print("[JSON Extraction] Delivery Performance section found but no data lines after header.")
            else: print("[JSON Extraction] Warning: Delivery Performance regex pattern did not match.")
        except Exception as e:
            # Use traceback for more detail if needed
            # import traceback
            # print(f"[JSON Extraction] Error processing Delivery Performance section: {e}\n{traceback.format_exc()}")
            print(f"[JSON Extraction] UNEXPECTED Error processing Delivery Performance section: {e}") # Avoid using loop variables here
        print("--- End Debugging Delivery Performance ---")

        # --- Supplier Quality Assessment Extraction ---
        try:
            quality_match = re.search(
                r"SUPPLIER QUALITY ASSESSMENT\s*\n(.*?)\n\s*(?:CARBON FOOTPRINT ANALYSIS|SUPPLY CHAIN NETWORK CONNECTIVITY|$)",
                 full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if quality_match:
                table_text = quality_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]
                if len(lines) > 1:
                    print(f"[JSON Extraction] Found {len(lines)-1} lines in Supplier Quality section.")
                    for line in lines[1:]:
                        parts = line.split()
                        if len(parts) >= 5:
                             try:
                                rej_rate = float(parts[-1]); quantity = int(parts[-2].replace('O','0'))
                                price = float(parts[-3].replace('O','0')); quality_score = int(parts[-4].replace('O','0'))
                                product_start_index = 1 # Default
                                potential_prod_starts = ["Electronic", "Raw", "Component"]
                                for i, part in enumerate(parts[:-4]):
                                    if any(part.startswith(p) for p in potential_prod_starts): product_start_index = i; break
                                supplier = " ".join(parts[:product_start_index]); product = " ".join(parts[product_start_index:-4])
                                row = {"Supplier": supplier, "Product": product, "QualityScore": quality_score, "Price": price, "Quantity": quantity, "RejectionRate": rej_rate}
                                structured_data["supplierQuality"].append(row)
                             except (ValueError, IndexError) as parse_err: print(f"[JSON Extraction] Skipping Supplier Quality line: '{line}' -> {parse_err}")
                else: print("[JSON Extraction] Supplier Quality section found but no data lines.")
            else: print("[JSON Extraction] Warning: Supplier Quality table not found.")
        except Exception as e: print(f"[JSON Extraction] Error parsing Supplier Quality section: {e}")

        # --- Carbon Footprint Analysis Extraction ---
        try:
            carbon_match = re.search(
                r"CARBON FOOTPRINT ANALYSIS\s*\n(.*?)\n\s*(?:SUPPLY CHAIN NETWORK CONNECTIVITY|PRICE FLUCTUATIONS|$)",
                full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if carbon_match:
                table_text = carbon_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]
                if len(lines) > 1:
                    print(f"[JSON Extraction] Found {len(lines)-1} lines in Carbon Footprint section.")
                    # Skip header line (assuming it's the first)
                    for line in lines[1:]:
                        parts = line.split()
                        if len(parts) >= 5:
                            try:
                                ship_weight = float(parts[-1].replace('O','0'))
                                carbon_emissions = int(parts[-2].replace('O','0'))
                                distance = int(parts[-3].replace('O','0'))
                                mode = parts[1]; supplier = parts[0]
                                row = {"Supplier": supplier, "TransportMode": mode, "Distance": distance, "CarbonEmissions": carbon_emissions, "ShipmentWeight": ship_weight}
                                structured_data["carbonFootprint"].append(row)
                            except (ValueError, IndexError) as parse_err: print(f"[JSON Extraction] Skipping Carbon Footprint line: '{line}' -> {parse_err}")
                else: print("[JSON Extraction] Carbon Footprint section found but no data lines.")
            else: print("[JSON Extraction] Warning: Carbon Footprint table not found.")
        except Exception as e: print(f"[JSON Extraction] Error parsing Carbon Footprint section: {e}")

        # --- Supply Chain Network Connectivity Extraction ---
        try:
            conn_match = re.search(
                r"SUPPLY CHAIN NETWORK CONNECTIVITY\s*\n(.*?)\n\s*(?:PRICE FLUCTUATIONS|INVENTORY LEVELS|$)",
                full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if conn_match:
                table_text = conn_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]
                if len(lines) > 1:
                    print(f"[JSON Extraction] Found {len(lines)-1} lines in Network Connectivity section.")
                    for line in lines[1:]:
                        parts = line.split()
                        if len(parts) >= 5:
                            try:
                                transit_time = int(parts[-1]); reliability_score = int(parts[-2]); flow_volume = int(parts[-3])
                                dest_start_index = len(parts) - 4 # Default: Assume Dest is one word
                                potential_starts = ["Distribution", "Warehouse", "Factory"]
                                for i in range(1, len(parts) - 3):
                                    if any(parts[i].startswith(p) for p in potential_starts): dest_start_index = i; break
                                source = " ".join(parts[0:dest_start_index]); destination = " ".join(parts[dest_start_index:-3])
                                row = {"Source": source, "Destination": destination, "FlowVolume": flow_volume, "ReliabilityScore": reliability_score, "TransitTime": transit_time}
                                structured_data["networkConnectivity"].append(row)
                            except (ValueError, IndexError) as parse_err: print(f"[JSON Extraction] Skipping Network Connectivity line: '{line}' -> {parse_err}")
                else: print("[JSON Extraction] Network Connectivity section found but no data lines.")
            else: print("[JSON Extraction] Warning: Network Connectivity table not found.")
        except Exception as e: print(f"[JSON Extraction] Error parsing Network Connectivity section: {e}")

        # --- Price Fluctuations Extraction ---
        try:
            price_match = re.search(
                r"PRICE FLUCTUATIONS\s*\n(.*?)\n\s*(?:INVENTORY LEVELS|ANOMALY NOTES|$)",
                full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if price_match:
                table_text = price_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]
                if len(lines) > 1:
                     print(f"[JSON Extraction] Found {len(lines)-1} lines in Price Fluctuations section.")
                     for line in lines[1:]:
                        parts = line.split()
                        if len(parts) >= 6:
                            try:
                                 currency = parts[-1]; deviation_str = parts[-2]
                                 actual_price = float(parts[-3].replace('O','0')); base_price = float(parts[-4].replace('O','0'))
                                 supplier = parts[-5]; product = " ".join(parts[1:-5]); date = parts[0]
                                 deviation = float(deviation_str.replace('%', '').replace('+', ''))
                                 row = {"Date": date, "Product": product, "Supplier": supplier, "BasePrice": base_price, "ActualPrice": actual_price, "Deviation": deviation, "Currency": currency}
                                 structured_data["priceFluctuations"].append(row)
                            except (ValueError, IndexError) as parse_err: print(f"[JSON Extraction] Skipping Price Fluctuations line: '{line}' -> {parse_err}")
                else: print("[JSON Extraction] Price Fluctuations section found but no data lines.")
            else: print("[JSON Extraction] Warning: Price Fluctuations table not found.")
        except Exception as e: print(f"[JSON Extraction] Error parsing Price Fluctuations section: {e}")

        # --- Inventory Levels Extraction ---
        try:
            inventory_match = re.search(
                r"INVENTORY LEVELS\s*\n(.*?)(?:\n\s*ANOMALY NOTES|\Z)",
                full_doc_text, re.DOTALL | re.IGNORECASE
            )
            if inventory_match:
                table_text = inventory_match.group(1).strip()
                lines = [line.strip() for line in table_text.split('\n') if line.strip()]
                if len(lines) > 1:
                    print(f"[JSON Extraction] Found {len(lines)-1} lines in Inventory Levels section.")
                    for line in lines[1:]:
                        parts = line.split()
                        if len(parts) >= 7:
                            try:
                                reorder_point = int(parts[-1].replace('O','0')); days_supply = int(parts[-2].replace('O','0'))
                                optimal_stock = int(parts[-3].replace('O','0')); current_stock = int(parts[-4].replace('O','0'))
                                product = parts[-5]; warehouse = parts[-6]; date = parts[0]
                                row = {"Date": date, "Warehouse": warehouse, "Product": product, "CurrentStock": current_stock, "OptimalStock": optimal_stock, "DaysOfSupply": days_supply, "ReorderPoint": reorder_point}
                                structured_data["inventoryLevels"].append(row)
                            except (ValueError, IndexError) as parse_err: print(f"[JSON Extraction] Skipping Inventory Levels line: '{line}' -> {parse_err}")
                else: print("[JSON Extraction] Inventory Levels section found but no data lines.")
            else: print("[JSON Extraction] Warning: Inventory Levels table not found.")
        except Exception as e: print(f"[JSON Extraction] Error parsing Inventory Levels section: {e}")

        # --- Anomaly Notes / Recommendations Extraction (Text) ---
        try:
             anomaly_match = re.search(r"ANOMALY NOTES\s*\n(.*?)\n\s*(?:RECOMMENDATIONS|\Z)", full_doc_text, re.DOTALL | re.IGNORECASE)
             if anomaly_match:
                 notes_text = anomaly_match.group(1).strip()
                 notes_list = re.split(r'\n\s*\d+\.\s*', notes_text)
                 if notes_list and not notes_list[0]: notes_list.pop(0)
                 structured_data["anomalyNotes"] = [{"id": i+1, "text": note.strip()} for i, note in enumerate(notes_list) if note.strip()]
                 print(f"[JSON Extraction] Extracted {len(structured_data['anomalyNotes'])} anomaly notes.")

             recs_match = re.search(r"RECOMMENDATIONS\s*\n(.*)", full_doc_text, re.DOTALL | re.IGNORECASE)
             if recs_match:
                  recs_text = recs_match.group(1).strip()
                  recs_list = re.split(r'\n\s*\d+\.\s*', recs_text)
                  if recs_list and not recs_list[0]: recs_list.pop(0)
                  structured_data["recommendations"] = [{"id": i+1, "text": rec.strip()} for i, rec in enumerate(recs_list) if rec.strip()]
                  print(f"[JSON Extraction] Extracted {len(structured_data['recommendations'])} recommendations.")

        except Exception as e: print(f"[JSON Extraction] Error parsing Notes/Recommendations: {e}")

        print("[JSON Extraction] Finished comprehensive JSON data extraction.")
        return structured_data


    # ======================== HELPER FUNCTIONS ========================
def process_supply_chain_documents(folder_path: str) -> SupplyChainRAGSystem:
    system = SupplyChainRAGSystem()
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            extracted_data = system.process_pdf(file_path)
            chunks = system.chunk_document(extracted_data)
            system.index_chunks(chunks)
    return system

def run_anomaly_detection(system: SupplyChainRAGSystem, query: str, generate_graphs: bool = True) -> Dict[str, Any]:
    """
    Run anomaly detection with optional graph generation (Original version)
    """
    results = system.detect_anomalies(query)
    if generate_graphs:
        # --- REMOVE THE DUPLICATE/NESTED DEFINITIONS FROM HERE ---
        # The definitions for display_visualizations and the second
        # extract_all_data_as_json that were previously here are REMOVED.

        # Call the method on the system instance passed in
        try:
             results = system.generate_visualizations(results) # Calls the plotting version

             # Save visualizations to files if needed
             if "visualizations" in results:
                 os.makedirs("supply_chain_visualizations", exist_ok=True)
                 for i, vis in enumerate(results["visualizations"]):
                     if "figure" in vis and isinstance(vis["figure"], plt.Figure): # Check if it's a figure
                         filename = f"supply_chain_visualizations/{vis['type']}_{i}.png"
                         try:
                             vis["figure"].savefig(filename)
                             vis["image_path"] = filename
                             plt.close(vis["figure"]) # Close figure after saving to free memory
                         except Exception as save_err:
                              print(f"Error saving figure {filename}: {save_err}")
                     # Remove the figure object from results to make it JSON serializable if needed later
                     if "figure" in vis:
                          del vis["figure"]

        except Exception as viz_gen_err:
            print(f"Error calling generate_visualizations: {viz_gen_err}")
            if "visualizations" not in results: results["visualizations"] = []
            if "structured_data" not in results: results["structured_data"] = []


    # Print analysis only (keep this)
    print("\n--- Anomaly Analysis (from run_anomaly_detection) ---")
    print(results.get("anomaly_analysis", "Analysis not available."))
    print("--- End Anomaly Analysis ---")

    return results

def display_visualizations(results: Dict[str, Any]) -> None:
    """
    Display all generated visualizations (Original function, placed correctly)
    NOTE: This will likely NOT work correctly now because figure objects are removed
          in run_anomaly_detection after saving. This is kept for structure but
          displaying plots directly in a backend script running for an API isn't typical.
    """
    print("\n--- Attempting to Display Visualizations (Note: May not work if figures were closed) ---")
    if "visualizations" in results and results["visualizations"]:
        displayed_count = 0
        for vis in results["visualizations"]:
            # Figure object might have been deleted after saving
            if "figure" in vis and vis["figure"] is not None:
                 try:
                    plt.figure(vis["figure"].number) # This might error if closed
                    plt.title(vis["purpose"])
                    plt.show() # plt.show() blocks execution in scripts, not ideal for backend
                    displayed_count += 1
                 except Exception as display_err:
                     # print(f"Could not display figure: {display_err}") # Error expected if closed
                     pass # Silently ignore if figure was closed
            # If figure object is gone, maybe print the path where it was saved
            elif "image_path" in vis:
                 print(f"Visualization for '{vis.get('purpose', 'N/A')}' saved to: {vis['image_path']}")

        if displayed_count == 0 and any("image_path" in v for v in results["visualizations"]):
             print("Figures were saved to files (see paths above), not displayed interactively.")
        elif displayed_count == 0:
             print("No valid figure objects found to display.")

    else:
        print("No visualizations key found in results.")




# ======================== USAGE EXAMPLE ========================
# Example usage:
# 1. Process documents
# system = process_supply_chain_documents("supply_chain_docs")
#
# 2. Run analysis with visualization
# results = run_anomaly_detection(system, "Are there any delivery delays with supplier XYZ?")
#
# 3. Display visualizations
# display_visualizations(results)


def run_full_analysis_pipeline(system: SupplyChainRAGSystem, query: str, pdf_path: str) -> Dict[str, Any]:
    """
    Orchestrates the full analysis pipeline:
    1. Runs original RAG/Analysis/Graph generation (`run_anomaly_detection`).
    2. Extracts comprehensive structured JSON data (`extract_all_data_as_json`).
    3. Combines results for the frontend.
    """
    print("--- Starting Full Analysis Pipeline ---")
    print(f"\nQuery: '{query}'\nPDF Path: '{pdf_path}'")

    # == Step 1: Run Original Analysis & Visualization ==
    print("\nStep 1: Running original analysis and graph generation...")
    try:
        # Run the original function which includes plotting
        original_results = run_anomaly_detection(system, query, generate_graphs=True)
        print("Step 1: Original analysis and graph generation complete.")
    except Exception as e:
        print(f"ERROR in Step 1 (run_anomaly_detection): {e}")
        # Create a fallback structure
        original_results = {
            "query": query, "anomaly_analysis": f"Error during analysis: {e}",
            "visualizations": [], "structured_data": [], "hyde_document": "N/A", "top_results": [] }

    # == Step 2: Extract Comprehensive Structured JSON Data ==
    print("\nStep 2: Extracting comprehensive structured data as JSON...")
    structured_json_data = {"error": "Extraction not run or failed"} # Default
    try:
        print(f"Processing PDF for JSON extraction: {pdf_path}...")
        extracted_pages_for_json = system.process_pdf(pdf_path) # Gets original_text
        full_text_per_page = [page.get('original_text', '') for page in extracted_pages_for_json]

        if not any(p.strip() for p in full_text_per_page):
             print("ERROR: No original text found after processing PDF for JSON extraction.")
             raise ValueError("PDF processing yielded no text for JSON extraction.")

        # Call the comprehensive extraction METHOD on the system INSTANCE
        structured_json_data = system.extract_all_data_as_json(full_text_per_page)
        print("Step 2: Comprehensive JSON data extraction complete.")

    except Exception as e:
        print(f"ERROR during comprehensive JSON extraction (Step 2): {e}")
        # Ensure structured_json_data retains the error message
        if isinstance(structured_json_data, dict): # Avoid overwriting existing error
             structured_json_data["error_step2"] = f"Failed to extract JSON data: {e}"
        else:
             structured_json_data = {"error": f"Failed to extract JSON data: {e}"}


    # == Step 3: Combine Results ==
    print("\nStep 3: Combining results...")
    # Ensure keys exist even if steps failed
    final_analysis_text = original_results.get("anomaly_analysis", "Analysis missing or failed.")
    final_visualizations = original_results.get("visualizations", [])

    # Get notes/recs from the new JSON data if extraction was successful
    final_anomaly_notes = []
    final_recommendations = []
    if isinstance(structured_json_data, dict) and "error" not in structured_json_data:
         final_anomaly_notes = structured_json_data.get("anomalyNotes", [])
         final_recommendations = structured_json_data.get("recommendations", [])
         # Optionally remove them from the main JSON block if added separately
         # if "anomalyNotes" in structured_json_data: del structured_json_data["anomalyNotes"]
         # if "recommendations" in structured_json_data: del structured_json_data["recommendations"]


    final_output = {
        "analysisText": final_analysis_text,
        "visualizations": final_visualizations, # From original run
        "structuredDataJSON": structured_json_data, # New comprehensive data (or error dict)
        "query": original_results.get("query", query),
        "anomalyNotes": final_anomaly_notes, # Extracted separately now
        "recommendations": final_recommendations # Extracted separately now
    }
    print("--- Full Analysis Pipeline Finished ---")
    return final_output


# --- How you would use it (e.g., in your API endpoint handler) ---
# pdf_file = "path/to/uploaded/file.pdf"
# user_query = "Analyze Q1 performance."

# # Initialize the system (maybe once when the app starts or per request)
# rag_system_instance = SupplyChainRAGSystem()
# # ... index documents if needed ...

# # Call the standalone function
# api_response_data = run_analysis_and_extract_json(rag_system_instance, user_query, pdf_file)

# # Return api_response_data from your Flask/FastAPI/Django endpoint
# # return jsonify(api_response_data) # Example for Flask

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import google.generativeai as genai
import os

os.environ["GOOGLE_API_KEY"] = "AIzaSyAWe5EmcArFK21WosWTnRGef3hPsh8RbkU"
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [23]:
pdf_path = "/content/Sample Supply Chain Document for RAG and Visualization.pdf"
query = "Provide a full analysis and data overview for Q1 2025." # Changed query slightly

# Step 1: Initialize the system
print("Initializing system...")
system = SupplyChainRAGSystem()
print("System initialized.")

# Step 2: Process & Index (using the corrected functions)
print(f"\nProcessing PDF for indexing: {pdf_path}...")
extracted_pages_for_indexing = system.process_pdf(pdf_path)
print("PDF processed for indexing.")

if not extracted_pages_for_indexing:
    print("FATAL ERROR: No text could be extracted from the PDF for indexing. Stopping.")
else:
    print("\nChunking document...")
    chunks = system.chunk_document(extracted_pages_for_indexing)
    print(f"Document chunked into {len(chunks)} chunks.")

    if not chunks:
        print("Warning: No chunks were generated after cleaning. RAG may not work well.")

    print("\nIndexing chunks...")
    try:
        system.index_chunks(chunks)
        print("Chunks indexed successfully.")

        # Step 3: Run the NEW combined pipeline function
        print("\nRunning the full analysis pipeline...")
        api_response_data = run_full_analysis_pipeline(system, query, pdf_path)

        # Step 4: Verification
        print("\n\n--- Verifying structuredDataJSON Content ---")
        if "structuredDataJSON" not in api_response_data:
            print("ERROR: 'structuredDataJSON' key is missing from the results.")
        else:
            json_data = api_response_data["structuredDataJSON"]
            if not isinstance(json_data, dict): print(f"ERROR: 'structuredDataJSON' is not a dictionary, type: {type(json_data)}")
            elif "error" in json_data: print(f"ERROR: JSON Extraction failed: {json_data.get('error_step2') or json_data.get('error')}")
            else:
                print("Found 'structuredDataJSON' dictionary. Checking contents...")
                expected_keys = ["deliveryPerformance", "supplierQuality", "carbonFootprint", "networkConnectivity", "priceFluctuations", "inventoryLevels", "anomalyNotes", "recommendations"]
                all_good = True
                for key in expected_keys:
                    print(f"\nChecking '{key}':")
                    if key not in json_data: print(f"  WARNING: Key '{key}' missing."); all_good = False
                    else:
                        data_list = json_data[key]
                        if not isinstance(data_list, list): print(f"  ERROR: Value for '{key}' not a list."); all_good = False
                        elif not data_list: print(f"  INFO: List for '{key}' is empty.")
                        else:
                            print(f"  SUCCESS: Found {len(data_list)} records.")
                            print(f"  Sample record(s):")
                            try: print(json.dumps(data_list[:2], indent=2))
                            except Exception as print_err: print(f"  Error printing sample: {print_err}"); all_good = False
                print("\n--- Verification Summary ---")
                if all_good: print("Basic verification PASSED. Visually inspect samples.")
                else: print("Verification encountered WARNINGS or ERRORS.")

        # --- Keep Graphviz example if desired ---
        print("\n--- Running Graphviz Example ---")
        def create_recommendation_map(root_issue, recommendations):
            dot = Digraph(comment='Recommendation Map')
            dot.attr(rankdir='LR'); dot.attr('node', shape='box', style='rounded,filled', color='lightblue2')
            dot.node('root', root_issue, shape='ellipse', color='lightcoral')
            for i, rec in enumerate(recommendations): dot.node(f'rec_{i}', rec); dot.edge('root', f'rec_{i}')
            filename = dot.render('recommendation_map', format='png', cleanup=True)
            print(f"Recommendation map saved to {filename}")

        root_issue = "Supply Chain Inefficiencies Q1 2025"
        recommendations_list = [item['text'] for item in api_response_data.get('recommendations', []) if 'text' in item]
        if not recommendations_list: recommendations_list = ["Review ExpressShip", "Improve Quality", "Assess Air Freight", "Enhance Inventory", "Investigate Pricing"]
        create_recommendation_map(root_issue, recommendations_list)

    except Exception as index_err:
        print(f"ERROR during chunk indexing: {index_err}")

Initializing system...
System initialized.

Processing PDF for indexing: /content/Sample Supply Chain Document for RAG and Visualization.pdf...
process_pdf extracted text from 3 pages using 'blocks' mode.
PDF processed for indexing.

Chunking document...
Document chunked into 6 chunks.

Indexing chunks...
Chunks indexed successfully.

Running the full analysis pipeline...
--- Starting Full Analysis Pipeline ---

Query: 'Provide a full analysis and data overview for Q1 2025.'
PDF Path: '/content/Sample Supply Chain Document for RAG and Visualization.pdf'

Step 1: Running original analysis and graph generation...
Error in LLM JSON parsing: Expecting value: line 1 column 1 (char 0)
Using hardcoded supplier quality data as fallback
Generating bar chart for supplier vs quality_score
DataFrame columns: ['supplier', 'product', 'quality_score']
DataFrame shape: (8, 3)
First few rows:
      supplier         product  quality_score
0   TechSupply    Electronic A             92
1   PrimeParts    E

In [36]:
# --- ADD THIS VERIFICATION BLOCK ---
# AFTER this line in your final execution block:
# api_response_data = run_full_analysis_pipeline(system, query, pdf_path)

print("\n\n--- Verifying structuredDataJSON Content ---")

if "structuredDataJSON" not in api_response_data:
    print("ERROR: 'structuredDataJSON' key is missing from the results.")
else:
    json_data = api_response_data["structuredDataJSON"]
    if not isinstance(json_data, dict):
        print(f"ERROR: 'structuredDataJSON' is not a dictionary, but type: {type(json_data)}")
    elif "error" in json_data:
         print(f"ERROR: Extraction failed: {json_data['error']}")
    else:
        print("Found 'structuredDataJSON' dictionary. Checking contents...")
        expected_keys = [
            "deliveryPerformance", "supplierQuality", "carbonFootprint",
            "networkConnectivity", "priceFluctuations", "inventoryLevels",
            "anomalyNotes", "recommendations"
        ]
        all_good = True

        for key in expected_keys:
            print(f"\nChecking '{key}':")
            if key not in json_data:
                print(f"  WARNING: Key '{key}' is missing from structuredDataJSON.")
                all_good = False
            else:
                data_list = json_data[key]
                if not isinstance(data_list, list):
                    print(f"  ERROR: Value for '{key}' is not a list (type: {type(data_list)}).")
                    all_good = False
                elif not data_list: # Check if the list is empty
                    # Notes/Recs might be empty, others likely shouldn't be if extraction worked
                    if key not in ["anomalyNotes", "recommendations"]:
                         print(f"  WARNING: List for '{key}' is empty. No rows parsed?")
                         # Consider if this is expected or an error for specific keys
                         # all_good = False # You might set this if empty lists are unexpected
                    else:
                         print(f"  INFO: List for '{key}' is empty (acceptable for notes/recs).")

                else:
                    # If the list has data, print success and sample
                    print(f"  SUCCESS: Found {len(data_list)} records for '{key}'.")
                    print(f"  Sample record(s) for '{key}':")
                    # Pretty print the first 2 records for inspection
                    try:
                        print(json.dumps(data_list[:2], indent=2))
                        # Optional: Check keys of the first record
                        if data_list[0] and isinstance(data_list[0], dict):
                            print(f"  Record keys: {list(data_list[0].keys())}")
                        else:
                             print("  WARNING: First item is not a dictionary.")
                             all_good=False

                    except Exception as print_err:
                        print(f"  Error printing sample data: {print_err}")
                        all_good = False

        print("\n--- Verification Summary ---")
        if all_good:
            print("Basic verification PASSED. Most sections seem to have non-empty lists of data.")
            print("Please visually inspect the sample records above for correctness.")
        else:
            print("Verification encountered WARNINGS or ERRORS. Review the messages above.")

# --- The rest of your script (Graphviz etc.) can follow ---



--- Verifying structuredDataJSON Content ---
Found 'structuredDataJSON' dictionary. Checking contents...

Checking 'deliveryPerformance':

Checking 'supplierQuality':

Checking 'carbonFootprint':

Checking 'networkConnectivity':

Checking 'priceFluctuations':

Checking 'inventoryLevels':

Checking 'anomalyNotes':
  SUCCESS: Found 5 records for 'anomalyNotes'.
  Sample record(s) for 'anomalyNotes':
[
  {
    "id": 1,
    "text": "1. ExpressShip consistently exceeds expected delivery times by 7-10 days, particularly for Raw Material\nC."
  },
  {
    "id": 2,
    "text": "GlobalComp and FastSupply have quality scores below our acceptable threshold of 85%."
  }
]
  Record keys: ['id', 'text']

Checking 'recommendations':
  SUCCESS: Found 5 records for 'recommendations'.
  Sample record(s) for 'recommendations':
[
  {
    "id": 1,
    "text": "1. Conduct performance review with ExpressShip and consider alternative suppliers for Raw Material C."
  },
  {
    "id": 2,
    "text": "Implemen

In [None]:
from graphviz import Digraph

def create_recommendation_map(root_issue, recommendations):
    """
    Generates a mind map of recommendations stemming from a root issue.

    Args:
    - root_issue (str): The central problem statement.
    - recommendations (list of str): List of recommendations related to the root issue.
    """
    dot = Digraph(comment='Recommendation Map')
    dot.attr(rankdir='LR')  # Layout left to right
    dot.attr('node', shape='box', style='rounded,filled', color='lightblue2')

    # Add root issue node
    dot.node('root', root_issue, shape='ellipse', color='lightcoral')

    # Add recommendation nodes
    for i, rec in enumerate(recommendations):
        node_id = f'rec_{i}'
        dot.node(node_id, rec)
        dot.edge('root', node_id)

    # Render to file
    filename = dot.render('recommendation_map', format='png', cleanup=True)
    print(f"Recommendation map saved to {filename}")

# Example usage
root_issue = "High customer churn"
recommendations = [
    "Improve onboarding experience",
    "Implement loyalty program",
    "Reduce response time of support team",
    "Enhance product tutorial and FAQ",
    "Gather exit feedback from churned users"
]

create_recommendation_map(root_issue, recommendations)
