# ESG Analysis with Retrieval-Augmented Generation (RAG)

This notebook implements a RAG system for enhanced ESG analysis using:
- WatsonxLLM for text generation
- IBM Granite embeddings for document encoding  
- Chroma vector database for similarity search
- ESG documents from corporate 10-K filings and sustainability reports

The goal is to overcome the token limitations discovered in the LLM.ipynb notebook and provide more comprehensive ESG scoring capabilities.

In [None]:
# Import required libraries
import os
import chromadb
import pandas as pd
import json
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
from typing import Dict, Any, Literal, List
from copy import deepcopy

# LangChain components
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.schema import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.prompts import PromptTemplate

# LangGraph for building the RAG pipeline
from langgraph.graph import StateGraph, START
from langgraph.types import State
from IPython.display import Image, display
from typing_extensions import TypedDict

# Watsonx.ai integration
from langchain_ibm import WatsonxEmbeddings
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
from ibm_watsonx_ai.foundation_models import Model

# Evaluation dependencies 
from pydantic import BaseModel, Field
import instructor
import litellm
from litellm import completion
from instructor import Mode

# Load environment variables
load_dotenv()

print("✓ All libraries imported successfully")

In [None]:
# Watsonx.ai Configuration
# The API key and project id are read from environment variables (load_dotenv()
# was called in the import cell); the service URL is fixed to the us-south endpoint.
WX_API_KEY = os.getenv('WX_API_KEY')
WX_PROJECT_ID = os.getenv('WX_PROJECT_ID') 
WX_API_URL = "https://us-south.ml.cloud.ibm.com"

# Fail fast: both values are passed to the embeddings client constructed below.
if not WX_API_KEY or not WX_PROJECT_ID:
    raise ValueError("Missing required environment variables: WX_API_KEY and/or WX_PROJECT_ID")

print("✓ Watsonx.ai credentials configured")

# Initialize Watsonx Embeddings with Granite model
# This object is later handed to Chroma as its embedding function.
watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url=WX_API_URL,
    apikey=WX_API_KEY,
    project_id=WX_PROJECT_ID,
)

print("✓ Watsonx embeddings initialized")

In [None]:
# Configure Watsonx LLM with optimal parameters for ESG analysis
# NOTE(review): DECODING_METHOD is GREEDY while TEMPERATURE and TOP_P are also
# set; those two are sampling controls and presumably have no effect under
# greedy decoding — confirm against the watsonx.ai parameter documentation.
generation_parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
    GenParams.MAX_NEW_TOKENS: 2048,  # Increased for detailed ESG analysis
    GenParams.TEMPERATURE: 0.1,     # Low temperature for factual consistency
    GenParams.TOP_P: 0.9,
    GenParams.STOP_SEQUENCES: ["<|endoftext|>", "\n\n\n"]
}

# Initialize the Watsonx LLM 
# The module-level `model` object is what llm_invoke() calls; it is built with
# the credentials and project id configured in the previous cell.
model = Model(
    model_id=ModelTypes.GRANITE_13B_INSTRUCT_V2,
    params=generation_parameters,
    credentials={
        "apikey": WX_API_KEY,
        "url": WX_API_URL
    },
    project_id=WX_PROJECT_ID
)

def llm_invoke(prompt):
    """Pass ``prompt`` to the module-level Watsonx ``model`` and return what ``generate_text`` yields."""
    return model.generate_text(prompt)

print("✓ Watsonx LLM configured for ESG analysis")

In [None]:
# Load ESG documents from the data directory
def load_esg_documents(data_dir="data"):
    """Load the per-company 10-K items and ESG reports as LangChain documents.

    For every known company, each expected markdown file is looked up first
    under its company-prefixed name (e.g. ``apple_10k_item1.md``) and then
    under its plain name (e.g. ``10k_item1.md``). Missing company directories,
    missing files and empty files are skipped.

    Args:
        data_dir: Directory that contains one sub-directory per company.
            Defaults to ``"data"`` (relative to the working directory).

    Returns:
        A list of ``Document`` objects whose metadata carries ``company``,
        ``document_type`` (the file name without ``.md``) and ``source``.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    data_path = Path(data_dir)

    if not data_path.exists():
        raise FileNotFoundError(
            f"Data directory '{data_path}' not found. Please ensure ESG documents are placed there."
        )

    companies = ["apple", "boeing", "disney", "elililly", "fedex", "google",
                 "johnsonandjohnson", "jpmorganchase", "mcdonald", "meta",
                 "microsoft", "netflix", "nike", "nvidia", "tesla"]
    item_files = ["10k_item1.md", "10k_item1A.md", "10k_item7.md", "10k_item7A.md", "ESG_Report.md"]

    documents = []
    # Track which companies actually contributed a document so the summary
    # reports real coverage rather than the size of the hard-coded list.
    companies_loaded = set()

    for company in companies:
        company_path = data_path / company
        if not company_path.exists():
            continue

        for item_file in item_files:
            # Prefer the company-prefixed file name, then the plain one.
            file_path = company_path / item_file.replace("10k_", f"{company}_10k_")
            if not file_path.exists():
                file_path = company_path / item_file
            if not file_path.exists():
                continue

            # Only the read itself is best-effort; an unreadable file must not
            # abort the whole load.
            try:
                content = file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeDecodeError) as e:
                print(f"Warning: Could not load {file_path}: {e}")
                continue

            if not content.strip():  # Only add non-empty documents
                continue

            documents.append(
                Document(
                    page_content=content,
                    metadata={
                        "company": company,
                        "document_type": item_file.replace(".md", ""),
                        "source": str(file_path),
                    },
                )
            )
            companies_loaded.add(company)

    print(f"✓ Loaded {len(documents)} ESG documents from {len(companies_loaded)} companies")

    missing = [company for company in companies if company not in companies_loaded]
    if missing:
        print(f"Warning: No documents found for: {', '.join(missing)}")

    return documents

# Load the documents
esg_documents = load_esg_documents()

In [None]:
# Document preprocessing and chunking
def preprocess_text(text: str) -> str:
    """Return a normalized, single-line version of ``text`` for embedding.

    Steps, in order: NFKC Unicode normalization; removal of ``<...>`` tags;
    replacement of every character that is not a word character, whitespace
    or one of ``. , ; : ! ? - ( )`` with a space (so symbols such as ``%``,
    ``$`` and ``&`` are dropped); collapsing of whitespace runs to a single
    space; trimming of leading/trailing whitespace.
    """
    import re
    import unicodedata

    # (pattern, replacement) pairs applied sequentially; order matters because
    # the whitespace collapse must see the spaces introduced by the step before.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'[^\w\s\.\,\;\:\!\?\-\(\)]', ' '),
        (r'\s+', ' '),
    )

    cleaned = unicodedata.normalize('NFKC', text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def chunk_documents_with_headers(documents: List[Document]) -> List[Document]:
    """
    Break documents into embedding-sized chunks in two passes.

    Pass 1 splits each document on markdown headers (levels 1-3) and copies
    the source document's metadata onto every section; a document whose
    header split raises is kept whole. Pass 2 cuts the sections into pieces
    of at most 500 characters with a 100-character overlap. Every piece is
    then cleaned with ``preprocess_text`` and pieces left empty are dropped.
    """
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )

    sections = []
    for source_doc in documents:
        try:
            for section in header_splitter.split_text(source_doc.page_content):
                # Carry company / document_type / source onto the section
                section.metadata.update(source_doc.metadata)
                sections.append(section)
        except Exception as e:
            # Keep the unsplit document rather than losing its content
            print(f"Markdown splitting failed for {source_doc.metadata.get('source', 'unknown')}: {e}")
            sections.append(source_doc)

    # Size is measured in characters (length_function=len)
    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", ";", ":", " ", ""],
        length_function=len,
    )

    safe_chunks = []
    for piece in size_splitter.split_documents(sections):
        piece.page_content = preprocess_text(piece.page_content)
        if piece.page_content.strip():
            safe_chunks.append(piece)

    print(f"✓ Created {len(safe_chunks)} chunks from {len(documents)} documents")
    return safe_chunks

def update_documents_with_headers(chunks: List[Document]) -> List[Document]:
    """
    Return copies of ``chunks`` whose text starts with a context line.

    The line has the form ``[Company: x | Document: y | Section: h1/h2/h3]: ``
    followed by a newline, and is built from the chunk's ``company``,
    ``document_type`` and ``Header 1``..``Header 3`` metadata. Parts whose
    metadata is missing or empty are left out; a chunk with no usable
    metadata keeps its text unchanged. The input chunks are not modified.
    """
    enriched = []

    for original in chunks:
        annotated = deepcopy(original)
        meta = annotated.metadata

        # Header levels in order, skipping levels the chunk does not have
        header_keys = [f'Header {level}' for level in range(1, 4)]
        section_titles = [meta[key] for key in header_keys if key in meta]

        company_name = meta.get('company', '')
        document_kind = meta.get('document_type', '')

        labels = []
        if company_name:
            labels.append(f"Company: {company_name}")
        if document_kind:
            labels.append(f"Document: {document_kind}")
        if section_titles:
            labels.append(f"Section: {'/'.join(section_titles)}")

        if labels:
            context_line = f"[{' | '.join(labels)}]: "
            annotated.page_content = context_line + "\n" + annotated.page_content

        enriched.append(annotated)

    return enriched

# Process the documents
print("Processing ESG documents...")
processed_chunks = chunk_documents_with_headers(esg_documents)
final_docs = update_documents_with_headers(processed_chunks)

In [None]:
# Create enhanced ESG vector database
print("Creating enhanced ESG vector database...")

# Initialize Chroma vector database
# Every chunk is embedded with the Granite embeddings client and stored in a
# collection persisted under ./enhanced_comprehensive_esg_db.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same chunks again — confirm, and clear the directory
# between runs if duplicates show up in retrieval.
esg_vector_db = Chroma.from_documents(
    collection_name="enhanced_esg_collection",
    embedding=watsonx_embedding,
    persist_directory="enhanced_comprehensive_esg_db",  # Enhanced version
    documents=final_docs,
)

print(f"✓ Created vector database with {len(final_docs)} document chunks")

# Configure retriever with optimal settings for ESG analysis
# `score_threshold` is only applied by the "similarity_score_threshold" search
# type; with plain "similarity" it is not a supported search option, so the
# threshold must be paired with this search type to filter anything.
# NOTE(review): relevance scores depend on the collection's distance metric —
# verify that 0.3 is a sensible cut-off for these embeddings.
esg_retriever = esg_vector_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 8,  # Retrieve more context for comprehensive analysis
        "score_threshold": 0.3  # Filter out low-similarity results
    }
)

print("✓ ESG retriever configured")

In [None]:
# Test retrieval with ESG-specific queries
# Smoke test: send sample questions through the retriever and print a short
# preview of the top hits so that relevance can be checked by eye.
test_queries = [
    "What are Apple's environmental sustainability initiatives?",
    "How does Boeing address carbon emissions and climate change?",
    "What diversity and inclusion programs does Microsoft have?",
    "What are Tesla's governance practices and board composition?",
    "How does Nike approach sustainable manufacturing?"
]

print("Testing retrieval system with ESG queries:")
print("=" * 60)

for query in test_queries[:2]:  # Test first 2 queries
    print(f"\nQuery: {query}")
    print("-" * 40)
    
    retrieved_docs = esg_retriever.invoke(query)
    
    for i, doc in enumerate(retrieved_docs[:3]):  # Show top 3 results
        company = doc.metadata.get('company', 'Unknown')
        doc_type = doc.metadata.get('document_type', 'Unknown')
        # First 200 characters on a single line (newlines replaced by spaces)
        preview = doc.page_content[:200].replace('\n', ' ')
        
        print(f"{i+1}. Company: {company} | Type: {doc_type}")
        print(f"   Preview: {preview}...")
        print()

In [None]:
# ESG-specific prompt templates
# Scoring template: placeholders are {company}, {category} and {context}.
# The "Score: ..." line of the requested response format is what
# generate_esg_response later searches for with a regular expression.
esg_scoring_template = """You are an ESG (Environmental, Social, Governance) analysis expert. Based on the provided corporate documents, analyze the company's ESG performance and provide a numerical score.

**Scoring Guidelines:**
- Environmental (E): Climate action, resource efficiency, pollution prevention, biodiversity (0-100)
- Social (S): Employee welfare, community impact, diversity & inclusion, product safety (0-100)  
- Governance (G): Board structure, executive compensation, business ethics, transparency (0-100)

**Instructions:**
1. Only use information explicitly stated in the provided context
2. If information is insufficient for a category, assign a score of 50 (neutral)
3. Provide specific evidence from the documents to justify each score
4. Be objective and avoid speculation beyond the available evidence

**Company:** {company}
**ESG Category:** {category}

**Context Documents:**
{context}

**Analysis Required:**
Provide a {category} score (0-100) for {company} with detailed justification based on the evidence above.

**Response Format:**
Score: [numerical score 0-100]
Justification: [detailed explanation with specific evidence from documents]
"""

# General Q&A template: placeholders are {question} and {context}.
general_esg_template = """You are an ESG analysis expert. Use the provided corporate documents to answer questions about environmental, social, and governance practices.

**Guidelines:**
- Only use information explicitly stated in the provided context
- If you cannot find relevant information, state "Information not available in provided documents"
- Cite specific evidence when possible
- Be factual and objective

**Question:** {question}

**Context Documents:**
{context}

**Answer:**"""

# Create prompt templates
# Both module-level prompt objects are used by generate_esg_response.
esg_scoring_prompt = PromptTemplate.from_template(esg_scoring_template)
general_esg_prompt = PromptTemplate.from_template(general_esg_template)

print("✓ ESG prompt templates created")

In [None]:
# Define the RAG state for ESG analysis
class ESGRAGState(TypedDict):
    """State for ESG RAG application.

    ``company``, ``category`` and ``question`` are supplied by the caller when
    the graph is invoked; ``context`` is produced by ``retrieve_esg_documents``
    and ``answer`` / ``score`` by ``generate_esg_response``.
    """
    company: str  # may be an empty string (interactive Q&A passes "")
    category: str  # 'Environmental', 'Social', 'Governance', or 'General'
    question: str
    context: List[Document]  # retrieved chunks used to build the prompt
    answer: str  # raw LLM response text
    # NOTE: generate_esg_response returns None here when no score was parsed
    score: int  # ESG score 0-100

# Define RAG pipeline functions
def retrieve_esg_documents(state: ESGRAGState) -> Dict[str, Any]:
    """Fetch context chunks for the company / category / question in ``state``.

    The search query is "<company> <category> <question>" for the three ESG
    categories and "<company> <question>" otherwise. When a company is given,
    hits are narrowed to chunks whose ``company`` metadata matches it
    case-insensitively; if that leaves nothing, all hits are kept.

    Returns:
        ``{"context": [...]}`` with the selected documents.
    """
    company = state.get("company", "")
    category = state.get("category", "")
    question = state.get("question", "")

    is_esg_category = category in ("Environmental", "Social", "Governance")
    search_query = (
        f"{company} {category} {question}" if is_esg_category else f"{company} {question}"
    )

    hits = esg_retriever.invoke(search_query)

    # No company given: nothing to narrow down
    if not company:
        return {"context": hits}

    same_company = [
        hit for hit in hits
        if hit.metadata.get('company', '').lower() == company.lower()
    ]

    # Fall back to every hit when none belongs to the requested company
    return {"context": same_company or hits}

def generate_esg_response(state: ESGRAGState) -> Dict[str, Any]:
    """Generate an ESG score or a free-form answer from the retrieved context.

    Scoring mode is used when ``category`` is one of Environmental / Social /
    Governance *and* a company is given; the scoring prompt is sent and the
    numeric score is parsed from the response. Otherwise the general Q&A
    prompt is used and no score is extracted.

    Args:
        state: Graph state; reads ``category``, ``company``, ``question`` and
            ``context``.

    Returns:
        ``{"answer": <LLM response>, "score": <int 0-100 or None>}``. ``score``
        is ``None`` in Q&A mode and when no "Score: <number>" could be found,
        so callers can tell a parsed score from a missing one.
    """
    import re

    category = state.get("category", "General")
    company = state.get("company", "")
    question = state.get("question", "")
    context_docs = state.get("context", [])

    # Number the context documents from 1 and label each with its type
    context_text = "\n\n".join(
        f"Document {i} ({doc.metadata.get('document_type', 'Unknown')}):\n{doc.page_content}"
        for i, doc in enumerate(context_docs, start=1)
    )

    # One flag decides both which prompt is sent and whether a score is
    # parsed, so a score is never extracted from a general Q&A answer.
    scoring_mode = category in ("Environmental", "Social", "Governance") and bool(company)

    if scoring_mode:
        formatted_prompt = esg_scoring_prompt.invoke({
            "company": company,
            "category": category,
            "context": context_text
        })
    else:
        formatted_prompt = general_esg_prompt.invoke({
            "question": question,
            "context": context_text
        })

    response = llm_invoke(formatted_prompt.text)

    score = None
    if scoring_mode and isinstance(response, str):
        # \W* tolerates formatting the model may put between the label and
        # the number, e.g. "Score: [85]" or "**Score:** 85".
        score_match = re.search(r'Score:\W*(\d+)', response)
        if score_match:
            score = max(0, min(100, int(score_match.group(1))))  # Clamp to 0-100

    return {"answer": response, "score": score}

# Build the RAG graph
# Two-step pipeline: retrieval followed by generation. The entry edge below
# refers to the first node by the function's name, "retrieve_esg_documents".
esg_rag_builder = StateGraph(ESGRAGState)
esg_rag_builder.add_sequence([retrieve_esg_documents, generate_esg_response])
esg_rag_builder.add_edge(START, "retrieve_esg_documents")
# The compiled graph is what every scoring / Q&A helper below invokes.
esg_rag_graph = esg_rag_builder.compile()

print("✓ ESG RAG pipeline built successfully")

In [None]:
# Test the ESG RAG system
def test_esg_scoring(company: str, category: str):
    """Run the RAG graph once for ``company`` / ``category`` and return the resulting state."""
    request = {
        "company": company,
        "category": category,
        "question": f"Analyze {company}'s {category.lower()} performance for ESG scoring",
    }
    return esg_rag_graph.invoke(request)

# Test with sample companies and categories
test_cases = [
    ("Apple", "Environmental"),
    ("Tesla", "Environmental"), 
    ("Microsoft", "Social"),
    ("Boeing", "Governance")
]

print("Testing ESG RAG System:")
print("=" * 60)

# Collected results of the successful test runs (one dict per case)
sample_results = []
for company, category in test_cases[:2]:  # Test first 2 cases
    print(f"\n🏢 Company: {company}")
    print(f"📊 Category: {category}")
    print("-" * 40)
    
    try:
        result = test_esg_scoring(company, category)
        # NOTE: the 'N/A' default only applies when the key is absent; a
        # score key holding None is printed as "None/100".
        score = result.get('score', 'N/A')
        answer = result.get('answer', 'No response')
        
        print(f"Score: {score}/100")
        print(f"Analysis: {answer[:300]}...")
        
        sample_results.append({
            'company': company,
            'category': category,
            'score': score,
            'analysis': answer
        })
        
    except Exception as e:
        # Best-effort demo loop: report the failure and move to the next case
        print(f"Error: {e}")
    
    print("\n" + "="*60)

In [None]:
# Comprehensive ESG scoring for all companies
def comprehensive_esg_scoring():
    """Generate ESG scores for all companies using RAG.

    Invokes the RAG graph once per company and category. A category whose
    call fails, or whose response yields no parsable score, is recorded with
    the neutral score 50 so that the overall average can always be computed.

    Returns:
        A list with one dict per company containing ``company``,
        ``<category>_score``, ``<category>_analysis`` (first 500 characters
        of the answer, or the error text) and ``overall_esg_score`` (mean of
        the three category scores, rounded to 2 decimals).
    """
    companies = ["apple", "boeing", "disney", "elililly", "fedex", "google",
                 "johnsonandjohnson", "jpmorganchase", "mcdonald", "meta",
                 "microsoft", "netflix", "nike", "nvidia", "tesla"]

    categories = ["Environmental", "Social", "Governance"]
    neutral_score = 50

    results = []

    print("Generating comprehensive ESG scores using RAG...")
    print("=" * 60)

    for company in tqdm(companies, desc="Processing companies"):
        company_scores = {"company": company}

        for category in categories:
            key = category.lower()
            try:
                result = esg_rag_graph.invoke({
                    "company": company,
                    "category": category,
                    "question": f"Analyze {company}'s {category.lower()} performance for ESG scoring"
                })
            except Exception as e:
                # Boundary around the remote call: log and fall back to neutral
                print(f"Error processing {company} - {category}: {e}")
                company_scores[f"{key}_score"] = neutral_score
                company_scores[f"{key}_analysis"] = f"Error: {str(e)}"
                continue

            # The graph returns the 'score' key even when nothing was parsed
            # (value None), so a .get() default alone would not apply.
            score = result.get('score')
            if score is None:
                score = neutral_score
            company_scores[f"{key}_score"] = score

            # Store the analysis text (truncated)
            company_scores[f"{key}_analysis"] = (result.get('answer') or '')[:500]

        # Calculate overall ESG score (average)
        category_scores = [company_scores[f"{category.lower()}_score"] for category in categories]
        company_scores['overall_esg_score'] = round(sum(category_scores) / len(category_scores), 2)

        results.append(company_scores)

    return results

# Create a sample scoring function for testing (limited companies)
def sample_esg_scoring():
    """Generate ESG scores for a fixed sample of five companies.

    Invokes the RAG graph once per company and category and prints progress.
    A category whose call fails, or whose response yields no parsable score,
    is recorded with the neutral score 50 so the overall average can always
    be computed.

    Returns:
        A list with one dict per company containing ``company``,
        ``<category>_score`` and ``overall_esg_score`` (mean of the three
        category scores, rounded to 2 decimals).
    """
    sample_companies = ["apple", "tesla", "microsoft", "boeing", "nike"]
    categories = ["Environmental", "Social", "Governance"]
    neutral_score = 50

    results = []

    print("Generating sample ESG scores using RAG...")
    print("=" * 50)

    for company in sample_companies:
        print(f"\n📊 Processing: {company.capitalize()}")
        company_scores = {"company": company}

        for category in categories:
            key = category.lower()
            try:
                result = esg_rag_graph.invoke({
                    "company": company,
                    "category": category,
                    "question": f"Analyze {company}'s {category.lower()} performance for ESG scoring"
                })
            except Exception as e:
                # Boundary around the remote call: report and fall back to neutral
                print(f"   {category}: Error - {e}")
                company_scores[f"{key}_score"] = neutral_score
                continue

            # The graph returns the 'score' key even when nothing was parsed
            # (value None), so a .get() default alone would not apply.
            score = result.get('score')
            if score is None:
                score = neutral_score
            company_scores[f"{key}_score"] = score
            print(f"   {category}: {score}/100")

        # Calculate overall score
        category_scores = [company_scores[f"{category.lower()}_score"] for category in categories]
        overall_score = round(sum(category_scores) / len(category_scores), 2)
        company_scores['overall_esg_score'] = overall_score
        print(f"   Overall ESG: {overall_score}/100")

        results.append(company_scores)

    return results

print("✓ ESG scoring functions created")

In [None]:
# Run sample ESG scoring to test the system
print("Running sample ESG scoring with RAG system...")

# Execute sample scoring
rag_sample_results = sample_esg_scoring()

# Convert to DataFrame for analysis
# `rag_df` is read as a module-level variable by the comparison and
# visualization functions defined in later cells.
rag_df = pd.DataFrame(rag_sample_results)

print("\n" + "="*60)
print("RAG ESG SCORING RESULTS")
print("="*60)
print(rag_df[['company', 'environmental_score', 'social_score', 'governance_score', 'overall_esg_score']])

# Save results
# Written to the current working directory, without the DataFrame index.
rag_df.to_csv('esg_scores_rag_sample.csv', index=False)
print(f"\n✓ Results saved to 'esg_scores_rag_sample.csv'")

In [None]:
# Compare RAG results with previous methods
def compare_with_previous_methods():
    """Put the RAG overall scores next to the BOW, BERT and LLM results.

    Reads the module-level ``rag_df`` and, when present in the working
    directory, the three earlier result CSVs. Only companies that also appear
    in the RAG results are taken from the earlier files. Prints a pivot table
    (company x method) and per-method mean / std / count.

    Returns:
        ``(comparison_df, pivot_df)``: the long-format rows and the pivot
        table, or ``(None, None)`` when there is nothing to compare.
    """
    previous_files = {
        'BOW': 'esg_scores_bow_analysis.csv',
        'BERT': 'esg_scores_bert_analysis.csv',
        'LLM': 'esg_scores_llm_analysis.csv',
    }

    rows = []

    # Current (RAG) scores keyed by company
    rag_scores = rag_df.set_index('company')['overall_esg_score'].to_dict()

    print("Comparison with Previous Methods:")
    print("="*50)

    for method, filename in previous_files.items():
        try:
            if not os.path.exists(filename):
                print(f"⚠ {filename} not found")
                continue

            previous = pd.read_csv(filename)
            if 'overall_esg_score' not in previous.columns:
                print(f"⚠ {filename} missing 'overall_esg_score' column")
                continue

            method_scores = previous.set_index('company')['overall_esg_score'].to_dict()

            # Keep only companies that the RAG run also scored
            for company in rag_scores:
                if company in method_scores:
                    rows.append({
                        'company': company,
                        'method': method,
                        'score': method_scores[company],
                    })

            print(f"✓ Loaded {method} results: {len(method_scores)} companies")

        except Exception as e:
            print(f"✗ Error loading {method}: {e}")

    # RAG rows go last
    for company, score in rag_scores.items():
        rows.append({'company': company, 'method': 'RAG', 'score': score})

    if not rows:
        print("No previous results found for comparison")
        return None, None

    comparison_df = pd.DataFrame(rows)

    # One row per company, one column per method
    pivot_df = comparison_df.pivot(index='company', columns='method', values='score')

    print("\nMethod Comparison (Available Companies):")
    print("-"*50)
    print(pivot_df.round(2))

    method_stats = comparison_df.groupby('method')['score'].agg(['mean', 'std', 'count']).round(2)
    print("\nMethod Statistics:")
    print("-"*30)
    print(method_stats)

    return comparison_df, pivot_df

# Perform comparison
comparison_df, pivot_df = compare_with_previous_methods()

In [None]:
# Visualize RAG performance and comparison
def create_rag_visualizations():
    """Create visualizations for RAG ESG analysis.

    Draws a 2x2 figure from the module-level ``rag_df`` (category averages,
    company ranking, score histogram) and, when the module-level
    ``comparison_df`` / ``pivot_df`` are not None, a box plot of scores per
    method. Afterwards prints summary statistics and the top and bottom
    performers. Returns nothing.
    """
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('RAG ESG Analysis Results', fontsize=16, fontweight='bold')
    
    # 1. RAG ESG Scores by Category
    categories = ['environmental_score', 'social_score', 'governance_score']
    category_means = [rag_df[cat].mean() for cat in categories]
    category_labels = ['Environmental', 'Social', 'Governance']
    
    axes[0,0].bar(category_labels, category_means, color=['green', 'blue', 'orange'], alpha=0.7)
    axes[0,0].set_title('RAG: Average ESG Scores by Category')
    axes[0,0].set_ylabel('Score (0-100)')
    axes[0,0].set_ylim(0, 100)
    
    # Add value labels on bars
    for i, v in enumerate(category_means):
        axes[0,0].text(i, v + 1, f'{v:.1f}', ha='center', va='bottom')
    
    # 2. Company Rankings (RAG Overall Scores)
    # Sorted ascending, so the highest score ends up at the top of the horizontal bar chart
    company_scores = rag_df.set_index('company')['overall_esg_score'].sort_values(ascending=True)
    axes[0,1].barh(range(len(company_scores)), company_scores.values, color='skyblue', alpha=0.7)
    axes[0,1].set_yticks(range(len(company_scores)))
    axes[0,1].set_yticklabels([c.capitalize() for c in company_scores.index])
    axes[0,1].set_title('RAG: Company ESG Rankings')
    axes[0,1].set_xlabel('Overall ESG Score')
    
    # 3. Score Distribution
    axes[1,0].hist(rag_df['overall_esg_score'], bins=10, color='lightcoral', alpha=0.7, edgecolor='black')
    axes[1,0].set_title('RAG: ESG Score Distribution')
    axes[1,0].set_xlabel('Overall ESG Score')
    axes[1,0].set_ylabel('Number of Companies')
    
    # 4. Method Comparison (if available)
    if comparison_df is not None and pivot_df is not None:
        # Box plot of scores by method
        methods = comparison_df['method'].unique()
        method_data = [comparison_df[comparison_df['method'] == method]['score'].values for method in methods]
        
        # NOTE(review): newer Matplotlib releases reportedly rename `labels`
        # to `tick_labels` for boxplot — confirm against the installed version.
        bp = axes[1,1].boxplot(method_data, labels=methods, patch_artist=True)
        colors = ['lightblue', 'lightgreen', 'lightyellow', 'lightcoral']
        # One fill colour per box; the palette is cut to the number of boxes
        for patch, color in zip(bp['boxes'], colors[:len(bp['boxes'])]):
            patch.set_facecolor(color)
        
        axes[1,1].set_title('ESG Score Comparison: All Methods')
        axes[1,1].set_ylabel('ESG Score')
        axes[1,1].grid(True, alpha=0.3)
    else:
        # Placeholder text centred in the otherwise empty panel
        axes[1,1].text(0.5, 0.5, 'No comparison data\navailable', 
                      ha='center', va='center', transform=axes[1,1].transAxes,
                      fontsize=12, style='italic')
        axes[1,1].set_title('Method Comparison')
    
    plt.tight_layout()
    plt.show()
    
    # Summary statistics
    print("\n" + "="*60)
    print("RAG ESG ANALYSIS SUMMARY")
    print("="*60)
    print(f"Companies analyzed: {len(rag_df)}")
    print(f"Average Environmental score: {rag_df['environmental_score'].mean():.2f}")
    print(f"Average Social score: {rag_df['social_score'].mean():.2f}")
    print(f"Average Governance score: {rag_df['governance_score'].mean():.2f}")
    print(f"Average Overall ESG score: {rag_df['overall_esg_score'].mean():.2f}")
    print(f"Score standard deviation: {rag_df['overall_esg_score'].std():.2f}")
    
    # Top and bottom performers
    # idxmax / idxmin give the row label of the highest / lowest overall score
    top_performer = rag_df.loc[rag_df['overall_esg_score'].idxmax()]
    bottom_performer = rag_df.loc[rag_df['overall_esg_score'].idxmin()]
    
    print(f"\nTop ESG performer: {top_performer['company'].capitalize()} ({top_performer['overall_esg_score']:.2f})")
    print(f"Bottom ESG performer: {bottom_performer['company'].capitalize()} ({bottom_performer['overall_esg_score']:.2f})")

# Create visualizations
create_rag_visualizations()

In [None]:
# RAG Evaluation and Token Efficiency Analysis
def analyze_rag_efficiency(k: int = 8):
    """Print and return size statistics for the RAG corpus.

    Uses the module-level ``esg_documents`` (original documents) and
    ``final_docs`` (chunks) and estimates how much text a single query sends
    to the LLM compared with the whole corpus. Token counts are rough
    estimates based on 4 characters per token. Empty inputs yield zeros
    instead of a division error.

    Args:
        k: Number of chunks retrieved per query; defaults to 8, the value the
            retriever is configured with.

    Returns:
        A dict with ``total_docs``, ``total_chunks``, ``avg_chunk_size``
        (characters), ``total_content_length`` (characters) and
        ``token_reduction`` (percent).
    """
    chars_per_token = 4

    print("RAG SYSTEM EFFICIENCY ANALYSIS")
    print("="*50)

    # Calculate document processing statistics
    total_docs = len(esg_documents)
    total_chunks = len(final_docs)
    total_chunk_chars = sum(len(doc.page_content) for doc in final_docs)
    avg_chunk_size = total_chunk_chars / total_chunks if total_chunks else 0.0
    chunks_per_doc = total_chunks / total_docs if total_docs else 0.0

    print("Document Processing:")
    print(f"  • Original documents: {total_docs}")
    print(f"  • Generated chunks: {total_chunks}")
    print(f"  • Average chunk size: {avg_chunk_size:.0f} characters")
    print(f"  • Chunks per document: {chunks_per_doc:.1f}")

    # Estimate token efficiency: text sent per query vs. the whole corpus
    total_content_length = sum(len(doc.page_content) for doc in esg_documents)
    retrieved_chars = k * avg_chunk_size
    if total_content_length:
        token_reduction = (1 - retrieved_chars / total_content_length) * 100
    else:
        token_reduction = 0.0

    print("\nToken Efficiency:")
    print(f"  • Total content: {total_content_length:,} characters")
    print(f"  • Estimated tokens: {total_content_length/chars_per_token:,.0f} (assuming 4 chars/token)")
    print(f"  • RAG retrieval (k={k}): ~{retrieved_chars/chars_per_token:.0f} tokens per query")
    print(f"  • Token reduction: {token_reduction:.1f}%")

    # Retrieval quality assessment
    print("\nRetrieval Quality:")
    print("  • Retrieval strategy: Similarity search")
    print("  • Embedding model: IBM Granite-278m-multilingual")
    print("  • Vector database: Chroma")
    print(f"  • Search parameters: k={k}, threshold=0.3")

    return {
        'total_docs': total_docs,
        'total_chunks': total_chunks,
        'avg_chunk_size': avg_chunk_size,
        'total_content_length': total_content_length,
        'token_reduction': token_reduction
    }

# Advantages of RAG over Direct LLM
def rag_advantages_analysis():
    """Print a side-by-side list of Direct-LLM vs RAG trade-offs and return it.

    Returns:
        A dict mapping each comparison topic to a dict with the keys
        ``"Direct LLM"`` and ``"RAG"``.
    """
    # (topic, direct-LLM statement, RAG statement), in display order
    rows = (
        ("Token Limitations",
         "Limited to 8192 tokens - cannot process full documents",
         "Processes unlimited documents through intelligent retrieval"),
        ("Content Relevance",
         "Processes entire document even if only small portion relevant",
         "Retrieves only most relevant sections for each query"),
        ("Scalability",
         "Performance degrades with longer context",
         "Maintains performance regardless of corpus size"),
        ("Information Quality",
         "May miss relevant info due to token truncation",
         "Finds relevant info across entire document corpus"),
        ("Cost Efficiency",
         "High token usage for repetitive content processing",
         "Lower token usage through targeted retrieval"),
    )

    advantages = {
        topic: {"Direct LLM": direct_llm, "RAG": rag}
        for topic, direct_llm, rag in rows
    }

    print("\nRAG vs DIRECT LLM COMPARISON")
    print("="*50)

    for topic, direct_llm, rag in rows:
        print(f"\n{topic}:")
        print(f"  🔴 Direct LLM: {direct_llm}")
        print(f"  🟢 RAG: {rag}")

    return advantages

# Run efficiency analysis
efficiency_stats = analyze_rag_efficiency()
advantages = rag_advantages_analysis()

In [None]:
# Comprehensive ESG Analysis (Optional - Full Dataset)
def run_comprehensive_analysis():
    """
    Ask for confirmation, then score every company with the RAG pipeline.

    On "y" (case-insensitive) this runs ``comprehensive_esg_scoring``, writes
    the results to 'esg_scores_rag_comprehensive.csv', prints a summary table
    and basic statistics, and returns the results as a DataFrame. Any other
    answer skips the run and returns None.
    """
    print("COMPREHENSIVE ESG ANALYSIS")
    print("="*50)
    print("This will analyze all 15 companies across all ESG categories")
    print("Estimated time: 10-15 minutes")
    print("Estimated API calls: ~45 (15 companies × 3 categories)")

    response = input("\nProceed with comprehensive analysis? (y/n): ")

    # Anything other than an explicit "y" cancels the run
    if response.lower() != 'y':
        print("Comprehensive analysis skipped.")
        return None

    print("\nStarting comprehensive ESG analysis...")
    comprehensive_df = pd.DataFrame(comprehensive_esg_scoring())

    # Persist before printing so the results survive a display error
    comprehensive_df.to_csv('esg_scores_rag_comprehensive.csv', index=False)

    print("\n✓ Comprehensive analysis complete!")
    print("✓ Results saved to 'esg_scores_rag_comprehensive.csv'")

    summary_columns = ['company', 'environmental_score', 'social_score',
                       'governance_score', 'overall_esg_score']
    print("\nCOMPREHENSIVE RESULTS SUMMARY:")
    print("-"*40)
    print(comprehensive_df[summary_columns].round(2))

    overall = comprehensive_df['overall_esg_score']
    statistics = (
        ("Mean ESG Score", overall.mean()),
        ("Std Deviation", overall.std()),
        ("Min Score", overall.min()),
        ("Max Score", overall.max()),
    )
    print("\nStatistics:")
    for label, value in statistics:
        print(f"{label}: {value:.2f}")

    return comprehensive_df

# Interactive Q&A with RAG system
def interactive_esg_qa():
    """Interactive Q&A session with the RAG system.

    Repeatedly reads a question from standard input and answers it through
    the RAG graph in general Q&A mode (no company filter). The loop ends on
    'quit', 'exit' or 'q' (case-insensitive, surrounding whitespace ignored)
    and also when input is closed or interrupted. Blank input is ignored
    instead of being sent to the LLM. Errors while answering are printed and
    the session continues.
    """
    print("\nINTERACTIVE ESG Q&A")
    print("="*40)
    print("Ask questions about ESG practices of any company in the dataset")
    print("Type 'quit' to exit")

    while True:
        try:
            question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly
            print("\nGoodbye!")
            break

        if not question:
            # Nothing to ask; avoid spending an LLM call on empty input
            continue

        if question.lower() in ('quit', 'exit', 'q'):
            print("Goodbye!")
            break

        try:
            # Use general Q&A mode
            result = esg_rag_graph.invoke({
                "company": "",  # Let retrieval find relevant companies
                "category": "General",
                "question": question
            })

            answer = result.get('answer', 'No response generated')
            context_docs = result.get('context', [])

            print(f"\nAnswer: {answer}")

            if context_docs:
                print(f"\nSources: {len(context_docs)} relevant documents found")
                # Sorted so the listing is stable between runs
                companies = sorted({doc.metadata.get('company', 'Unknown') for doc in context_docs[:3]})
                print(f"Primary companies: {', '.join(companies)}")

        except Exception as e:
            print(f"Error: {e}")

print("✓ Advanced RAG functions ready")
print("\nOptions:")
print("1. run_comprehensive_analysis() - Full analysis of all companies")
print("2. interactive_esg_qa() - Interactive Q&A session")