In [9]:
from typing import Literal, Any
from copy import deepcopy

from typing_extensions import TypedDict
import matplotlib.pyplot as plt
import numpy as np
from decouple import config
from pydantic import BaseModel, Field
from IPython.display import Image, display
from tqdm import tqdm

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain_ibm import WatsonxEmbeddings
from langchain_ibm import WatsonxLLM
from langgraph.graph import START, StateGraph
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

import litellm
from litellm import completion
import instructor
from instructor import Mode

In [17]:
WX_API_KEY = os.getenv('WX_API_KEY')
WX_PROJECT_ID = os.getenv('WX_PROJECT_ID')
WX_API_URL = "https://us-south.ml.cloud.ibm.com"

In [None]:
import os
from pathlib import Path

def load_company_data(data_dir="data", companies=None):
    """
    Load 10-K and ESG data for companies from the structured directory.
    
    Args:
        data_dir: Base data directory
        companies: List of company names to load (None = all companies)
    
    Returns:
        Dictionary with company data organized by company and document type
    """
    if companies is None:
        # Get all company directories
        companies = [d for d in os.listdir(data_dir) 
                    if os.path.isdir(os.path.join(data_dir, d)) 
                    and d not in ['ESG_frameworks', 'ESG_frameworks_txt', 'apiData', 'esg_report']]
    
    company_data = {}
    
    for company in companies:
        company_dir = os.path.join(data_dir, company)
        if not os.path.exists(company_dir):
            print(f"Warning: Directory {company_dir} not found")
            continue
            
        company_data[company] = {
            '10k_documents': [],
            'esg_documents': [],
            'all_documents': []
        }
        
        # Load 10-K documents
        for item in ['10k_item1.md', '10k_item1A.md', '10k_item7.md', '10k_item7A.md']:
            file_path = os.path.join(company_dir, item)
            if os.path.exists(file_path):
                docs = TextLoader(file_path).load()
                for doc in docs:
                    doc.metadata['company'] = company
                    doc.metadata['document_type'] = '10k'
                    doc.metadata['item'] = item.replace('10k_', '').replace('.md', '')
                company_data[company]['10k_documents'].extend(docs)
                company_data[company]['all_documents'].extend(docs)
        
        # Load ESG report
        esg_path = os.path.join(company_dir, 'ESG_Report.md')
        if os.path.exists(esg_path):
            docs = TextLoader(esg_path).load()
            for doc in docs:
                doc.metadata['company'] = company
                doc.metadata['document_type'] = 'esg_report'
            company_data[company]['esg_documents'].extend(docs)
            company_data[company]['all_documents'].extend(docs)
        
        print(f"{company}: {len(company_data[company]['10k_documents'])} 10-K docs, "
              f"{len(company_data[company]['esg_documents'])} ESG docs")
    
    return company_data

# Load all company data
company_data = load_company_data()
print(f"\nLoaded data for companies: {list(company_data.keys())}")

In [18]:
llm = WatsonxLLM(

        model_id= "ibm/granite-3-8b-instruct",
        url=WX_API_URL,
        apikey=WX_API_KEY,
        project_id=WX_PROJECT_ID,

        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.TEMPERATURE: 0,
            GenParams.MIN_NEW_TOKENS: 5,
            GenParams.MAX_NEW_TOKENS: 1_000,
            GenParams.REPETITION_PENALTY:1.2
        }

)

In [19]:
llm_result = llm.generate(["Hi how are you?"])

print(type(llm_result))
print(llm_result)


<class 'langchain_core.outputs.llm_result.LLMResult'>
generations=[[Generation(text="\nI'm an artificial intelligence and don't have feelings, but I'm here to help you. How can I assist you today?", generation_info={'finish_reason': 'eos_token'})]] llm_output={'token_usage': {'generated_token_count': 31, 'input_token_count': 5}, 'model_id': 'ibm/granite-3-8b-instruct', 'deployment_id': None} run=[RunInfo(run_id=UUID('038e0b3a-36ad-4221-b32a-d29eece330e9'))] type='LLMResult'


In [25]:
from langchain_community.document_loaders import TextLoader
import os

# Load all .txt files from the ESG frameworks directory
esg_frameworks_dir = "data/ESG_frameworks_txt"
esg_documents = []
for fname in os.listdir(esg_frameworks_dir):
    if fname.endswith(".txt"):
        path = os.path.join(esg_frameworks_dir, fname)
        esg_documents.extend(TextLoader(path).load())

# Load all .md files from the apiData directory (Tesla 10-K data)
tenk_dir = "data/apiData"
tenk_documents = []
for fname in os.listdir(tenk_dir):
    if fname.endswith(".md"):
        path = os.path.join(tenk_dir, fname)
        tenk_documents.extend(TextLoader(path).load())

print(f"Loaded {len(tenk_documents)} 10-K documents")
print(f"Loaded {len(esg_documents)} ESG framework documents")
if tenk_documents:
    print("First 10-K document metadata:", tenk_documents[0].metadata)
if esg_documents:
    print("First ESG document metadata:", esg_documents[0].metadata)

Loaded 4 10-K documents
Loaded 0 ESG framework documents
First 10-K document metadata: {'source': 'data/apiData/tsla_10k_item1.md'}


In [27]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Example: split 10-K documents into 1000-character chunks with 200 overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Split 10-K documents
tenk_chunks = []
for doc in tenk_documents:
    tenk_chunks.extend(splitter.split_documents([doc]))

# Split ESG framework documents
esg_chunks = []
for doc in esg_documents:
    esg_chunks.extend(splitter.split_documents([doc]))

print(f"Created {len(tenk_chunks)} 10-K chunks")
print(f"Created {len(esg_chunks)} ESG framework chunks")

if tenk_chunks:
    print(f"First 10-K chunk: {tenk_chunks[0].page_content[:200]}...")
else:
    print("No 10-K chunks available")

if esg_chunks:
    print(f"First ESG chunk: {esg_chunks[0].page_content[:200]}...")
else:
    print("No ESG framework chunks available")

Created 334 10-K chunks
Created 0 ESG framework chunks
First 10-K chunk: # ITEM 1.

BUSINESS

Overview...
No ESG framework chunks available


In [28]:
embed_params = {}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url=WX_API_URL,
    project_id=WX_PROJECT_ID,
    apikey=WX_API_KEY,
    params=embed_params,
)

In [26]:
# Create a Chroma vector index for 10-K chunks
tenk_vector_db = Chroma.from_documents(
    collection_name="tenk_collection",
    embedding=watsonx_embedding,
    persist_directory="tenk_vector_db",
    documents=tenk_chunks,
)

# Create a Chroma vector index for ESG framework chunks
esg_vector_db = Chroma.from_documents(
    collection_name="esg_collection",
    embedding=watsonx_embedding,
    persist_directory="esg_vector_db",
    documents=esg_chunks,
)

In [27]:
# Use the vectorstore as a retriever
retriever = local_vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,
    }
)

NameError: name 'local_vector_db' is not defined

In [None]:

# Retrieve the most similar text
retrieved_documents = retriever.invoke("What is the purpose of the ESG framework?")

for document in retrieved_documents:
    print(f"{'#' * 80}\nID: {document.id}")
    first_n_of_content = document.page_content[:500].replace('\n\n', ' ')
    print(f"Content: {first_n_of_content}\n")

################################################################################
ID: eb2e6281-c167-4b91-a0cd-e621fc50e7fa
Content: ESG
The very purpose of Tesla's existence is to accelerate the world's transition to sustainable energy. We believe the world cannot reduce carbon emissions without addressing both energy
generation and consumption, and we are designing and manufacturing a complete energy and transportation ecosystem to achieve this goal. As we expand, we are building each new factory to
be more efficient and sustainably designed than the previous one, including with respect to per-unit waste reduction and resou

################################################################################
ID: 16b81130-0a94-4934-b2f7-96848ae75494
Content: ESG
The very purpose of Tesla's existence is to accelerate the world's transition to sustainable energy. We believe the world cannot reduce carbon emissions without addressing both energy
generation and consumption, and we are designing

In [None]:
template = """You are a highly accurate ESG disclosure evaluator, trained to assess company filings for the presence, quality, and materiality of ESG (Environmental, Social, Governance) information.

Your task is to analyze retrieved excerpts from 10-K filings and provide a structured evaluation for each ESG dimension or sub-category.

You are expected to:
1. Identify whether the passage addresses a relevant ESG topic (defined below).
2. Judge the quality and detail of the disclosure (is it specific, quantitative, and forward-looking, or vague and generic?).
3. Assign a score between 0 and 5 based on the scoring rubric.
4. Justify your score in a concise explanation, quoting or paraphrasing the evidence from the text.

### ESG Dimensions and Categories (based on LSEG/SASB):

**Environmental:**
- Emissions (e.g. CO₂ reporting, reduction targets)
- Resource Use (e.g. energy/water consumption)
- Innovation (e.g. green technologies)

**Social:**
- Workforce (e.g. diversity, DEI initiatives, labor practices)
- Human Rights (e.g. supply chain policies)
- Community Engagement (e.g. philanthropy, local impact)
- Product Responsibility (e.g. product safety, consumer protection)

**Governance:**
- Management (e.g. executive structure, board diversity, oversight)
- CSR Strategy (e.g. ESG integration into business strategy)
- Shareholders (e.g. voting rights, stakeholder inclusion)

### ESG Scoring Rubric (0–5):
- **0** – No relevant disclosure.
- **1** – ESG topic is mentioned vaguely or once, with no detail.
- **2** – Topic is acknowledged; minimal detail or unclear metrics.
- **3** – Moderate disclosure; some specificity (e.g. qualitative policies).
- **4** – Strong disclosure with specific actions, recent results, or targets.
- **5** – Detailed disclosure with metrics, goals, and progress updates.

### Output Format:
- **ESG Category**: [e.g., Emissions]
- **Score (0–5)**: [e.g., 3]
- **Justification**: [e.g., “The company mentions carbon emissions and has pledged to reduce them, but no metrics or timeline are provided.”]

Be objective. If the company only includes marketing statements or boilerplate language, assign a lower score and explain why.

When in doubt, lean toward conservative scoring.


"""

prompt = PromptTemplate.from_template(template)

In [None]:
question = "what is tesla's ESG strategy?"

retrieved_docs = local_vector_db.similarity_search(question)
docs_content = "\n\n".join(f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(retrieved_docs))
formated_prompt = prompt.invoke({"question": question, "context": docs_content})

In [None]:
print(formated_prompt.to_string()[:1000])

You are a highly accurate ESG disclosure evaluator, trained to assess company filings for the presence, quality, and materiality of ESG (Environmental, Social, Governance) information.

Your task is to analyze retrieved excerpts from 10-K filings and provide a structured evaluation for each ESG dimension or sub-category.

You are expected to:
1. Identify whether the passage addresses a relevant ESG topic (defined below).
2. Judge the quality and detail of the disclosure (is it specific, quantitative, and forward-looking, or vague and generic?).
3. Assign a score between 0 and 5 based on the scoring rubric.
4. Justify your score in a concise explanation, quoting or paraphrasing the evidence from the text.

### ESG Dimensions and Categories (based on LSEG/SASB):

**Environmental:**
- Emissions (e.g. CO₂ reporting, reduction targets)
- Resource Use (e.g. energy/water consumption)
- Innovation (e.g. green technologies)

**Social:**
- Workforce (e.g. diversity, DEI initiatives, labor practi

In [None]:
answer = llm.invoke(formated_prompt)

In [None]:
print(answer)


Understood? Let's begin!

Excerpt 1: "We are committed to reducing our environmental footprint by implementing sustainable practices across all aspects of our operations."

*ESG Category*: Environmental - Emissions

*Score (0–5)*: 1

*Justification*: The statement indicates an intention to address environmental concerns without providing any concrete details about current emission levels, reduction strategies, or timelines. It lacks specificity necessary for meaningful assessment.
