# ESG Theme Classification Heatmap Demo

This notebook demonstrates how to classify ESG (Environmental, Social, Governance) themes in text and visualize the results as a heatmap.

# Required Libraries

This notebook requires the following libraries (install via requirements.txt):
- spacy (with the en_core_web_sm model)
- transformers
- torch
- pandas
- matplotlib
- seaborn

In [None]:
# Input Document
# Paste your CSR (Corporate Social Responsibility) report or similar text document below.
# The whole string between the triple quotes is what the sentence-splitting
# step passes to spaCy, so replace the placeholder lines entirely.

input_text = """Example CSR report text. 
This is a placeholder. Please replace this with your actual CSR report or text document.
"""

In [None]:
# Sentence Splitting with spaCy

# Import the spaCy library
import spacy

# Load the small English pipeline. The model is a separate install from the
# spacy package itself (see the requirements list at the top of the notebook).
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the input text to obtain an annotated Doc object
doc = nlp(input_text)

# Collect the text of every sentence span, trimmed of surrounding whitespace.
# Spans that are empty after trimming (e.g. runs of blank lines in a pasted
# report) are dropped so they are not classified and counted later on.
sentences = [text for text in (sent.text.strip() for sent in doc.sents) if text]

# Print the list of sentences for verification
print(f"Found {len(sentences)} sentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence}")

In [None]:
# ESG Classification with Transformers

# PyTorch supplies the tensor operations; transformers supplies the loaders
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face Hub identifier of the pretrained ESG-BERT checkpoint
model_name = "nbroad/ESG-BERT"

# Fetch the sequence-classification model and the tokenizer that matches it
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Roll-up from the fine-grained topic labels exposed by the ESG-BERT checkpoint
# to the three ESG pillars reported by this notebook. Labels that are missing
# from this table are passed through unchanged.
# NOTE(review): the topic names and their pillar assignments are a best-effort
# reading of the checkpoint's label set - confirm them against
# model.config.id2label and your own reporting taxonomy.
_ESG_PILLAR_BY_LABEL = {
    # Environmental
    "Air_Quality": "Environmental",
    "Ecological_Impacts": "Environmental",
    "Energy_Management": "Environmental",
    "GHG_Emissions": "Environmental",
    "Physical_Impacts_Of_Climate_Change": "Environmental",
    "Product_Design_And_Lifecycle_Management": "Environmental",
    "Waste_And_Hazardous_Materials_Management": "Environmental",
    "Water_And_Wastewater_Management": "Environmental",
    # Social
    "Access_And_Affordability": "Social",
    "Customer_Privacy": "Social",
    "Customer_Welfare": "Social",
    "Data_Security": "Social",
    "Employee_Engagement_Inclusion_And_Diversity": "Social",
    "Employee_Health_And_Safety": "Social",
    "Human_Rights_And_Community_Relations": "Social",
    "Labor_Practices": "Social",
    "Product_Quality_And_Safety": "Social",
    "Selling_Practices_And_Product_Labeling": "Social",
    "Supply_Chain_Management": "Social",
    # Governance
    "Business_Ethics": "Governance",
    "Business_Model_Resilience": "Governance",
    "Competitive_Behavior": "Governance",
    "Critical_Incident_Risk_Management": "Governance",
    "Director_Removal": "Governance",
    "Management_Of_Legal_And_Regulatory_Framework": "Governance",
    "Systemic_Risk_Management": "Governance",
}


# Define a function to classify sentences into ESG categories
def classify_sentence(sentence):
    """
    Classify a sentence into one of the ESG (Environmental, Social, Governance) categories.

    The label names are read from the loaded model's configuration instead of
    a fixed index table, so a checkpoint with more than three output classes
    no longer raises KeyError. Each class probability is credited to the ESG
    pillar its label rolls up to, and the pillar with the largest total wins.

    Args:
        sentence (str): The input sentence to classify

    Returns:
        tuple: (category_name, probability) where category_name is the predicted ESG category
               and probability is the summed softmax probability of every model
               label that rolls up to that category. A label with no known
               pillar is reported under its own name.
    """
    # Step 1: Tokenize the input sentence into PyTorch tensors, truncating
    # anything beyond 512 tokens
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Step 2: Run the model in evaluation mode without tracking gradients
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Step 3: Softmax over the class dimension turns the logits into
    # probabilities; only one sentence is passed in, so take the first row
    probabilities = F.softmax(logits, dim=1)[0].tolist()

    # Step 4: Translate each class index into the model's own label name and
    # accumulate its probability under the matching ESG pillar
    id2label = getattr(model.config, "id2label", None) or {}
    pillar_scores = {}
    for class_index, probability in enumerate(probabilities):
        label = id2label.get(class_index, f"LABEL_{class_index}")
        pillar = _ESG_PILLAR_BY_LABEL.get(label, label)
        pillar_scores[pillar] = pillar_scores.get(pillar, 0.0) + probability

    # The first pillar seen wins ties, mirroring argmax on the raw classes
    category = max(pillar_scores, key=pillar_scores.get)

    return category, pillar_scores[category]

# Smoke-test the classifier on a single hand-written example
sample_sentence = "The company reduced carbon emissions by 15% this year."
category, confidence = classify_sentence(sample_sentence)

# One print call, one line per field
print(
    f"Sample: '{sample_sentence}'",
    f"Predicted ESG Category: {category}",
    f"Confidence: {confidence:.4f} ({confidence*100:.2f}%)",
    sep="\n",
)

In [None]:
# Aggregation of Classification Results

# pandas provides the tabular structure used for the summary
import pandas as pd

# Running tally per ESG category; insertion order fixes the column order of
# the DataFrame built below
pillar_tally = {"Environmental": 0, "Social": 0, "Governance": 0}

# Classify every sentence and bump the tally of the predicted category.
# A prediction that is not one of the three categories is left uncounted.
for sentence in sentences:
    category, confidence = classify_sentence(sentence)
    if category in pillar_tally:
        pillar_tally[category] += 1

# Expose the individual totals under their own names as well
environmental_count = pillar_tally["Environmental"]
social_count = pillar_tally["Social"]
governance_count = pillar_tally["Governance"]

# One single-element column per category, ready for the DataFrame constructor
esg_counts = {pillar: [count] for pillar, count in pillar_tally.items()}

# A single row labelled "Report": the counts describe the whole document
esg_df = pd.DataFrame(esg_counts, index=["Report"])

# Show the distribution of ESG categories
print("ESG Category Distribution:")
print(esg_df)

# Report how many sentences ended up in one of the three categories
total_sentences = sum(pillar_tally.values())
print(f"\nTotal sentences classified: {total_sentences}")