# Task 1

### 1.1 import necessary libraries

In [6]:
import pdfplumber
import re
import json

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from typing import List, TypedDict
from langgraph.graph import StateGraph, END

from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv) before reading the environment.
load_dotenv() 

# Read the OpenAI key from the environment; os.getenv returns None when unset.
# NOTE(review): api_key is not referenced anywhere else in the visible code --
# OpenAIEmbeddings() and ChatOpenAI() are constructed without an explicit key,
# so presumably they read OPENAI_API_KEY from the environment themselves.
# Confirm, then either pass api_key explicitly or drop this variable.
api_key = os.getenv("OPENAI_API_KEY")

### 1.2 Extracting data from the two PDF files

In [7]:
# pdf_reading function
def pdf_reading(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Each page's text is followed by a newline. Pages for which
    ``extract_text()`` returns a falsy value (``None`` or an empty string)
    are skipped, so a PDF with no extractable text yields ``""``.
    """
    with pdfplumber.open(pdf_path) as document:
        # Consume the generator inside the ``with`` so pages are read while
        # the PDF is still open.
        page_texts = (page.extract_text() for page in document.pages)
        return "".join(text + "\n" for text in page_texts if text)

# Extract data from participant statement
def extract_participant_statement_data(full_text):
    """Parse the key account figures out of a participant statement's text.

    Args:
        full_text: Plain text of the statement (for example the string
            returned by ``pdf_reading``).

    Returns:
        A dict with the keys ``account_holder``, ``statement_date_range``,
        ``total_portfolio_balance``, ``asset_allocation`` and
        ``quarterly_contributions``. A field that cannot be located keeps its
        initial value (``None``, or ``{}`` for ``asset_allocation``).
        Monetary amounts are returned as strings as printed in the statement
        (thousands separators kept, ``$`` dropped); a negative gains/loss
        figure is returned with a leading ASCII ``-``.
    """
    participant_statement_data = {
        "account_holder": None,
        "statement_date_range": None,
        "total_portfolio_balance": None,
        "asset_allocation": {},
        "quarterly_contributions": {
            "employee": None,
            "employer": None,
            "gains_loss": None
        }
    }

    # 1. Account holder name
    account_match = re.search(r"Account holder:\s*(.+)", full_text)
    if account_match:
        name = account_match.group(1).strip()
        # Names containing "SAMPLE" are ignored -- presumably the placeholder
        # printed on template statements (assumption, verify against the PDF).
        if name and "SAMPLE" not in name:
            participant_statement_data["account_holder"] = name

    # 2. Statement date range, e.g. "FOR January 1, 2021 TO March 31, 2021"
    date_match = re.search(
        r"FOR\s+([A-Za-z]+\s+\d{1,2},\s*\d{4})\s+TO\s+([A-Za-z]+\s+\d{1,2},\s*\d{4})",
        full_text,
        re.IGNORECASE,
    )
    if date_match:
        participant_statement_data["statement_date_range"] = (
            f"{date_match.group(1)} to {date_match.group(2)}"
        )

    # 3. Total portfolio balance as of statement end date
    balance_match = re.search(r"Ending balance\s*\$([\d,]+\.\d{2})", full_text)
    if balance_match:
        participant_statement_data["total_portfolio_balance"] = balance_match.group(1)

    # 4. Asset allocation breakdown (only searched inside the allocation section)
    asset_section = re.search(
        r"How your portfolio is allocated(.+?)These asset allocation",
        full_text,
        re.DOTALL,
    )
    if asset_section:
        section_text = asset_section.group(1)
        for asset_type in ["Equities", "Fixed Income", "Multi-Asset"]:
            # re.escape keeps the label literal even if a future label
            # contains regex metacharacters.
            match = re.search(
                rf"{re.escape(asset_type)}\s*\$([\d,]+\.\d{{2}})\s*([\d\.]+)%",
                section_text,
            )
            if match:
                participant_statement_data["asset_allocation"][asset_type] = {
                    "value": match.group(1),
                    "percent": match.group(2)
                }

    # 5. Contributions and gains/loss for the quarter.
    # The three figures must appear in this order. Amounts may carry an
    # optional "$", and Gains/Loss may be negative (ASCII hyphen or the
    # Unicode minus sign U+2212); without the optional sign a losing quarter
    # would fail the whole match and leave all three fields None.
    contrib_match = re.search(
        r"Your contributions\s*\$?([\d,]+\.\d{2})"
        r".*?Employer contributions\s*\$?([\d,]+\.\d{2})"
        r".*?Gains/Loss\s*([-\u2212]?)\s*\$?([\d,]+\.\d{2})",
        full_text,
        re.DOTALL,
    )
    if contrib_match:
        employee, employer, sign, gains = contrib_match.groups()
        contributions = participant_statement_data["quarterly_contributions"]
        contributions["employee"] = employee
        contributions["employer"] = employer
        # Normalize either minus character to a plain "-".
        contributions["gains_loss"] = f"-{gains}" if sign else gains
    return participant_statement_data


# Extract data from retirement benefits
def extract_retirement_benefit_data(full_text):
    """Pull three descriptive passages out of the retirement-benefits text.

    Each passage starts at the first occurrence of its leading phrase, must
    contain the follow-up phrase(s) in order, and ends at the next blank line
    or at the end of the text. Matching is case-insensitive and may span
    lines. A passage that is not found is reported as ``None``.

    Returns:
        dict with the keys ``defined_benefit_vs_contribution``,
        ``vesting_rules`` and ``participant_rights``.
    """
    search_flags = re.IGNORECASE | re.DOTALL

    # Output key -> pattern whose first group is the passage to keep.
    passage_patterns = (
        # 1. Defined benefit vs defined contribution
        ("defined_benefit_vs_contribution",
         r"(defined benefit.*?defined contribution.*?)(?:\n\n|\Z)"),
        # 2. Vesting rules summary (cliff & graduated)
        ("vesting_rules",
         r"(vesting.*?cliff.*?graduated.*?)(?:\n\n|\Z)"),
        # 3. Participant rights related to account balances
        ("participant_rights",
         r"(participant rights.*?account balance.*?)(?:\n\n|\Z)"),
    )

    extracted = {}
    for field, pattern in passage_patterns:
        found = re.search(pattern, full_text, search_flags)
        extracted[field] = found.group(1).strip() if found else None
    return extracted


### 1.3 generating summary

In [8]:
# --- 2. Define the Agent State ---
# This dictionary will hold the state of our application as it runs.
class AgentState(TypedDict):
    """Shared state that the graph nodes read from and write to."""
    # Subject of the summary; used as the retrieval query and inserted into
    # the summarization prompt.
    topic: str
    # Paths of the PDF files that load_and_split_pdfs loads.
    pdf_paths: List[str]
    # Text of each chunk produced by load_and_split_pdfs (page_content strings).
    documents: List[str]
    # Final summary text written by the summarize node.
    summary: str

# --- 3. Define the Nodes of the Graph ---
def load_and_split_pdfs(state: AgentState):
    """
    Graph node: read every PDF listed in the state and chunk its text.

    Reads ``topic`` and ``pdf_paths`` from the state, loads each path with
    PyPDFLoader, splits the loaded documents into overlapping chunks
    (chunk_size=1000, chunk_overlap=200) and returns the chunk texts under
    ``documents`` together with the unchanged ``topic``.
    """
    topic = state.get("topic")

    # Gather the loaded documents of all PDFs into one flat list.
    loaded_pages = []
    for path in state.get("pdf_paths"):
        loaded_pages.extend(PyPDFLoader(path).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Downstream nodes only need the raw text, not the Document wrappers.
    chunk_texts = [
        chunk.page_content for chunk in splitter.split_documents(loaded_pages)
    ]

    return {"documents": chunk_texts, "topic": topic}

def summarize(state: AgentState):
    """
    Graph node: write a short, topic-focused summary of the loaded chunks.

    Reads ``topic`` and ``documents`` from the state, embeds the chunks into
    a FAISS index, retrieves the chunks returned for the topic query, and asks
    the chat model (gpt-4o, temperature 0) to summarize them. Returns the
    generated text under ``summary``.
    """
    query = state.get("topic")
    chunk_texts = state.get("documents")

    # Retrieval step: index the chunks and fetch the ones matching the topic.
    index = FAISS.from_texts(chunk_texts, embedding=OpenAIEmbeddings())
    matches = index.as_retriever().invoke(query)

    # The prompt takes one block of text, so join the retrieved chunks.
    context = "\n\n".join(match.page_content for match in matches)

    # Prompt for the summarization model; {topic} and {context} are filled in
    # when the chain is invoked.
    prompt_template = """You are an expert summarizer. Your task is to provide a concise 150 words summary of the following text, focusing on the key points related to the topic: {topic}.

    Text to summarize:
    {context}

    Concise Summary:"""

    # prompt -> model -> plain string
    chain = (
        ChatPromptTemplate.from_template(prompt_template)
        | ChatOpenAI(model="gpt-4o", temperature=0)
        | StrOutputParser()
    )

    return {"summary": chain.invoke({"context": context, "topic": query})}



# --- 4. Build the Graph ---
# Linear two-step pipeline: load_and_split -> summarize -> END.

# Initialize the graph with AgentState as its state schema
workflow = StateGraph(AgentState)

# Register the two node functions under the names used by the edges below
workflow.add_node("load_and_split", load_and_split_pdfs)
workflow.add_node("summarize", summarize)

# Execution starts at the PDF loading/splitting node
workflow.set_entry_point("load_and_split")

# After loading, summarize; after summarizing, finish
workflow.add_edge("load_and_split", "summarize")
workflow.add_edge("summarize", END)

# Compile the graph; `app` is the object invoked in the run section
app = workflow.compile()


### 1.4 run the application

In [9]:
# Paths to the two source PDFs:
#   [0] participant statement, [1] retirement benefits guide
pdf_paths = ["Data/PDF_Participant-statement-sample2.pdf", "Data/your-retirement-benefits.pdf"]


# For Participant Statement
full_text_participant = pdf_reading(pdf_paths[0])
extracted_participant_data = extract_participant_statement_data(full_text_participant)

# For Retirement Benefits
full_text_retirement = pdf_reading(pdf_paths[1])
extracted_retirement_data = extract_retirement_benefit_data(full_text_retirement)

# Combine the extracted data (the two dicts share no keys, so nothing is overwritten)
combined_data = {**extracted_participant_data, **extracted_retirement_data}

# Save to JSON.
# Create the output folder first: open(..., "w") does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("task_1_output", exist_ok=True)
with open("task_1_output/extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(combined_data, f, indent=4)
print("\n---SAVED EXTRACTED DATA TO extracted_data.json FILE in task_1_output folder---")


# Generate a summary report

# Define the inputs for the graph; reuse pdf_paths so the extraction step and
# the summary step always work on the same files.
inputs = {
    "topic": "The portfolio's performance and allocation. How the plan's vesting rules apply to the participant's scenario",
    "pdf_paths": pdf_paths
}

# Run the graph and get the final state
final_state = app.invoke(inputs)

print("\n---GENERATED SUMMARY---")
print(final_state['summary'])



---SAVED EXTRACTED DATA TO extracted_data.json FILE in task_1_output folder---

---GENERATED SUMMARY---
The portfolio's performance from January 1, 2021, to March 31, 2021, shows a balance increase from $216,284.58 to $228,743.55, with gains of $12,458.97. The plan includes annuity contracts with TIAA and CREF. A delayed vesting provision may apply, affecting the vested status of employer contributions and earnings based on the participant's length of employment. However, specific vested percentages and market values are not displayed, as the employer maintains this information. Participants are advised to refer to the Summary Plan Description for detailed vesting rules. The plan year, a 12-month period, is used for calculating vesting and distribution. Participants should be informed of any material changes through a revised Summary Plan Description or a Summary of Material Modifications. The trustee, under the direction of a named fiduciary, can appoint investment managers for the p