In [137]:
import os
import json
import re
import ast
from typing import Dict, Any, List

import pandas as pd
import numpy as np

from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain_openai import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

In [169]:
# Prompt used to solve a single question.
# Placeholders: {context}, {table}, {question}. The doubled braces around the
# example dictionary are escapes so that they survive template formatting as
# literal "{" and "}".
# Every field line of the example dictionary except the last ends with a comma
# so the format the model is asked to imitate is itself well-formed; the reply
# is later parsed with json.loads.
QUESTION_PROMPT_TEMPLATE = """
Context Information:
{context}

Table Information:
{table}

Question:
{question}

Your task:
1. Carefully analyse the question and the context
2. Read through the table information in depth
3. Solve the question with clear, step-by-step reasoning
4. Provide your solution in the following format as a json dictionary:

{{"Reasoning_Steps": A detailed explanation of how you approached the problem,
"Relevant_Data_Points": Key numbers and data you used to calculate the answer,
"Calculation_Formula": The mathematical formula or logic used to calculate the answer,
"Potential_Validation_Checks": Ways to verify the answer,
"Final_Answer": State your answer which should be a number,
"Confidence_Level": Estimate your confidence in the answer from 0-100%
}}

Be EXPLICIT about your process and any assumptions you made.
"""

# Prompt used to have the model review a previously produced answer.
# Placeholders: {question}, {reasoning}, {actual_answer}. The doubled braces
# around the example dictionary are escapes so that they survive template
# formatting as literal "{" and "}".
# NOTE(review): the ground-truth answer is part of this prompt, so the
# resulting assessment is not independent of the expected answer.
VALIDATION_PROMPT = """ 
Question:
{question}

Reasoning:
{reasoning}

Actual Answer:
{actual_answer}

Your task:
1. Assess the question and the reasoning for the calculated answer
2. Compare the reasoning with the actual answer
3. Identify any potential errors
4. Suggest improvements or different approaches
5. Provide an overall assessment in the following format as a json dictionary:

{{"Validity_Assessment": High/Medium/Low,
"Potential_Issues": List any problems,
"Suggested_Improvements": Recommendations for improvements,
"Confidence_Level": Estimate your confidence in your validation from 0-100%}}
"""

In [186]:
class FinancialQAAgent:
    """
    Question answering agent for samples that combine text, a table and a question.

    The agent asks a chat model to answer a question as a JSON dictionary,
    asks the same model to review that answer, and computes accuracy metrics
    over a list of reviewed results.
    """

    # System message sent with every request (both solving and validating).
    SYSTEM_MESSAGE = 'Your role is to answer mathematical questions.'

    def __init__(self, llm):
        """
        Initialise the Financial Question Answering Agent

        Args:
            llm: the chat model used to solve the questions. It receives a list
                of messages and must return an object whose ``content``
                attribute holds the reply text.
        """
        self.llm = llm
        self.question_prompt = PromptTemplate(input_variables=['question', 'context', 'table'],
                                              template=QUESTION_PROMPT_TEMPLATE)
        self.validate_prompt = PromptTemplate(input_variables=['question', 'reasoning', 'actual_answer'],
                                              template=VALIDATION_PROMPT)

    def extract_table_data(self, sample: Dict[str, Any]) -> str:
        """
        Convert table data to a string representation

        Args:
            sample: A singular entry from the dataset. ``sample['table']`` is
                either a list of rows (first row = header) or the string
                literal of such a list.

        Returns:
            String representation of the table, or the fixed message
            "Unable to parse table data" when the table cannot be converted.
        """
        try:
            # Handle different table formats (string or list of lists)
            if isinstance(sample['table'], str):
                table_data = ast.literal_eval(sample['table'])
            else:
                table_data = sample['table']

            # First row is the header, remaining rows are the data
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
            return df.to_string(index=False)
        except Exception as e:
            # Deliberately best-effort: the question is still sent to the model
            # with a placeholder instead of the table.
            print(f"Error extracting table data: {e}")
            return "Unable to parse table data"

    @staticmethod
    def _parse_llm_json(text: str) -> Dict[str, Any]:
        """
        Decode the JSON dictionary contained in a model reply.

        Newlines are removed first (as literal newlines inside JSON strings are
        not valid JSON). If the whole reply is not valid JSON, e.g. because it
        is wrapped in a Markdown code fence or surrounded by prose, the span
        from the first "{" to the last "}" is decoded instead.

        Args:
            text: Raw reply text from the model

        Returns:
            The decoded JSON value

        Raises:
            json.JSONDecodeError: if no decodable JSON object is found
        """
        flattened = text.replace('\n', '')
        try:
            return json.loads(flattened)
        except json.JSONDecodeError:
            start, end = flattened.find('{'), flattened.rfind('}')
            if start == -1 or end <= start:
                raise
            return json.loads(flattened[start:end + 1])

    def _ask_llm_json(self, prompt_text: str) -> Dict[str, Any]:
        """Send the system message plus ``prompt_text`` to the model and decode its JSON reply."""
        messages = [
            SystemMessage(content=self.SYSTEM_MESSAGE),
            HumanMessage(content=prompt_text),
        ]
        # Prefer ``invoke`` (calling a chat model object directly is the
        # deprecated form); fall back to a plain call for simple callables.
        invoke = getattr(self.llm, 'invoke', None)
        reply = invoke(messages) if callable(invoke) else self.llm(messages)
        return self._parse_llm_json(reply.content)

    def llm_solve_question(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """
        Solve the question using the LLM with reasoning

        Args:
            sample (Dict[str, Any]): A singular entry from the dataset. Reads
                ``qa.question``, ``qa.answer``, ``pre_text``, ``post_text``
                and ``table``.

        Returns:
            Dict[str, Any]: A dictionary with the keys ``question``,
            ``response`` (the decoded model reply) and ``actual_answer``

        Raises:
            json.JSONDecodeError: if the model reply does not contain valid JSON
        """
        qa = sample.get('qa')
        question = qa.get('question')
        answer = qa.get('answer')
        pre_text = sample.get('pre_text')
        post_text = sample.get('post_text')
        context = f"Text before the table: {pre_text}\nText after the table: {post_text}"
        table = self.extract_table_data(sample)

        prompt_text = self.question_prompt.format(
            question=question,
            context=context,
            table=table
        )
        return {
            "question": question,
            "response": self._ask_llm_json(prompt_text),
            "actual_answer": answer
        }

    def validate_answer(self, answer_dict: Dict[str, Any]) -> Dict[str, Any]:
        """
        Self validation of answer and reasoning

        Args:
            answer_dict (Dict[str, Any]): Dictionary with answer and reasoning,
                as returned by ``llm_solve_question``

        Returns:
            Dict[str, Any]: The same dictionary object, with the decoded
            validation reply stored under ``validation_response``

        Raises:
            json.JSONDecodeError: if the model reply does not contain valid JSON
        """
        prompt_text = self.validate_prompt.format(
            question=answer_dict.get('question'),
            reasoning=answer_dict.get('response'),
            actual_answer=answer_dict.get('actual_answer')
        )
        # The input dictionary is updated in place and returned.
        answer_dict['validation_response'] = self._ask_llm_json(prompt_text)
        return answer_dict

    def calculate_accuracy_metrics(self, results: List[Dict[str, Any]]) -> Dict[str, float]:
        """
        Calculate accuracy metrics for the financial Q&A results

        Args:
            results (List[Dict[str, Any]]): List of results from Q&A process

        Returns:
            Dict[str, float]: Dictionary with accuracy metrics and reasoning.
            Each metric maps to ``{"value": float, "reasoning": str}``; all
            values are 0.0 when ``results`` is empty.
        """
        total = len(results)
        exact_hits = 0
        fuzzy_hits = 0
        high_validity = 0

        for result in results:
            response = result.get('response', {})
            final_answer = response.get('Final_Answer', '') if isinstance(response, dict) else ''
            # str() on both sides: either answer may be a number rather than text
            pred = self._parse_answer(str(final_answer))
            truth = self._parse_answer(str(result.get('actual_answer', '')))

            exact_hits += self._exact_match(pred, truth)
            fuzzy_hits += self._fuzzy_match(pred, truth)

            # A failed validation may have been recorded as something other
            # than a dictionary; such entries simply do not count as 'High'.
            validation = result.get('validation_response', {})
            if isinstance(validation, dict) and validation.get('Validity_Assessment', '') == 'High':
                high_validity += 1

        def ratio(hits: int) -> float:
            # Avoid dividing by zero for an empty result list
            return hits / total if total else 0.0

        return {
            "Exact Match Accuracy": {
                "value": ratio(exact_hits),
                "reasoning": "Percentage of answers that match ground truth exactly. Strict metric that requires precise numerical match."
            },
            "Fuzzy Match Accuracy": {
                "value": ratio(fuzzy_hits),
                "reasoning": "Allows for minor numerical variations (e.g., rounding differences). More forgiving than exact match."
            },
            "Validation Confidence": {
                "value": ratio(high_validity),
                "reasoning": "Proportion of solutions marked as 'High' validity by the self-validation mechanism. Indicates internal confidence."
            }
        }

    def _parse_answer(self, answer: str) -> str:
        """
        Parse and clean numerical answers

        Args:
            answer: Raw answer string

        Returns:
            The first number found in the string (thousands separators removed,
            surrounding symbols such as "$" or "%" dropped), or '' if the
            string contains no number
        """
        # Drop thousands separators so "1,234.5" is read as one number
        # instead of stopping at the first comma.
        without_separators = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', answer)
        # Optional sign, then either digits with optional fraction or a bare fraction (".5")
        matches = re.findall(r'-?(?:\d+\.?\d*|\.\d+)', without_separators)
        return matches[0] if matches else ''

    def _exact_match(self, pred: str, truth: str) -> bool:
        """
        Perform exact numerical matching

        Args:
            pred: Predicted answer
            truth: Ground truth answer

        Returns:
            True when both answers are numbers with the same value (so "14.10"
            matches "14.1"); False otherwise, including when either is not a number
        """
        try:
            return float(pred) == float(truth)
        except (TypeError, ValueError):
            return False

    def _fuzzy_match(self, pred: str, truth: str, tolerance: float = 0.1) -> bool:
        """
        Perform fuzzy numerical matching

        Args:
            pred: Predicted answer
            truth: Ground truth answer
            tolerance: Maximum relative difference, as a fraction of the
                ground truth (0.1 means 10%)

        Returns:
            Boolean indicating if answers are close enough; False when either
            answer is not a number
        """
        try:
            pred_float = float(pred)
            truth_float = float(truth)
        except (TypeError, ValueError):
            return False

        # A relative difference is undefined for a zero ground truth;
        # only a zero prediction can match it.
        if truth_float == 0:
            return pred_float == 0

        diff = abs(pred_float - truth_float) / abs(truth_float)
        return diff <= tolerance

In [65]:
# Load the dataset from train.json in the current working directory.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('train.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

In [126]:
# Hugging Face access token (None when the variable is not set) and the
# repository id of the model to use with it.
token = os.environ.get("HF_ACCESS_TOKEN")
model_name = "meta-llama/Llama-3.1-8B-Instruct"

In [140]:
# Azure OpenAI connection settings, read from environment variables of the
# same name; each is None when its variable is not set.
OPENAI_API_BASE, OPENAI_API_KEY, OPENAI_API_VERSION, OPENAI_DEPLOYMENT_ID_FC = (
    os.environ.get(variable)
    for variable in (
        'OPENAI_API_BASE',
        'OPENAI_API_KEY',
        'OPENAI_API_VERSION',
        'OPENAI_DEPLOYMENT_ID_FC',
    )
)


In [141]:
# Azure-hosted chat model that the agent is built with.
# The deployment id is passed both as the deployment and as the model name.
model = AzureChatOpenAI(
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_API_VERSION,
    azure_endpoint=OPENAI_API_BASE,
    azure_deployment=OPENAI_DEPLOYMENT_ID_FC,
    model_name=OPENAI_DEPLOYMENT_ID_FC,
    temperature=0,
)

In [173]:
# Hugging Face Hub model handle.
# NOTE(review): the agent below is constructed with `model`, and no other
# visible cell reads `llm` — confirm whether this cell is still needed.
llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=token)

In [188]:
# Build the agent on top of the Azure chat model.
agent = FinancialQAAgent(model)

In [175]:
# Solve and then validate each selected sample, collecting the validated
# result dictionaries. The slice limits the run to the first dataset entry.
results = []
for sample in dataset[:1]:
    # Ask the model for an answer with reasoning
    response = agent.llm_solve_question(sample=sample)
    # Have the model review that answer; returns the same dict with
    # 'validation_response' added
    validated_response = agent.validate_answer(answer_dict=response)
    results.append(validated_response)

In [176]:
# Display the collected result dictionaries.
results

[{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009',
  'response': {'Reasoning_Steps': 'To calculate the percentage change in the net cash from operating activities from 2008 to 2009, we need to find the difference between the two values and then divide it by the initial value. Finally, we multiply the result by 100 to get the percentage change.',
   'Relevant_Data_Points': ['Net cash from operating activities in 2008: $181001',
    'Net cash from operating activities in 2009: $206588'],
   'Calculation_Formula': 'Percentage change = ((New value - Old value) / Old value) * 100',
   'Potential_Validation_Checks': ['Check the accuracy of the given data points',
    'Verify the calculation using alternative methods or sources'],
   'Final_Answer': 14.1,
   'Confidence_Level': 95},
  'actual_answer': '14.1%',
  'validation_response': {'Validity_Assessment': 'High',
   'Potential_Issues': [],
   'Suggested_Improvements': [],
   'Confide

In [177]:
# Print a readable report for every result, numbered from 1.
for i, result in enumerate(results, 1):
    print(
        f"\n--- Sample {i} ---",
        f"Question: {result['question']}",
        f"Actual Answer: {result['actual_answer']}",
        f"Response: {result['response']}",
        f"Validation Response:: {result['validation_response']}",
        "-" * 50,
        sep="\n",
    )


--- Sample 1 ---
Question: what was the percentage change in the net cash from operating activities from 2008 to 2009
Actual Answer: 14.1%
Response: {'Reasoning_Steps': 'To calculate the percentage change in the net cash from operating activities from 2008 to 2009, we need to find the difference between the two values and then divide it by the initial value. Finally, we multiply the result by 100 to get the percentage change.', 'Relevant_Data_Points': ['Net cash from operating activities in 2008: $181001', 'Net cash from operating activities in 2009: $206588'], 'Calculation_Formula': 'Percentage change = ((New value - Old value) / Old value) * 100', 'Potential_Validation_Checks': ['Check the accuracy of the given data points', 'Verify the calculation using alternative methods or sources'], 'Final_Answer': 14.1, 'Confidence_Level': 95}
Validation Response:: {'Validity_Assessment': 'High', 'Potential_Issues': [], 'Suggested_Improvements': [], 'Confidence_Level': 100}
-------------------

In [189]:
# Compute the accuracy metrics over all collected results.
metrics = agent.calculate_accuracy_metrics(results=results)

pred_float:  14.1
truth_float:  14.1
diff:  0.0


In [190]:
# Display the metrics dictionary.
metrics

{'Exact Match Accuracy': {'value': 1.0,
  'reasoning': 'Percentage of answers that match ground truth exactly. Strict metric that requires precise numerical match.'},
 'Fuzzy Match Accuracy': {'value': 1.0,
  'reasoning': 'Allows for minor numerical variations (e.g., rounding differences). More forgiving than exact match.'},
 'Validation Confidence': {'value': 1.0,
  'reasoning': "Proportion of solutions marked as 'High' validity by the self-validation mechanism. Indicates internal confidence."}}

In [185]:
# Print each metric's value followed by the explanation of what it measures.
print("\n--- Accuracy Metrics ---")
for metric, details in metrics.items():
    print(
        f"{metric}: {details['value']}",
        f"Reasoning: {details['reasoning']}\n",
        sep="\n",
    )


--- Accuracy Metrics ---
Exact Match Accuracy: 1.0
Reasoning: Percentage of answers that match ground truth exactly. Strict metric that requires precise numerical match.

Fuzzy Match Accuracy: 0.0
Reasoning: Allows for minor numerical variations (e.g., rounding differences). More forgiving than exact match.

Validation Confidence: 1.0
Reasoning: Proportion of solutions marked as 'High' validity by the self-validation mechanism. Indicates internal confidence.

