In [1]:
# Pathways
import os
import sys
from pathlib import Path

# Use the directory the kernel was started in as the project root.
project_path = Path.cwd()
# chdir to the directory that was just read from cwd leaves the working directory
# unchanged; it is kept so the cell states explicitly where it runs.
os.chdir(project_path)

# Keep the project root first on sys.path without piling up duplicates: any
# existing entry is removed before re-inserting at the front, so re-running this
# cell leaves sys.path exactly as it was after the first run.
_project_path_str = str(project_path)
sys.path[:] = [entry for entry in sys.path if entry != _project_path_str]
sys.path.insert(0, _project_path_str)

print(f"Current directory: {os.getcwd()}")

Current directory: c:\Users\joshu\OneDrive\Documents\1 Work\Bank of England NLP\Bank-of-England-NLP-on-Earnings-Calls


In [5]:
# System-message prompt for the transcript extraction call - to be cached for mass processing.
# "content" is an f-string that interpolates no values, so every literal JSON brace in
# the examples below is doubled ({{ and }}) in order to render as a single brace.
prompt_text = {
    "role": "system",
    "content": f"""
        Extract structured JSON objects from a transcript while maintaining speech order. 
        Each question and answer should be a distinct JSON object. Exclude Operator responses, greetings, and farewells. 
        If multiple people respond, capture each as a separate answer object.

        Each JSON object should include:
        - Type (Question/Answer)
        - Text (Spoken content)
        - Page (Transcript page number)
        - Person (Speaker’s name)
        - Role (Speaker’s role, e.g., Analyst, CFO)
        - Affiliation (Organization the speaker represents)
        - Topics (Up to 5, each with a brief justification)
        - Sentiment Analysis (Score: -1 to +1, Category, Justification)

        Ensure strict JSON compliance without extra punctuation or spacing errors.

    **Example JSON Structure:**

    {{
        "Type": "Question",
        "Text": "Can you elaborate on how JPMorgan Chase plans to address potential capital deficits while maintaining growth?",
        "Page": 3,
        "Person": "John McDonald",
        "Role": "Analyst",
        "Affiliation": "Truist Securities, Inc.",
        "Topics": [
            {{"Name": "Capital Management", "Justification": "Addresses capital reserves and growth balance."}},
            {{"Name": "Risk Management", "Justification": "Focuses on mitigating financial risk."}}
        ],
        "Sentiment": {{
            "Score": 0.1,
            "Category": "Neutral",
            "Justification": "Neutral inquiry seeking information."
        }}
    }}

    {{
        "Type": "Answer",
        "Text": "We have a comprehensive capital strategy that ensures we meet regulatory requirements while supporting business growth...",
        "Page": 3,
        "Person": "Jeremy Barnum",
        "Role": "Chief Financial Officer",
        "Affiliation": "JPMorgan Chase",
        "Topics": [
            {{"Name": "Capital Management", "Justification": "Balances compliance with growth."}},
            {{"Name": "Risk Management", "Justification": "Acknowledges market volatility and adaptability."}},
            {{"Name": "Regulatory Compliance", "Justification": "Ensures strategies meet regulatory needs."}}
        ],
        "Sentiment": {{
            "Score": -0.2,
            "Category": "Slightly Negative",
            "Justification": "Acknowledgment of challenges introduces slight negativity."
        }}
    }}

    {{
        "Type": "Answer",
        "Text": "To add, our strong liquidity position and diversified revenue streams provide a buffer against potential deficits...",
        "Page": 4,
        "Person": "Jamie Dimon",
        "Role": "Chairman & CEO",
        "Affiliation": "JPMorgan Chase",
        "Topics": [
            {{"Name": "Capital Management", "Justification": "Emphasizes liquidity and revenue diversification."}},
            {{"Name": "Resilience", "Justification": "Highlights strength against market fluctuations."}}
        ],
        "Sentiment": {{
            "Score": 0.3,
            "Category": "Slightly Positive",
            "Justification": "Focus on strengths conveys a slightly positive outlook."
        }}
    }}

    {{
        "Type": "Question",
        "Text": "How do you see current interest rate trends impacting your capital strategy?",
        "Page": 4,
        "Person": "Jane Doe",
        "Role": "Analyst",
        "Affiliation": "Goldman Sachs",
        "Topics": [
            {{"Name": "Interest Rates", "Justification": "Examines interest rate impact on capital management."}}
        ],
        "Sentiment": {{
            "Score": 0.0,
            "Category": "Neutral",
            "Justification": "Factual, unbiased inquiry."
        }}
    }}

    {{
        "Type": "Answer",
        "Text": "Rising interest rates can increase funding costs, but they also create opportunities to enhance net interest income...",
        "Page": 5,
        "Person": "Jeremy Barnum",
        "Role": "Chief Financial Officer",
        "Affiliation": "JPMorgan Chase",
        "Topics": [
            {{"Name": "Interest Rates", "Justification": "Acknowledges both challenges and opportunities."}}
        ],
        "Sentiment": {{
            "Score": 0.2,
            "Category": "Slightly Positive",
            "Justification": "Recognizes cost increases but emphasizes opportunities."
        }}
    }}
    """
}


In [None]:
import os
import openai
import tiktoken
import pandas as pd
import json
import time
from dotenv import load_dotenv
import PyPDF2
import mlflow
import mlflow.pyfunc
from datetime import datetime

# Load environment variables from a .env file; this has to run before the
# os.getenv() lookup below so that OPENAI_API_KEY can come from that file.
load_dotenv()

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Tokenizer initialisation - needed when token counts/costs are used as a performance
# metric. The encoding must match the model that is called.
# TODO: derive the encoding from the model name, e.g. tiktoken.encoding_for_model("gpt-4o-mini").
# "cl100k_base" was suggested by ChatGPT but is the wrong encoding for this model; see
# https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
tokenizer = tiktoken.get_encoding("o200k_base")

# MLflow setup: runs started later are recorded under this experiment name.
mlflow.set_experiment("Topic_Modeling_Extraction")


def count_tokens(text):
    """Count the tokens that the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def extract_roles_and_affiliations(transcript_data):
    """Send the page-tagged transcript to the OpenAI chat API and return the model's reply.

    Args:
        transcript_data: pandas DataFrame of transcript pages. It is serialised
            with ``to_json(orient='records')`` and sent as the user message.

    Returns:
        The model's reply text with surrounding whitespace stripped, or ``None``
        when the request (or any logging step) raises, or when the reply carries
        no text content.

    Side effects:
        Opens one MLflow run per call and logs the model settings, the input
        token count, the request latency, the response token count, and the
        input/output texts as artifacts. On failure the error text is logged as
        the ``error`` param and printed.
    """
    # Convert the structured DataFrame to JSON format
    json_text = transcript_data.to_json(orient='records')

    # prompt_text is a chat-message dict. Interpolating the dict itself would send
    # its Python repr (nested "role" key, escaped newlines) to the model, so only
    # the instruction text stored under "content" is used.
    if isinstance(prompt_text, dict):
        instruction = prompt_text.get("content", "")
    else:
        instruction = str(prompt_text)

    prompt = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": f"**Instruction**: {instruction}"},
            {"role": "user", "content": f"**Input Data**: {json_text}"}
        ],
        "temperature": 0.2,
        #"max_tokens": 16384
    }

    with mlflow.start_run():
        try:
            # Log the request settings up front so that failed runs can still be
            # attributed to a model/temperature; the values come from the request
            # dict so they cannot drift from what is actually sent.
            mlflow.log_param("model", prompt["model"])
            mlflow.log_param("temperature", prompt["temperature"])
            mlflow.log_metric("token_count", count_tokens(json_text))
            mlflow.log_text(json_text, "input_transcript.json")

            # Time only the API round trip, not the local tokenisation/logging.
            start_time = time.time()
            response = client.chat.completions.create(**prompt)
            latency = time.time() - start_time
            mlflow.log_metric("latency_seconds", latency)

            content = response.choices[0].message.content
            if content is None:
                # No text in the reply: report it explicitly instead of
                # failing later inside the tokenizer.
                mlflow.log_param("error", "response contained no text content")
                print("API Error: response contained no text content")
                return None

            mlflow.log_metric("response_length", count_tokens(content))
            mlflow.log_text(content, "output_extracted.json")

            return content.strip()

        except Exception as e:
            # Best-effort by design: the caller treats None as "processing failed".
            mlflow.log_param("error", str(e))
            print(f"API Error: {e}")
            return None



# -- MAIN SCRIPT --
#data = pd.read_csv(r"1_data_and_preprocess/1.0_raw/bank_jp_morgan_chase/JPMorgan_Earnings_Transcripts.csv")

pdf_file_path = Path("data") / "Q1_Transcript-Analyst-Call-25-April-2024.pdf"


# Extract the text of each page, keeping the page number as metadata.
extracted_data = []
with open(pdf_file_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    for page_number, page in enumerate(reader.pages, start=1):  # Start page numbering from 1
        page_text = page.extract_text()
        if page_text:  # pages that yield no text are skipped
            extracted_data.append({"Page Number": page_number, "Text": page_text})


# Convert to DataFrame; the columns are named explicitly so the frame has the
# same shape even when no page yielded any text.
transcript_data = pd.DataFrame(extracted_data, columns=["Page Number", "Text"])

if transcript_data.empty:
    # Nothing to analyse: do not spend an API call on an empty payload.
    print(f"No extractable text found in {pdf_file_path}; skipping the API call.")
    extracted_json = None
else:
    extracted_json = extract_roles_and_affiliations(transcript_data)

if extracted_json:
    output_dir = Path("3_processed_json")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "processed_transcript_test.json"

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(extracted_json)
    print(f"Processing completed successfully. Output saved to: {output_file}")
else:
    print("Processing failed.")

# --- Additional Quality Evaluation Ideas ---
# 1. **Accuracy**: Manually verify the extracted roles, affiliations, and sentiments against ground truth.
# 2. **Completeness**: Ensure all conversation turns are captured and categorized correctly.
# 3. **Consistency**: Check if similar phrases yield similar sentiment scores.
# 4. **Topic Relevance**: Validate that the identified topics are relevant to the context.
# 5. **Token Efficiency**: Monitor the ratio of input tokens to useful output tokens.
# 6. **Error Rate**: Track the number of parsing or formatting errors in the output JSON.
# 7. **Human Feedback**: Collect qualitative feedback from users reviewing the extracted data.
# 8. **Processing Cost**: Log API cost using token usage metrics.
# 9. **Time-to-Completion**: Measure the end-to-end processing time to optimize performance.
# 10. **Edge Cases**: Test with transcripts featuring interruptions, overlaps, or unclear speech.




Processing completed successfully.


In [7]:
transcript_data

Unnamed: 0,Page Number,Text
0,1,\n \n \n1 \n \n \n \n \n \n \n \n \n \n \n ...
1,2,\n \n \n2 \n \n CHRISTIAN SEWING \nSlide 1 ...
2,3,\n \n \n3 \n \n - We increased revenues in o...
3,4,\n \n \n4 \n \n - The revenue increase in FI...
4,5,\n \n \n5 \n \n - We continue to deliver on ...
5,6,\n \n \n6 \n \n - 2023 marked the peak of ou...
6,7,\n \n \n7 \n \n \nSlide 8 – Q1 2024 highlig...
7,8,"\n \n \n8 \n \n - On an absolute basis, net ..."
8,9,\n \n \n9 \n \n - Stage 3 provisions at 471 ...
9,10,\n \n \n10 \n \n Slide 14 – Corporate Bank ...


In [None]:
# Jupyter Notebook Script for Processing JSON Transcript Data

import pandas as pd
import json
import re

# Define file path (update this if needed)
file_path = Path("3_processed_json") / "processed_transcript_test.json"

# Number of "Topic N" column pairs to create (the extraction prompt asks for up to 5 topics).
MAX_TOPICS = 5


def _parse_json_objects(raw_text):
    """Return every top-level JSON object found in ``raw_text``, in order.

    Objects are decoded one at a time starting from each "{" that follows the
    previously decoded object. This accepts a JSON array of objects, objects
    separated by commas and/or any amount of whitespace, and text wrapped in a
    Markdown code fence, because everything between objects is skipped.

    Raises:
        json.JSONDecodeError: if the text starting at a "{" is not valid JSON
            (for example a truncated final object).
    """
    decoder = json.JSONDecoder()
    objects = []
    position = 0
    while True:
        start = raw_text.find("{", position)
        if start == -1:
            return objects
        parsed, position = decoder.raw_decode(raw_text, start)
        objects.append(parsed)


def _dict_field(value, key):
    """Return ``value[key]`` when ``value`` is a dict that has ``key``, else None."""
    return value.get(key) if isinstance(value, dict) else None


def _topic_field(topics, index, key):
    """Return field ``key`` of the ``index``-th topic, or None when it is absent."""
    if isinstance(topics, list) and len(topics) > index:
        return _dict_field(topics[index], key)
    return None


# Read the file content
with open(file_path, "r", encoding="utf-8") as file:
    file_content = file.read().strip()

try:
    # Load the JSON objects (fences, array brackets and separators are skipped by the parser)
    json_data = _parse_json_objects(file_content)

    # Convert JSON data to Pandas DataFrame
    df = pd.DataFrame(json_data)

    # Ensure 'Page' is numeric; missing or non-numeric pages become <NA> instead
    # of aborting the whole conversion.
    if "Page" in df.columns:
        df["Page"] = pd.to_numeric(df["Page"], errors="coerce").astype("Int64")

    # Define column mapping to rename columns for readability
    column_mapping = {
        "Page": "Page Number",
        "Text": "Extracted Text",
        "Type": "Question/Answer",
        "Person": "Name",
        "Role": "Role",
        "Affiliation": "Affiliation",
        "Sentiment": "Sentiment",
        "Topics": "Topics"
    }

    # Filter only available columns
    available_columns = [col for col in column_mapping if col in df.columns]

    # Rename and reorder columns
    df_filtered = df[available_columns].rename(columns=column_mapping)

    # Sort by page number for logical flow. A stable sort keeps questions and
    # answers that share a page in their original (spoken) order.
    if "Page Number" in df_filtered.columns:
        df_filtered = df_filtered.sort_values(by="Page Number", kind="mergesort")

    # Extract sentiment details into separate columns
    if "Sentiment" in df_filtered.columns:
        for field in ("Score", "Category", "Justification"):
            df_filtered[f"Sentiment {field}"] = df_filtered["Sentiment"].apply(
                lambda sentiment, field=field: _dict_field(sentiment, field)
            )

    # Expand topics into separate columns
    if "Topics" in df_filtered.columns:
        for i in range(MAX_TOPICS):
            df_filtered[f"Topic {i+1}"] = df_filtered["Topics"].apply(
                lambda topics, i=i: _topic_field(topics, i, "Name")
            )
            df_filtered[f"Topic {i+1} Justification"] = df_filtered["Topics"].apply(
                lambda topics, i=i: _topic_field(topics, i, "Justification")
            )

    # Drop original nested columns
    df_filtered = df_filtered.drop(columns=["Sentiment", "Topics"], errors="ignore")

    # Display the structured DataFrame
    from IPython.display import display
    display(df_filtered)

except json.JSONDecodeError as e:
    print(f"JSON Decode Error after restructuring: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")


Unnamed: 0,Page Number,Extracted Text,Question/Answer,Name,Role,Affiliation,Sentiment Score,Sentiment Category,Sentiment Justification,Topic 1,Topic 1 Justification,Topic 2,Topic 2 Justification,Topic 3,Topic 3 Justification,Topic 4,Topic 4 Justification,Topic 5,Topic 5 Justification
0,15,"First of all, on the revenues. €32 billion in ...",Question,Kian Abouhossein,Analyst,JP Morgan,0.0,Neutral,Factual inquiry about financial targets and as...,Revenue Targets,Inquires about achieving revenue goals.,Cost Management,Seeks details on cost assumptions and targets.,Bank Levies,Questions assumptions regarding bank levies.,,,,
1,15,"Let me start actually on the journey in 2024, ...",Answer,Christian Sewing,Chief Executive Officer,Deutsche Bank,0.3,Slightly Positive,Optimistic about achieving revenue targets and...,Revenue Growth,Discusses progress towards revenue targets.,Business Stability,Highlights stability in key business areas.,Net Interest Income,Mentions positive performance of NII.,,,,
2,18,"So Kian, on expenses, we talk a lot about run ...",Answer,James von Moltke,Chief Financial Officer,Deutsche Bank,0.2,Slightly Positive,Confident in managing costs and achieving targ...,Cost Management,Discusses strategies for managing expenses.,Bank Levies,Provides details on expected bank levy costs.,Non-operating Costs,Mentions restructuring and litigation costs.,,,,
3,19,"The first one is on capital. Now, we start wit...",Question,Anke Reingen,Analyst,RBC,0.0,Neutral,Factual inquiry about capital and loan loss ma...,Capital Management,Inquires about CET1 ratio and buybacks.,Regulatory Impact,Questions potential regulatory headwinds.,Loan Losses,Seeks clarity on loan loss expectations.,,,,
4,20,"Thanks, Anke. So, look, the target that we’ve ...",Answer,James von Moltke,Chief Financial Officer,Deutsche Bank,0.1,Neutral,Provides a factual update on capital managemen...,Capital Management,Discusses CET1 ratio and capital optimisation.,Regulatory Impact,Mentions Basel III impacts.,Stock Buybacks,Addresses stock buyback considerations.,,,,
5,20,"On the loan losses, we talk about three things...",Answer,James von Moltke,Chief Financial Officer,Deutsche Bank,0.2,Slightly Positive,Optimistic about improvements in loan losses a...,Loan Losses,Discusses expectations for loan loss improveme...,Commercial Real Estate,Mentions expected improvements in CRE.,Wealth Management,Highlights potential recoveries in Wealth Mana...,,,,
6,21,"I have two, please. The first one will be on N...",Question,Nicolas Payen,Analyst,Kepler Cheuvreux,0.0,Neutral,Factual inquiry about NII and share buyback pl...,Net Interest Income,Seeks conditions for exceeding NII guidance.,Interest Rates,Questions impact of interest rates on NII.,Share Buybacks,Inquires about share buyback plans.,,,,
7,21,"Thanks, Nicolas. So on the net interest income...",Answer,James von Moltke,Chief Financial Officer,Deutsche Bank,0.2,Slightly Positive,Optimistic about exceeding NII guidance due to...,Net Interest Income,Discusses potential for exceeding NII guidance.,Deposit Margins,Mentions improvement in deposit margins.,Funding Costs,Highlights better funding costs as a driver.,,,,
8,22,"Nicolas, on the buybacks, nothing changed from...",Answer,Christian Sewing,Chief Executive Officer,Deutsche Bank,0.3,Slightly Positive,Confident in achieving share buyback goals and...,Share Buybacks,Reaffirms commitment to share buyback plans.,Shareholder Value,Emphasizes focus on creating shareholder value.,,,,,,
9,22,One thing which surprises me a little bit is t...,Question,Giulia Aurora Miotto,Analyst,Morgan Stanley,-0.1,Slightly Negative,Expresses concern over lack of deleveraging an...,Deleveraging,Questions lack of deleveraging in exposure.,Litigation Costs,Inquires about expected increase in litigation...,,,,,,


In [5]:
# Script to view json output of previous cell - currently file goes to the root folder 

import re
import pandas as pd
import json

# Define file path
# NOTE(review): the extraction cell in this notebook writes its output under
# "3_processed_json/", whereas this path points at the working directory -
# confirm which file this viewer is meant to read.
file_path = "processed_transcript_test.json"

# Read the file content
with open(file_path, "r", encoding="utf-8") as file:
    file_content = file.read().strip()

# Remove potential Markdown-style code blocks
if file_content.startswith("```json") and file_content.endswith("```"):
    file_content = file_content[7:-3].strip()

try:
    # Decode the JSON objects one after another. A regex such as r'\{.*?\}' stops
    # at the first closing brace, which cuts any object containing a nested object
    # (e.g. "Topics" or "Sentiment") into invalid JSON. raw_decode parses a
    # complete object, nested values included, and reports where it ended;
    # anything between objects (commas, brackets, whitespace) is skipped.
    decoder = json.JSONDecoder()
    json_data = []
    position = 0
    while True:
        object_start = file_content.find("{", position)
        if object_start == -1:
            break
        parsed_object, position = decoder.raw_decode(file_content, object_start)
        json_data.append(parsed_object)

    # Convert JSON data to Pandas DataFrame
    df = pd.DataFrame(json_data)

    # Define the desired column order and names
    column_mapping = {
        "Type": "Question/Answer",
        "Text": "Text",
        "Person": "Name",
        "Role": "Role",
        "Affiliation": "Affiliation",
        "Topic 1": "Topic 1",
        "Topic 1 Justification": "Topic 1 Justification",
        "Topic 2": "Topic 2",
        "Topic 2 Justification": "Topic 2 Justification",
        "Topic 3": "Topic 3",
        "Topic 3 Justification": "Topic 3 Justification",
        "Topic 4": "Topic 4",
        "Topic 4 Justification": "Topic 4 Justification",
        "Topic 5": "Topic 5",
        "Topic 5 Justification": "Topic 5 Justification",
        "Sentiment Score": "Sentiment Score",
        "Sentiment Category": "Sentiment Category",
        "Sentiment Justification": "Sentiment Justification"
    }

    # Filter only available columns
    available_columns = [col for col in column_mapping if col in df.columns]

    # Rename and reorder columns
    df_filtered = df[available_columns].rename(columns=column_mapping)

except json.JSONDecodeError as e:
    print(f"JSON Decode Error after restructuring: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")


In [6]:
df_filtered

Unnamed: 0,Question/Answer,Text,Name,Role,Affiliation,Topic 1,Topic 1 Justification,Topic 2,Topic 2 Justification,Topic 3,Topic 3 Justification,Sentiment Score,Sentiment Category,Sentiment Justification
0,Question,"Two questions, first of all, on the revenues. ...",Kian Abouhossein,Analyst,JP Morgan,Revenue Targets,The question seeks clarification on the revenu...,Cost Management,Inquires about the assumptions and strategies ...,,,0.0,Neutral,The question is factual and seeks detailed inf...
1,Answer,"Thank you, Kian, and thank you for your questi...",Christian Sewing,Chief Executive Officer,Deutsche Bank,Revenue Targets,Discusses the path to achieving €32 billion re...,Cost Management,Addresses cost management strategies and assum...,Market Share,Mentions market share gains in various divisio...,0.3,Slightly Positive,The response is optimistic about achieving tar...
2,Answer,"So Kian, on expenses, we talk a lot about run ...",James von Moltke,Chief Financial Officer,Deutsche Bank,Cost Management,Discusses strategies and assumptions for achie...,Operational Efficiency,Mentions ongoing efforts in process simplifica...,,,0.2,Slightly Positive,The response is cautiously optimistic about ac...
3,Question,"The first one is on capital. Now, we start wit...",Anke Reingen,Analyst,RBC,Capital Management,Inquires about CET1 ratio targets and room for...,Regulatory Impact,Asks about potential regulatory headwinds impa...,Loan Loss Provisions,Seeks clarity on the expected decline in loan ...,0.0,Neutral,The question is factual and seeks detailed inf...
4,Answer,"Thanks, Anke. So, look, the target that we’ve ...",James von Moltke,Chief Financial Officer,Deutsche Bank,Capital Management,Discusses CET1 ratio targets and capital optim...,Loan Loss Provisions,Explains expectations for loan loss provisions...,Commercial Real Estate,Addresses the state of the commercial real est...,0.1,Neutral,The response is factual and provides detailed ...
5,Question,"I have two, please. The first one will be on N...",Nicolas Payen,Analyst,Kepler Cheuvreux,Net Interest Income,Seeks clarification on conditions and expectat...,Share Buybacks,Inquires about updates on share buyback plans ...,,,0.0,Neutral,The question is factual and seeks detailed inf...
6,Answer,"Thanks, Nicolas. So on the net interest income...",James von Moltke,Chief Financial Officer,Deutsche Bank,Net Interest Income,Discusses conditions and expectations for exce...,Share Buybacks,Provides updates on share buyback plans and re...,,,0.2,Slightly Positive,The response is cautiously optimistic about ex...
7,Question,"My first question, I want to go back onto the ...",Giulia Aurora Miotto,Analyst,Morgan Stanley,Commercial Real Estate,Questions the lack of reduction in commercial ...,Litigation Costs,Inquires about the expected increase in litiga...,,,-0.1,Slightly Negative,The question implies concern about the lack of...
8,Answer,"So Giulia, on the second question, mostly to d...",James von Moltke,Chief Financial Officer,Deutsche Bank,Commercial Real Estate,Explains the reasons for the stable commercial...,Litigation Costs,Clarifies the expected increase in litigation ...,,,0.0,Neutral,The response is factual and provides explanati...
9,Question,"The first one, just picking up on CRE again, t...",Jeremy Sigee,Analyst,BNP Paribas Exane,Commercial Real Estate,Questions the health and nature of modificatio...,Cost Management,Inquires about the year-on-year increase in ad...,,,-0.1,Slightly Negative,The question implies concern about the nature ...
