# Jamie Dimon Response Simulation - Four Specifications

This notebook implements four different approaches to generate Jamie Dimon's responses:
1. **Spec 1**: Random 500 Q&A pairs
2. **Spec 2**: Most Recent 500 Q&A pairs
3. **Spec 3**: Most Recent 500 Q&A + 10-Q Summary
4. **Spec 4**: Persona Summary + 10-Q Summary

In [13]:
# Import libraries
import pandas as pd
import json
import random
from openai import OpenAI
import tiktoken
from datetime import datetime
from tqdm import tqdm
import os

## 1. Load and Prepare Data

In [14]:
# Read the raw transcript export that the training Q&A pairs are later extracted from
# (per the notebook's narrative: historical calls before 2025)
df = pd.read_csv("../Data/CIQ transcripts/46625H100.csv", low_memory=False)

# Quick sanity check on size and schema before any processing
print(f"Total rows in CSV: {df.shape[0]}")
print(f"Columns: {list(df.columns)}")

Total rows in CSV: 68866
Columns: ['transcriptcreationdateutc', 'companyid', 'companyname', 'keydevid', 'headline', 'mostimportantdateutc', 'transcriptid', 'transcriptcollectiontypeid', 'transcriptpresentationtypeid', 'transcriptcomponentid', 'componentorder', 'transcriptcomponenttypeid', 'transcriptcomponenttypename', 'transcriptpersonname', 'speaker_companyname', 'speakertypeid', 'speakertypename', 'componenttext']


In [15]:
# Collapse repeated transcript versions and narrow the frame to Q&A components
print("\n=== Deduplicating Transcripts ===")

# Derive the event date (calendar day) and the transcript creation timestamp
df['call_date'] = pd.to_datetime(df['mostimportantdateutc']).dt.date
df['creation_date'] = pd.to_datetime(df['transcriptcreationdateutc'])

# For every event date, keep only the rows carrying the newest creation timestamp
latest_transcripts = [
    rows[rows['creation_date'] == rows['creation_date'].max()]
    for _, rows in df.groupby('call_date')
]

df = pd.concat(latest_transcripts, ignore_index=True)
print(f"Rows after deduplication: {len(df)}")

# Keep just the columns the Q&A extraction needs; 'call_date' becomes 'date'
qa_columns = [
    'call_date',
    'transcriptcomponenttypename',
    'transcriptpersonname',
    'speakertypename',
    'componenttext',
]
df = df[qa_columns].rename(columns={'call_date': 'date'})
# Normalise speaker names so later substring matching is case/whitespace insensitive
df['transcriptpersonname'] = df['transcriptpersonname'].str.lower().str.strip()

# Restrict to the question-and-answer portion of each transcript
is_qa_component = df['transcriptcomponenttypename'].isin(['Question', 'Answer'])
qa_df = df[is_qa_component].reset_index(drop=True)

component_counts = qa_df['transcriptcomponenttypename'].value_counts()
print(f"\nTotal Q&A components: {len(qa_df)}")
print(f"  Questions: {component_counts.get('Question', 0)}")
print(f"  Answers: {component_counts.get('Answer', 0)}")


=== Deduplicating Transcripts ===
Rows after deduplication: 18635

Total Q&A components: 15947
  Questions: 6004
  Answers: 9943


In [16]:
# Extract Q&A pairs: every Jamie Dimon answer that directly follows an analyst question
# within the same event date becomes one {'date', 'question', 'answer'} record.
train_qa_pairs = []

for date, group in qa_df.groupby('date'):
    # Pull the needed columns out once per group instead of doing a .loc lookup per row
    component_types = group['transcriptcomponenttypename'].tolist()
    speakers = group['transcriptpersonname'].tolist()
    speaker_roles = group['speakertypename'].tolist()
    texts = group['componenttext'].tolist()

    # Start at 1: the first component of a call has no preceding question to pair with
    for i in range(1, len(texts)):
        is_dimon_answer = (
            component_types[i] == 'Answer'
            and 'james dimon' in str(speakers[i]).lower()
        )
        follows_analyst_question = (
            component_types[i - 1] == 'Question'
            and speaker_roles[i - 1] == 'Analysts'
        )
        if not (is_dimon_answer and follows_analyst_question):
            continue

        question, answer = texts[i - 1], texts[i]
        # A missing componenttext is read by pandas as NaN (a float); skip such rows
        # rather than letting .strip() raise and abort the whole extraction.
        if not isinstance(question, str) or not isinstance(answer, str):
            continue

        train_qa_pairs.append({
            'date': str(date),
            'question': question.strip(),
            'answer': answer.strip()
        })

pair_dates = [pair['date'] for pair in train_qa_pairs]

print(f"\n=== Training Data Summary ===")
print(f"Total Q&A pairs extracted: {len(train_qa_pairs)}")
print(f"Unique call dates: {len(set(pair_dates))}")
if train_qa_pairs:
    print(f"Date range: {min(pair_dates)} to {max(pair_dates)}")


=== Training Data Summary ===
Total Q&A pairs extracted: 2115
Unique call dates: 101
Date range: 2007-07-18 to 2024-10-11


In [17]:
# Filter to only pre-2025 data for training.
# Dates are ISO 'YYYY-MM-DD' strings (built with str(date) above), so plain string
# comparison orders them chronologically.
train_qa_pairs_pre_2025 = [pair for pair in train_qa_pairs if pair['date'] < '2025-01-01']
print(f"\nPre-2025 Q&A pairs: {len(train_qa_pairs_pre_2025)}")
# Guard the range print: min()/max() raise ValueError on an empty list
if train_qa_pairs_pre_2025:
    pre_2025_dates = [pair['date'] for pair in train_qa_pairs_pre_2025]
    print(f"Date range: {min(pre_2025_dates)} to {max(pre_2025_dates)}")


Pre-2025 Q&A pairs: 2115
Date range: 2007-07-18 to 2024-10-11


In [18]:
# Load test data (2025 Q1-Q3): one JSON object per line
test_qa_pairs = []
with open("../Processed Data/test_qa_pairs.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        test_qa_pairs.append(json.loads(line))

print(f"\n=== Testing Data Summary ===")
print(f"Testing Q&A pairs loaded: {len(test_qa_pairs)}")
print(f"Test data quarters: {set([pair['quarter'] for pair in test_qa_pairs])}")


=== Testing Data Summary ===
Testing Q&A pairs loaded: 43
Test data quarters: {'2025Q1', '2025Q2', '2025Q3'}


In [19]:
# Read the pre-computed 10-Q summaries from disk
with open("../Processed Data/10Q_summaries.json", "r", encoding="utf-8") as f:
    summaries_data = json.load(f)

# Index the summary text by its quarter label for direct lookup later on
summaries_dict = {
    entry['quarter']: entry['summary']
    for entry in summaries_data['summaries']
}

print(f"\n=== 10-Q Summaries Loaded ===")
print(f"Available quarters: {list(summaries_dict)}")


=== 10-Q Summaries Loaded ===
Available quarters: ['2025Q1', '2025Q2', '2025Q3']


## 2. Utility Functions

In [20]:
def count_tokens(text, model="gpt-4o"):
    """
    Return the length of `text` measured in tokens.

    The tokenizer is whichever encoding tiktoken associates with `model`.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def save_results(results, spec_name, output_dir="../Results"):
    """
    Write `results` as pretty-printed UTF-8 JSON and return the path written.

    The file is named `<spec_name>_<YYYYmmdd_HHMMSS>.json` inside `output_dir`,
    which is created first if it does not exist. The destination path is also
    printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Second-resolution timestamp keeps successive runs from overwriting each other
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(output_dir, f"{spec_name}_{stamp}.json")

    with open(filepath, 'w', encoding='utf-8') as out_file:
        # ensure_ascii=False stores non-ASCII characters as-is instead of \u escapes
        json.dump(results, out_file, indent=2, ensure_ascii=False)

    print(f"Results saved to: {filepath}")
    return filepath

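## 3. Initialize OpenAI Client

The cells below expect a module-level `client` (an `openai.OpenAI` instance) to exist. Create it before running them, e.g. `client = OpenAI()`, with the API key supplied through the `OPENAI_API_KEY` environment variable rather than hard-coded in the notebook.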

## 4. Specification 1: Random 500 Q&A Pairs

In [22]:
def build_prompt_spec1(past_qa_pairs, new_question, max_tokens=120000):
    """
    Assemble the few-shot prompt used by Specifications 1 and 2.

    Past Q&A pairs are appended in the order given until adding the next one
    would push the running token total above `max_tokens`; the first pair that
    does not fit ends the scan.

    Returns a tuple `(prompt, total_tokens, examples_used)`, where
    `total_tokens` is the sum of the separately counted header, footer and
    included examples.
    """
    intro = (
        "You are Jamie Dimon. Based on your past earnings call responses, answer the following analyst question in your usual tone and style.\n\n"
        "PAST RESPONSES:\n"
    )
    closing = f"\nCURRENT QUESTION:\nQ: {new_question}\n\nYour Answer:"

    # The instruction header and the question footer are always present
    spent = count_tokens(intro) + count_tokens(closing)

    kept_examples = []
    for qa in past_qa_pairs:
        rendered = f"Q: {qa['question']}\nA: {qa['answer']}\n\n"
        cost = count_tokens(rendered)

        # Stop at the first example that would overflow the budget
        if spent + cost > max_tokens:
            break

        kept_examples.append(rendered)
        spent += cost

    return intro + "".join(kept_examples) + closing, spent, len(kept_examples)

In [23]:
# Specification 1: Random 500 Q&A
# Few-shot prompt built from a seeded random sample of the pre-2025 Q&A pairs.
# Relies on `client` (the OpenAI client) already being defined in the kernel.
print("\n" + "="*80)
print("SPECIFICATION 1: Random 500 Q&A Pairs")
print("="*80)

# Randomly sample 500 Q&A pairs from pre-2025 data
random.seed(42)  # For reproducibility
random_500_qa = random.sample(train_qa_pairs_pre_2025, min(500, len(train_qa_pairs_pre_2025)))

print(f"\nSampled {len(random_500_qa)} random Q&A pairs")
# min()/max() raise on an empty list, so only report a range when something was sampled
if random_500_qa:
    sampled_dates = [pair['date'] for pair in random_500_qa]
    print(f"Date range: {min(sampled_dates)} to {max(sampled_dates)}")

# Generate responses for all test questions
spec1_results = {
    "specification": "spec1_random_500",
    "description": "Random 500 Q&A pairs from historical data",
    "training_samples": len(random_500_qa),
    "test_samples": len(test_qa_pairs),
    "generated_date": datetime.now().isoformat(),
    "results": []
}

print(f"\nGenerating responses for {len(test_qa_pairs)} test questions...")

for idx, test_pair in enumerate(tqdm(test_qa_pairs, desc="Spec 1")):
    # Fields shared by success and failure records. Built before the API call so a
    # malformed test row fails immediately instead of raising inside the handler.
    record = {
        "test_id": idx,
        "quarter": test_pair['quarter'],
        "date": test_pair['date'],
        "analyst": test_pair['analyst'],
        "question": test_pair['question'],
        "ground_truth_answer": test_pair['answer'],
    }

    try:
        # Build prompt
        prompt, token_count, examples_used = build_prompt_spec1(
            random_500_qa,
            test_pair['question']
        )

        # Generate response
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        generated_answer = response.choices[0].message.content

        record.update({
            "generated_answer": generated_answer,
            "prompt_tokens": token_count,
            "examples_used": examples_used
        })

    except Exception as e:
        # Best effort: keep going, but mark the row so it can be excluded downstream
        print(f"\nError processing test question {idx}: {str(e)}")
        record.update({
            "generated_answer": f"ERROR: {str(e)}",
            "error": str(e)
        })

    spec1_results['results'].append(record)

# Rows carrying an 'error' key hold a placeholder, not a generated answer
spec1_failures = sum(1 for result in spec1_results['results'] if 'error' in result)

# Save results
spec1_filepath = save_results(spec1_results, "spec1_random_500")
print(f"\nSpec 1 completed! {len(spec1_results['results']) - spec1_failures} responses generated.")
if spec1_failures:
    print(f"WARNING: {spec1_failures} question(s) failed and were saved with an 'error' field.")


SPECIFICATION 1: Random 500 Q&A Pairs

Sampled 500 random Q&A pairs
Date range: 2007-07-18 to 2024-10-11

Generating responses for 43 test questions...


Spec 1: 100%|██████████| 43/43 [05:55<00:00,  8.27s/it]

Results saved to: ../Results/spec1_random_500_20251110_180802.json

Spec 1 completed! 43 responses generated.





## 5. Specification 2: Most Recent 500 Q&A Pairs

In [24]:
# Specification 2: Most Recent 500 Q&A
# Same prompt format as Specification 1, but the examples are the newest pre-2025 pairs.
# Relies on `client` (the OpenAI client) already being defined in the kernel.
print("\n" + "="*80)
print("SPECIFICATION 2: Most Recent 500 Q&A Pairs")
print("="*80)

# Sort by date and take the most recent 500
# (dates are ISO 'YYYY-MM-DD' strings, so string order is chronological order)
sorted_qa = sorted(train_qa_pairs_pre_2025, key=lambda x: x['date'], reverse=True)
recent_500_qa = sorted_qa[:500]

print(f"\nSelected {len(recent_500_qa)} most recent Q&A pairs")
# min()/max() raise on an empty list, so only report a range when something was selected
if recent_500_qa:
    recent_dates = [pair['date'] for pair in recent_500_qa]
    print(f"Date range: {min(recent_dates)} to {max(recent_dates)}")

# Generate responses
spec2_results = {
    "specification": "spec2_recent_500",
    "description": "Most recent 500 Q&A pairs prior to 2025",
    "training_samples": len(recent_500_qa),
    "test_samples": len(test_qa_pairs),
    "generated_date": datetime.now().isoformat(),
    "results": []
}

print(f"\nGenerating responses for {len(test_qa_pairs)} test questions...")

for idx, test_pair in enumerate(tqdm(test_qa_pairs, desc="Spec 2")):
    # Fields shared by success and failure records. Built before the API call so a
    # malformed test row fails immediately instead of raising inside the handler.
    record = {
        "test_id": idx,
        "quarter": test_pair['quarter'],
        "date": test_pair['date'],
        "analyst": test_pair['analyst'],
        "question": test_pair['question'],
        "ground_truth_answer": test_pair['answer'],
    }

    try:
        # Build prompt (same as Spec 1, but with recent_500_qa)
        prompt, token_count, examples_used = build_prompt_spec1(
            recent_500_qa,
            test_pair['question']
        )

        # Generate response
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        generated_answer = response.choices[0].message.content

        record.update({
            "generated_answer": generated_answer,
            "prompt_tokens": token_count,
            "examples_used": examples_used
        })

    except Exception as e:
        # Best effort: keep going, but mark the row so it can be excluded downstream
        print(f"\nError processing test question {idx}: {str(e)}")
        record.update({
            "generated_answer": f"ERROR: {str(e)}",
            "error": str(e)
        })

    spec2_results['results'].append(record)

# Rows carrying an 'error' key hold a placeholder, not a generated answer
spec2_failures = sum(1 for result in spec2_results['results'] if 'error' in result)

# Save results
spec2_filepath = save_results(spec2_results, "spec2_recent_500")
print(f"\nSpec 2 completed! {len(spec2_results['results']) - spec2_failures} responses generated.")
if spec2_failures:
    print(f"WARNING: {spec2_failures} question(s) failed and were saved with an 'error' field.")


SPECIFICATION 2: Most Recent 500 Q&A Pairs

Selected 500 most recent Q&A pairs
Date range: 2019-09-10 to 2024-10-11

Generating responses for 43 test questions...


Spec 2: 100%|██████████| 43/43 [05:27<00:00,  7.62s/it]

Results saved to: ../Results/spec2_recent_500_20251110_181330.json

Spec 2 completed! 43 responses generated.





## 6. Specification 3: Most Recent 500 Q&A + 10-Q Summary

In [25]:
def build_prompt_spec3(past_qa_pairs, new_question, quarter_summary, max_tokens=120000):
    """
    Assemble the Specification 3 prompt: 10-Q summary plus few-shot Q&A examples.

    The header carries the instruction and `quarter_summary`; past Q&A pairs
    are then appended in the order given while the running token total stays
    within `max_tokens`. The first pair that does not fit ends the scan.

    Returns a tuple `(prompt, total_tokens, examples_used)`, where
    `total_tokens` is the sum of the separately counted header, footer and
    included examples.
    """
    header_parts = [
        "You are Jamie Dimon. Based on your past earnings call responses and the current quarter's financial report, answer the following analyst question in your usual tone and style.\n\n",
        # The quarter's filing summary goes ahead of the historical examples
        f"CURRENT QUARTER 10-Q SUMMARY:\n{quarter_summary}\n\n",
        "PAST RESPONSES:\n",
    ]
    opening = "".join(header_parts)
    ending = f"\nCURRENT QUESTION:\nQ: {new_question}\n\nYour Answer:"

    # Header and footer are fixed cost; examples fill whatever budget remains
    running_total = count_tokens(opening) + count_tokens(ending)

    included = []
    for qa in past_qa_pairs:
        snippet = f"Q: {qa['question']}\nA: {qa['answer']}\n\n"
        snippet_tokens = count_tokens(snippet)

        if running_total + snippet_tokens <= max_tokens:
            included.append(snippet)
            running_total += snippet_tokens
        else:
            # Budget exhausted: later pairs are not considered
            break

    prompt = opening + "".join(included) + ending
    return prompt, running_total, len(included)

In [26]:
# Specification 3: Most Recent 500 Q&A + 10-Q Summary
# Reuses `recent_500_qa` from Specification 2 and adds the matching quarter's 10-Q summary.
# Relies on `client` (the OpenAI client) already being defined in the kernel.
print("\n" + "="*80)
print("SPECIFICATION 3: Most Recent 500 Q&A + 10-Q Summary")
print("="*80)

spec3_results = {
    "specification": "spec3_recent_500_plus_10q",
    "description": "Most recent 500 Q&A pairs + quarterly 10-Q summary",
    "training_samples": len(recent_500_qa),
    "test_samples": len(test_qa_pairs),
    "generated_date": datetime.now().isoformat(),
    "results": []
}

print(f"\nGenerating responses for {len(test_qa_pairs)} test questions...")

for idx, test_pair in enumerate(tqdm(test_qa_pairs, desc="Spec 3")):
    # Fields shared by success and failure records. Built before the API call so a
    # malformed test row fails immediately instead of raising inside the handler.
    record = {
        "test_id": idx,
        "quarter": test_pair['quarter'],
        "date": test_pair['date'],
        "analyst": test_pair['analyst'],
        "question": test_pair['question'],
        "ground_truth_answer": test_pair['answer'],
    }

    try:
        # Get the appropriate 10-Q summary for this quarter
        quarter = test_pair['quarter']
        has_summary = quarter in summaries_dict
        quarter_summary = summaries_dict.get(quarter, "No 10-Q summary available for this quarter.")

        # Build prompt with 10-Q summary
        prompt, token_count, examples_used = build_prompt_spec3(
            recent_500_qa,
            test_pair['question'],
            quarter_summary
        )

        # Generate response
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        generated_answer = response.choices[0].message.content

        record.update({
            "generated_answer": generated_answer,
            "prompt_tokens": token_count,
            "examples_used": examples_used,
            # False when the placeholder text was sent instead of a real summary
            "10q_summary_included": has_summary
        })

    except Exception as e:
        # Best effort: keep going, but mark the row so it can be excluded downstream
        print(f"\nError processing test question {idx}: {str(e)}")
        record.update({
            "generated_answer": f"ERROR: {str(e)}",
            "error": str(e)
        })

    spec3_results['results'].append(record)

# Rows carrying an 'error' key hold a placeholder, not a generated answer
spec3_failures = sum(1 for result in spec3_results['results'] if 'error' in result)

# Save results
spec3_filepath = save_results(spec3_results, "spec3_recent_500_plus_10q")
print(f"\nSpec 3 completed! {len(spec3_results['results']) - spec3_failures} responses generated.")
if spec3_failures:
    print(f"WARNING: {spec3_failures} question(s) failed and were saved with an 'error' field.")


SPECIFICATION 3: Most Recent 500 Q&A + 10-Q Summary

Generating responses for 43 test questions...


Spec 3: 100%|██████████| 43/43 [06:40<00:00,  9.32s/it]

Results saved to: ../Results/spec3_recent_500_plus_10q_20251110_182010.json

Spec 3 completed! 43 responses generated.





## 7. Specification 4: Persona Summary + 10-Q Summary

In [27]:
# Step 1 of Specification 4: distil the most recent Q&A pairs into a persona description
# that stands in for the raw examples in the final prompt.
print("\n" + "="*80)
print("SPECIFICATION 4: Persona Summary + 10-Q Summary")
print("="*80)

print("\nStep 1: Generating Jamie Dimon's persona summary...")

# Instruction block that opens the persona-extraction prompt
persona_prompt = (
    "Based on the following Q&A exchanges from Jamie Dimon's earnings call responses, \n"
    "create a concise persona summary that captures his communication style, tone, key themes, \n"
    "and typical response patterns. Your analysis may include, but not be limited to, the following dimensions:\n"
    "1. Communication style and tone\n"
    "2. Common themes and topics he emphasizes\n"
    "3. How he structures his responses\n"
    "4. His values and perspectives\n"
    "5. Characteristic phrases or expressions\n"
    "\n"
    "Q&A EXAMPLES:\n"
)

# Append recent Q&A pairs, newest first, until the token budget is used up
max_tokens_for_persona = 100000
current_tokens = count_tokens(persona_prompt)

persona_examples = []
for pair in recent_500_qa:
    example_text = f"Q: {pair['question']}\nA: {pair['answer']}\n\n"
    example_tokens = count_tokens(example_text)

    # The first pair that would overflow the budget ends the scan
    if current_tokens + example_tokens > max_tokens_for_persona:
        break

    persona_examples.append(example_text)
    current_tokens += example_tokens

persona_prompt += "".join(persona_examples)
persona_prompt += "\nBased on these examples, provide a comprehensive persona summary:"

# Ask the model for the persona summary
print(f"Generating persona from {current_tokens:,} tokens...")

persona_response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": persona_prompt}],
    temperature=0.7
)

jamie_persona = persona_response.choices[0].message.content

# Show the generated persona between rule lines
print(
    "\n" + "="*80,
    "JAMIE DIMON PERSONA SUMMARY:",
    "="*80,
    jamie_persona,
    "="*80,
    sep="\n",
)


SPECIFICATION 4: Persona Summary + 10-Q Summary

Step 1: Generating Jamie Dimon's persona summary...
Generating persona from 99,969 tokens...

JAMIE DIMON PERSONA SUMMARY:
### Persona Summary: Jamie Dimon

**1. Communication Style and Tone:**
Jamie Dimon’s communication style is direct, candid, and often informal. He uses plain language and is unafraid to express his opinions strongly, even when discussing complex financial topics. His tone is confident and authoritative, reflecting his deep knowledge and experience in the banking sector. Dimon is also personable and occasionally injects humor into his responses, making his communication approachable despite the technical nature of the discussions.

**2. Common Themes and Topics He Emphasizes:**
Dimon frequently emphasizes the importance of serving clients, maintaining a strong balance sheet, and being prepared for economic uncertainties. He often discusses the competitive landscape, highlighting the challenges from fintech and big te

In [28]:
def build_prompt_spec4(persona_summary, new_question, quarter_summary, max_tokens=120000):
    """
    Assemble the Specification 4 prompt: persona summary plus 10-Q summary.

    No historical Q&A examples are included; the persona text replaces them.
    `max_tokens` is accepted but not applied here: the prompt is never
    truncated.

    Returns a tuple `(prompt, token_count)`.
    """
    # Each section ends with its own newline; joining on "\n" inserts the blank
    # line that separates consecutive sections.
    sections = [
        "You are Jamie Dimon, CEO of JPMorgan Chase. \n",
        f"YOUR COMMUNICATION STYLE AND PERSONA:\n{persona_summary}\n",
        f"CURRENT QUARTER 10-Q SUMMARY:\n{quarter_summary}\n",
        f"CURRENT QUESTION:\nQ: {new_question}\n",
        "Your Answer:",
    ]
    prompt = "\n".join(sections)

    return prompt, count_tokens(prompt)

In [29]:
# Specification 4: Generate responses using persona + 10-Q
# Uses `jamie_persona` from Step 1 in place of raw Q&A examples.
# Relies on `client` (the OpenAI client) already being defined in the kernel.
print("\nStep 2: Generating responses using persona summary + 10-Q summaries...")

spec4_results = {
    "specification": "spec4_persona_plus_10q",
    "description": "Persona summary from 500 most recent Q&A + quarterly 10-Q summary",
    "persona_summary": jamie_persona,
    "test_samples": len(test_qa_pairs),
    "generated_date": datetime.now().isoformat(),
    "results": []
}

print(f"\nGenerating responses for {len(test_qa_pairs)} test questions...")

for idx, test_pair in enumerate(tqdm(test_qa_pairs, desc="Spec 4")):
    # Fields shared by success and failure records. Built before the API call so a
    # malformed test row fails immediately instead of raising inside the handler.
    record = {
        "test_id": idx,
        "quarter": test_pair['quarter'],
        "date": test_pair['date'],
        "analyst": test_pair['analyst'],
        "question": test_pair['question'],
        "ground_truth_answer": test_pair['answer'],
    }

    try:
        # Get the appropriate 10-Q summary for this quarter
        quarter = test_pair['quarter']
        has_summary = quarter in summaries_dict
        quarter_summary = summaries_dict.get(quarter, "No 10-Q summary available for this quarter.")

        # Build prompt with persona + 10-Q summary
        prompt, token_count = build_prompt_spec4(
            jamie_persona,
            test_pair['question'],
            quarter_summary
        )

        # Generate response
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        generated_answer = response.choices[0].message.content

        record.update({
            "generated_answer": generated_answer,
            "prompt_tokens": token_count,
            "persona_summary_included": True,
            # False when the placeholder text was sent instead of a real summary
            "10q_summary_included": has_summary
        })

    except Exception as e:
        # Best effort: keep going, but mark the row so it can be excluded downstream
        print(f"\nError processing test question {idx}: {str(e)}")
        record.update({
            "generated_answer": f"ERROR: {str(e)}",
            "error": str(e)
        })

    spec4_results['results'].append(record)

# Rows carrying an 'error' key hold a placeholder, not a generated answer
spec4_failures = sum(1 for result in spec4_results['results'] if 'error' in result)

# Save results
spec4_filepath = save_results(spec4_results, "spec4_persona_plus_10q")
print(f"\nSpec 4 completed! {len(spec4_results['results']) - spec4_failures} responses generated.")
if spec4_failures:
    print(f"WARNING: {spec4_failures} question(s) failed and were saved with an 'error' field.")


Step 2: Generating responses using persona summary + 10-Q summaries...

Generating responses for 43 test questions...


Spec 4: 100%|██████████| 43/43 [03:34<00:00,  5.00s/it]

Results saved to: ../Results/spec4_persona_plus_10q_20251110_182428.json

Spec 4 completed! 43 responses generated.



