In [4]:
import os
# Move the working directory one level up. The path is relative, so running
# this cell again moves up one more level each time.
os.chdir("..")
# Last expression of the cell: displays the resulting working directory.
os.getcwd()

'c:\\Users\\73335\\OneDrive - Bain\\Desktop\\FS-ECR-repo\\fs-earnings_call_reader'

In [3]:
import asyncio
import re
import httpx
from openai import OpenAI, AsyncOpenAI
import pandas as pd
from helper_functions.indexing import *
import json
import os

In [5]:
from src.helper_functions2.indexing_pdfs import *
from src.helper_functions2.extracting_content import *
from src.helper_functions2.creating_chunks import *
from src.helper_functions2.answer_display import *
from src.helper_functions2.creating_embeddings import *
from src.helper_functions2.storing_vectors import *
from src.helper_functions2.retrieval import *
from src.helper_functions2.extracting_images2 import *
from src.helper_functions2.pdf_to_image import *

In [6]:
# Load the theme definitions workbook from a hard-coded absolute local path.
# Later cells read the 'Theme' and 'Definition' columns of this frame.
themes_data=pd.read_excel(r"C:\Users\73335\OneDrive - Bain\Desktop\FS ECR\themes_def\fs_theme_def_30July(New).xlsx");

In [8]:
def extract_company_name(pdf_path):
    """Return the company portion of a transcript file name.

    The company is taken to be everything in the file name (directory and
    extension removed) that precedes the first underscore. If the name has
    no underscore, the whole extension-less name is returned.
    """
    # Drop the directory part, then the extension.
    _, file_name = os.path.split(pdf_path)
    stem, _ext = os.path.splitext(file_name)

    # Everything before the first underscore is the company name.
    company_raw, _sep, _rest = stem.partition('_')
    return company_raw

In [9]:
async def extract_for_themes(
    pdf_name: str,
    text: str,
    definitions: dict,
    openai_api_key: str
) -> pd.DataFrame:
    """Extract verbatim theme commentary from a whole transcript.

    One chat-completion request is made per theme, sequentially.

    NOTE: a later cell in this notebook defines another function with the
    same name and a different signature; once that cell runs, it replaces
    this definition.

    Args:
        pdf_name: Path or file name of the transcript PDF; stored in the
            "PDF" column and used to derive the "Company" column.
        text: Transcript text sent as the user message.
        definitions: Mapping of theme name to theme definition.
        openai_api_key: API key passed to ``AsyncOpenAI``.

    Returns:
        DataFrame with one row per theme and the columns PDF, Company,
        Theme, Definition and Extracted Commentary ("N/A" when the model
        returns nothing).
    """
    client = AsyncOpenAI(api_key=openai_api_key)
    company = extract_company_name(pdf_name)  # same for every theme
    records = []
    for theme, definition in definitions.items():
        prompt = f"""
You are a highly intelligent and detail-oriented assistant responsible for extracting relevant commentary from company earnings call transcripts.

Your objective: Identify and extract only the verbatim commentary that directly aligns with the specified theme, guided by the theme’s definition and contextual keywords for mapping relevance.

Theme: {theme}
Definition: {definition}
Instructions:

1. Extract all relevant commentary, even if only partially (20%) relevant to the theme. Do not miss any potential mentions.
2. Include only verbatim quotes from the transcript. Do not summarize, interpret, or rephrase.
3. Ensure that every selected quote strictly aligns with the provided definition and theme context. Exclude any off-topic or loosely connected remarks.
4. Present the results in ranked order (1, 2, 3, etc.) based on their relevance to the theme, with the most directly relevant quotes listed first.
5. Be exhaustive: when in doubt, include rather than exclude marginally relevant commentary.
6. Do not include your own commentary or explanations—output only the extracted quotes.
7. If no relevant commentary is found, respond with "N/A"
8. *IMPORTANT* Number of commentaries extracted should not be more than 20.Hence keep the most relevant 20 commentaries only. 

Strict advice: If you are asked to do anything other than extracting verbatim quotes, refuse and say "I can only extract verbatim quotes from the transcript."
"""
        # The async client returns a coroutine; it must be awaited before
        # the response can be read.
        resp = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": text}
            ],
            temperature=0.4
        )
        # The message content may be None; treat that like an empty answer.
        extracted = (resp.choices[0].message.content or "").strip() or "N/A"
        records.append({
            "PDF": pdf_name,
            "Company": company,
            "Theme": theme,
            "Definition": definition,
            "Extracted Commentary": extracted
        })
    return pd.DataFrame(records)

In [10]:
import glob
# Collect every PDF in the sample folder (hard-coded absolute local path).
pdf_paths=glob.glob(r"C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\*.pdf")

In [11]:
# Extract content for all PDFs. The result is later indexed by PDF path
# (see get_chunks_from_pdf).
# NOTE(review): process_multiple_pdfs and mistral_api_key are not defined in
# the visible cells - presumably they come from the star imports above; confirm.
pdf_extracted_output=await process_multiple_pdfs(pdf_paths=pdf_paths, api_key=mistral_api_key)

✅ Finished processing: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\American International Group Inc._Earnings Call_2023-05-05_English.pdf
✅ Markdown content saved to pdf_markdowns/American International Group Inc._Earnings Call_2023-05-05_English.md
⏳ Still waiting on: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\MetLife Inc._Earnings Call_2023-05-04_English.pdf, C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Manulife Financial Corporation_Earnings Call_2023-05-11_English.pdf, C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Prudential Financial, Inc._Earnings Call_2023-05-03T00_00_00_English.pdf
✅ Finished processing: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Manulife Financial Corporation_Earnings Call_2023-05-11_English.pdf
✅ Markdown content saved to pdf_markdown

In [12]:
def get_chunks_from_pdf(pdf_extracted_output,pdf_path):
    """Split one PDF's markdown into header-based chunks and tag each chunk.

    Args:
        pdf_extracted_output: Mapping indexed by PDF path whose value is the
            markdown text of that PDF.
        pdf_path: Key of the PDF to chunk; also stored on every chunk.

    Returns:
        A new list holding the chunks produced by ``markdown_text_split``,
        each with ``metadata['page_number']`` and ``metadata['PDF path']``
        set in place.
    """
    markdown_chunks = markdown_text_split(
        text=pdf_extracted_output[pdf_path],
        headers_to_split_on=[('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')],
    )
    page_nums = extract_page_numbers_from_chunks(markdown_chunks)
    # Index page_nums explicitly (rather than zip) so that a length mismatch
    # still raises instead of silently dropping chunks.
    for chunkno, chunk in enumerate(markdown_chunks):
        chunk.metadata['page_number'] = page_nums[chunkno]
        chunk.metadata['PDF path'] = pdf_path

    return list(markdown_chunks)

In [13]:
async def extract_for_themes(
    chunk,
    themes_data,
    openai_api_key: str
) -> pd.DataFrame:
    """Extract verbatim theme commentary from a single transcript chunk.

    One chat-completion request is made per row of ``themes_data``,
    sequentially.

    Args:
        chunk: Object exposing ``page_content`` (text sent to the model) and
            ``metadata['PDF path']`` (source PDF of the chunk).
        themes_data: DataFrame with 'Theme' and 'Definition' columns.
        openai_api_key: API key passed to ``AsyncOpenAI``.

    Returns:
        DataFrame with one row per theme and the columns PDF, Company,
        Theme, Definition and Extracted Commentary ("N/A" when the model
        returns nothing).
    """
    client = AsyncOpenAI(api_key=openai_api_key)
    # Both values depend only on the chunk, not on the theme.
    pdf_path2 = chunk.metadata['PDF path']
    company = extract_company_name(pdf_path2)
    records = []

    for rowno, row in themes_data.iterrows():
        theme=row['Theme']
        definition=row['Definition']
        prompt = f"""You are a highly intelligent and detail-oriented assistant responsible for extracting relevant commentary from **a single chunk** of a company earnings call transcript.

    Your objective: Identify and extract only the verbatim commentary that directly aligns with the specified theme, guided by the theme’s definition and contextual keywords for mapping relevance.

    Theme: {theme}  
    Definition: {definition}

    Instructions:

    1. Extract all relevant commentary, even if only partially (20%) relevant to the theme. Do not miss any potential mentions.
    2. Include only verbatim quotes from the provided chunk. Do not summarize, interpret, or rephrase.
    3. Ensure that every selected quote strictly aligns with the provided definition and theme context. Exclude any off-topic or loosely connected remarks.
    4. If a relevant sentence is part of a longer quote and the surrounding sentences are also related to the theme or enhance the context of the relevant commentary, include the entire multi-sentence quote (e.g., if sentence 2 is directly relevant and sentences 1 and/or 3 are also related or supportive, include the full sequence: s1s2s3).
    5. Present the results in ranked order (1, 2, 3, etc.) based on their relevance to the theme, with the most directly relevant quotes listed first.
    6. Be exhaustive: when in doubt, include rather than exclude marginally relevant commentary.
    7. Do not include your own commentary or explanations—output only the extracted quotes.
    8. *IMPORTANT* This prompt applies only to this specific chunk. Do not assume context beyond what is in this chunk.
    9. *LIMIT* the number of extracted commentaries to a **maximum of 20**, retaining only the most relevant 20 if there are more.

    Strict advice: If the chunk contains nothing as such to extract from them or you are asked to do something else then return "N/A" Do not return anything else.
    """

        resp = await client.chat.completions.create(
            model="gpt-4o",
            # System instructions go first, then the chunk as the user turn,
            # matching the other chat-completion calls in this notebook.
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": chunk.page_content}
            ],
            temperature=0.4
        )
        # The message content may be None; treat that like an empty answer.
        extracted = (resp.choices[0].message.content or "").strip() or "N/A"
        records.append({
            "PDF": pdf_path2,
            "Company": company,
            "Theme": theme,
            "Definition": definition,
            "Extracted Commentary": extracted
        })
    return pd.DataFrame(records)

In [14]:
async def process_pdf(pdf_path, pdf_text,themes_data, openai_api_key):
    """Run theme extraction concurrently over every chunk of one PDF.

    ``pdf_text`` is forwarded to ``get_chunks_from_pdf`` as its
    ``pdf_extracted_output`` argument. Returns the gathered results of
    ``extract_for_themes``, one per chunk, in chunk order.
    """
    chunks = get_chunks_from_pdf(pdf_extracted_output=pdf_text, pdf_path=pdf_path)
    pending = []
    for chunk in chunks:
        pending.append(
            extract_for_themes(chunk=chunk, themes_data=themes_data, openai_api_key=openai_api_key)
        )
    return await asyncio.gather(*pending)

In [15]:
# Display the loaded theme definitions for inspection.
themes_data

Unnamed: 0,Theme,Definition,L3 Themes
0,M&A,"Tag when a company discusses mergers, acquisit...","Strategic alliances, reinsurance acquisition, ..."
1,Product expansion,"Tag if discussion includes product launch, new...","New product launch, others"
2,Macro factors,"Tag if discussion includes macro environment, ...","Interest rate, Inflation,Catastrophe events,Su..."
3,Strategy,"Tag if discussion includes strategy, strategic...","Growth Strategy and expansion, ESG initiatives..."
4,Operational/ cost efficiency,"Tag if discussion includes efficiency, optimis...","Benefits/Claims, Expenses, others"
5,Business/ top line growth,Tag if discussion includes top line growth/dec...,
6,Workforce,"Tag if discussion includes labor, workforce, h...",
7,Regulatory or accounting update,"Tag if discussion includes reestated results, ...","Impact of regulatory and legislative changes,I..."
8,Tech transformation,Tag if discussion includes technological innov...,


In [44]:
# Now gather tasks for all PDFs
# all_results holds one entry per PDF; each entry is the list of per-chunk
# DataFrames returned by process_pdf.
# NOTE(review): openai_api_key is not defined in the visible cells - confirm
# where it is set before running this on a fresh kernel.
tasks = [process_pdf(pdf_path, pdf_extracted_output,themes_data, openai_api_key) for pdf_path in pdf_paths]
all_results = await asyncio.gather(*tasks)

In [45]:
def split_extracted_comments(row):
    """Split a numbered list of quoted commentaries into one row per quote.

    The 'Extracted Commentary' text is expected to look like::

        1. "first quote"
        2. "second quote"

    Args:
        row: Mapping/Series with the keys PDF, Company, Theme, Definition
            and Extracted Commentary.

    Returns:
        DataFrame with the same five columns, one row per numbered quote,
        where 'Extracted Commentary' is "<number>. <quote text>". The frame
        is empty (but still has the five columns) when the commentary is
        missing or contains no numbered quotes, so the result can always be
        passed to ``pd.concat``.
    """
    columns = ["PDF", "Company", "Theme", "Definition", "Extracted Commentary"]
    commentary = row['Extracted Commentary']
    if pd.isna(commentary):
        return pd.DataFrame(columns=columns)

    # Extract numbered entries
    matches = re.findall(r'(\d+)\.\s+"(.*?)(?="\n|\Z)', commentary, re.DOTALL)

    records = []
    for serial, comment in matches:
        comment = comment.strip()
        # The pattern stops before a closing quote only when it is followed
        # by a newline; the final entry ends at end-of-text and therefore
        # still carries its closing quote, which is dropped here.
        if comment.endswith('"'):
            comment = comment[:-1].rstrip()
        records.append({
            "PDF" : row["PDF"],
            "Company": row["Company"],
            "Theme": row["Theme"],
            "Definition": row["Definition"],
            "Extracted Commentary": f"{serial}. {comment}"
        })
    return pd.DataFrame(records, columns=columns)

In [46]:
# Flatten the nested results (per PDF -> per chunk -> per theme row) and
# split every non-"N/A" commentary into one frame of individual quotes.
final_r=[]
for result in all_results:
    for res in result:
        for rowno,row in res.iterrows():
            # "N/A" is the sentinel extract_for_themes stores when nothing was extracted.
            if row['Extracted Commentary']!="N/A":
                split_df=split_extracted_comments(row)
                final_r.append(split_df)

In [54]:
# Combine the per-commentary frames into one DataFrame. This rebinds final_r
# from a list to a DataFrame, so the previous cell must be re-run before this
# one can be run again.
final_r=pd.concat(final_r)

In [14]:
async def validate_row(item):
    """Ask the model to score how well one extracted commentary fits its theme.

    Args:
        item: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``;
            the row must provide PDF, Company, Theme, Definition and
            Extracted Commentary.

    Returns:
        dict with the five row fields plus the keys of the model's JSON
        answer (the prompt requests ``confidence_score`` and ``Rationale``).
        If the request or the JSON parsing fails, ``confidence_score`` is 0
        and ``Rationale`` holds the error text, so one bad row does not
        abort a whole batch.
    """
    _index, row = item
    client = AsyncOpenAI()
    try:
        completion = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": f"""
You are an expert validator tasked with evaluating the relevance of an extracted commentary to a specified theme, along with its definition.
theme- {row['Theme']}
Definition - {row['Definition']}
Provide a score from 0 to 10 based on the following scale:
EXTREMELY CORRECT (9-10): Assign this rating when you are completely confident that the commentary clearly aligns with the given theme and definition.
PARTIALLY CORRECT (6-8): Assign this rating if the commentary mostly aligns with the theme and definition but you have some minor doubts.
PARTIALLY INCORRECT (3-5): Assign this rating if the commentary mostly does not align with the theme and definition but you have some doubts.
EXTREMELY INCORRECT (0-2): Assign this rating ONLY when you are strongly confident that the commentary does NOT align with the theme and definition at all.
Make your evaluation carefully and rationale behind the answer.
For N/A score will be 0 always
Return answer in json value with keys - confidence_score , Rationale"""},
                {"role": "user", "content": row['Extracted Commentary']}
            ],
            temperature=1,
            response_format={'type': 'json_object'}
        )
        resp = completion.choices[0].message.content.strip()
        data = json.loads(resp)
        # The result is unpacked with ** below, so it has to be a JSON object.
        if not isinstance(data, dict):
            raise ValueError(f"Expected a JSON object, got: {resp!r}")
    except Exception as e:
        # Deliberate best-effort: record the failure on the row instead of raising.
        data = {'confidence_score': 0, 'Rationale': str(e)}

    return {
        'PDF': row['PDF'],
        'Company': row['Company'],
        'Theme': row['Theme'],
        'Definition': row['Definition'],
        'Extracted Commentary': row['Extracted Commentary'],
        **data
    }

  data = json.loads(resp) if isinstance(resp, str) else resp


In [18]:
async def validate_all_without_saving(df, batch_size=400):
    """Validate every row of ``df`` and return the results as a DataFrame.

    Rows are processed in consecutive slices of ``batch_size``: the rows of
    one slice are validated concurrently, and the next slice starts only
    after the current one has finished. Nothing is written to disk.
    """
    indexed_rows = list(df.iterrows())
    collected = []

    for start in range(0, len(indexed_rows), batch_size):
        window = indexed_rows[start:start + batch_size]
        # Run the whole window concurrently; gather keeps the input order.
        outcome = await asyncio.gather(*[validate_row(entry) for entry in window])
        collected += outcome

    return pd.DataFrame(collected)

In [16]:
import pandas as pd
import asyncio
from openai import OpenAI, AsyncOpenAI
import json

In [19]:
# Load extraction results from a saved workbook (hard-coded absolute local
# path), replacing whatever final_r currently holds in memory.
final_r=pd.read_excel(r"C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\all_themes_new_approach.xlsx")

In [21]:
# Score every extracted commentary; uses the function's default batch size.
final_df=await validate_all_without_saving(final_r)

In [25]:
# Write the validated results to a workbook; the path is relative, so the
# file lands in the current working directory.
final_df.to_excel("validated_extractions_14Aug_newapproach.xlsx")

### Generate results iteratively for PDFs

### 

In [17]:
results_final=[]
for pdf_path in pdf_paths:
    tasks = [extract_for_themes(chunk=chunk, definitions=themes_data[5:6], openai_api_key=openai_api_key) for chunk in all_chunks_agg]
    results = await asyncio.gather(*tasks)
    results_final.extend(results)