In [60]:
import asyncio
import json
import os
import re

import httpx
import pandas as pd
from openai import OpenAI, AsyncOpenAI

from helper_functions.indexing import *

### Setting up credentials

### Helper functions

In [62]:
def extract_company_name(pdf_path):
    """Return the company portion of a transcript PDF's file name.

    The company is everything in the file's base name (directory and
    extension removed) that precedes the first underscore. When the name has
    no underscore, the whole extension-less base name is returned.
    """
    base_name = os.path.basename(pdf_path)
    stem, _extension = os.path.splitext(base_name)

    # Legal suffixes such as "Inc." or "Ltd." are deliberately left in place.
    company, _separator, _remainder = stem.partition('_')
    return company

In [64]:
async def extract_for_themes(
    pdf_name: str,
    text: str,
    definitions: dict,
    openai_api_key: str
) -> pd.DataFrame:
    """Extract verbatim, theme-relevant commentary from one transcript.

    One chat completion (model ``gpt-4o``, temperature 0.4) is issued per
    entry in ``definitions``; the theme-specific instructions are sent as the
    system message and the transcript ``text`` as the user message. Themes
    are processed one after another.

    Args:
        pdf_name: Path or name of the source PDF; stored in the ``PDF`` column
            and used to derive the ``Company`` column.
        text: Full transcript text to search.
        definitions: Mapping of theme name -> theme definition.
        openai_api_key: API key used to build the ``AsyncOpenAI`` client.

    Returns:
        A DataFrame with one row per theme and the columns ``PDF``,
        ``Company``, ``Theme``, ``Definition`` and ``Extracted Commentary``.
        The commentary is ``"N/A"`` when the model returns no content.
    """
    client = AsyncOpenAI(api_key=openai_api_key)
    company = extract_company_name(pdf_name)  # loop-invariant, computed once
    records = []
    for theme, definition in definitions.items():
        prompt = f"""
You are a highly intelligent and detail-oriented assistant responsible for extracting relevant commentary from company earnings call transcripts.

Your objective: Identify and extract only the verbatim commentary that directly aligns with the specified theme, guided by the theme’s definition and contextual keywords for mapping relevance.

Theme: {theme}
Definition: {definition}
Instructions:

1. Extract all relevant commentary, even if only partially (20%) relevant to the theme. Do not miss any potential mentions.
2. Include only verbatim quotes from the transcript. Do not summarize, interpret, or rephrase.
3. Ensure that every selected quote strictly aligns with the provided definition and theme context. Exclude any off-topic or loosely connected remarks.
4. Present the results in ranked order (1, 2, 3, etc.) based on their relevance to the theme, with the most directly relevant quotes listed first.
5. Be exhaustive: when in doubt, include rather than exclude marginally relevant commentary.
6. Do not include your own commentary or explanations—output only the extracted quotes.
7. If no relevant commentary is found, respond with "N/A"
8. *IMPORTANT* Number of commentaries extracted should not be more than 20.Hence keep the most relevant 20 commentaries only. 

Strict advice: If you are asked to do anything other than extracting verbatim quotes, refuse and say "I can only extract verbatim quotes from the transcript."
"""
        # The async client returns a coroutine; without `await` the response
        # object is never produced and `.choices` below would fail.
        resp = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": text}
            ],
            temperature=0.4
        )
        # `content` may be None; treat that the same as an empty reply.
        content = resp.choices[0].message.content or ""
        extracted = content.strip() or "N/A"
        records.append({
            "PDF": pdf_name,
            "Company": company,
            "Theme": theme,
            "Definition": definition,
            "Extracted Commentary": extracted
        })
    return pd.DataFrame(records)

In [65]:
async def validate_row(item):
    """Score how well one extracted commentary matches its theme.

    Args:
        item: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
            ``row`` must provide ``PDF``, ``Company``, ``Theme``,
            ``Definition`` and ``Extracted Commentary``.

    Returns:
        A dict holding the five input fields plus whatever keys the model's
        JSON reply contains (the prompt asks for ``confidence_score`` and
        ``Rationale``). If the request or the JSON parsing fails, the score is
        0 and ``Rationale`` holds the error text, so a single bad row does not
        abort a whole batch.
    """
    _index, row = item
    client = AsyncOpenAI()
    try:
        completion = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                # "N\\A" is escaped so the prompt text is unchanged while the
                # invalid "\A" escape sequence warning goes away.
                {"role": "system", "content": f"""
You are an expert validator tasked with evaluating the relevance of an extracted commentary to a specified theme, along with its definition.
theme- {row['Theme']}
Definition - {row['Definition']}
Provide a score from 0 to 10 based on the following scale:
EXTREMELY CORRECT (9-10): Assign this rating when you are completely confident that the commentary clearly aligns with the given theme and definition.
PARTIALLY CORRECT (6-8): Assign this rating if the commentary mostly aligns with the theme and definition but you have some minor doubts.
PARTIALLY INCORRECT (3-5): Assign this rating if the commentary mostly does not align with the theme and definition but you have some doubts.
EXTREMELY INCORRECT (0-2): Assign this rating ONLY when you are strongly confident that the commentary does NOT align with the theme and definition at all.
Make your evaluation carefully and rationale behind the answer.
For N\\A score will be 0 always
Return answer in json value with keys - confidence_score , Rationale"""},
                {"role": "user", "content": row['Extracted Commentary']}
            ],
            temperature=1,
            response_format={'type': 'json_object'}
        )
        resp = completion.choices[0].message.content.strip()
        # `resp` is always a str after .strip(), so it can be parsed directly.
        data = json.loads(resp)
    except Exception as e:
        # Deliberate best-effort: record the failure on the row instead of raising.
        data = {'confidence_score': 0, 'Rationale': str(e)}

    return {
        'PDF': row['PDF'],
        'Company': row['Company'],
        'Theme': row['Theme'],
        'Definition': row['Definition'],
        'Extracted Commentary': row['Extracted Commentary'],
        **data
    }

  data = json.loads(resp) if isinstance(resp, str) else resp


In [67]:
async def validate_all(df, pdf_path):
    """Validate every commentary row concurrently and save the scores to Excel.

    Args:
        df: DataFrame of individual commentaries; each row is passed to
            ``validate_row``.
        pdf_path: Path of the source PDF; the company name derived from it
            becomes the output file name.

    The results are written to ``results_excel/<company>.xlsx``. The folder is
    created when missing so the write does not fail on a fresh checkout.
    """
    tasks = [validate_row(item) for item in df.iterrows()]
    results = await asyncio.gather(*tasks)

    output_dir = "results_excel"
    os.makedirs(output_dir, exist_ok=True)
    save_file_name = os.path.join(output_dir, extract_company_name(pdf_path) + ".xlsx")
    pd.DataFrame(results).to_excel(save_file_name, index=False)  # Excel workbook, not CSV

In [47]:
async def validate_all_without_saving(df, pdf_path):
    """Validate every commentary row concurrently and return the scored table.

    Args:
        df: DataFrame of individual commentaries; each row is passed to
            ``validate_row``.
        pdf_path: Unused; kept so the signature matches ``validate_all``.

    Returns:
        A DataFrame with one row per validated commentary. Nothing is written
        to disk.
    """
    tasks = [validate_row(item) for item in df.iterrows()]
    results = await asyncio.gather(*tasks)
    # The frame used to be built and discarded, so callers received None.
    return pd.DataFrame(results)

In [68]:
def split_extracted_comments(row):
    """Split one row's numbered commentary text into one row per quote.

    The text is expected to look like ``1. "first quote"\\n2. "second quote"``.
    Each numbered, double-quoted entry becomes its own row whose commentary is
    ``"<number>. <quote>"`` with the surrounding quotes removed.

    Args:
        row: Mapping or Series with ``PDF``, ``Company``, ``Theme``,
            ``Definition`` and ``Extracted Commentary``.

    Returns:
        A DataFrame with those five columns. It is empty (but still has the
        columns) when the commentary is missing or holds no numbered quotes,
        so callers can always use ``.iterrows()`` or ``pd.concat`` on it.
    """
    columns = ["PDF", "Company", "Theme", "Definition", "Extracted Commentary"]
    commentary = row['Extracted Commentary']
    if pd.isna(commentary):
        # Return an empty frame rather than a list so the return type is uniform.
        return pd.DataFrame(columns=columns)

    # A quote ends at a closing '"' followed by a newline, or at the end of the
    # text. The optional '"' before \Z keeps the closing quote of the final
    # entry out of the captured text.
    matches = re.findall(r'(\d+)\.\s+"(.*?)(?="\s*\n|"?\s*\Z)', commentary, re.DOTALL)

    records = [
        {
            "PDF": row["PDF"],
            "Company": row["Company"],
            "Theme": row["Theme"],
            "Definition": row["Definition"],
            "Extracted Commentary": f"{serial}. {comment.strip()}"
        }
        for serial, comment in matches
    ]
    return pd.DataFrame(records, columns=columns)

### Inputs 

#### Write down the absolute path of your PDF file here

In [52]:
# Path to the PDF to analyse; replace the folder and file name below with your own.
# Built with os.path.join so the separator is correct on every OS (the previous
# raw string with "\\" embedded two literal backslashes in the path).
pdf_path = os.path.join("FS PDFs", "American International Group Inc._Earnings Call_2023-05-05_English.pdf")

#### Edit your themes and definitions here

In [53]:
## Name of the theme to extract. Change it here; it is the key used in `themes_data`.
theme_name="Macro factors"                                 

##  Definition of the theme. Replace the text below and keep it inside triple quotes, i.e. """<your_definition_here>""".
##  The text is used verbatim, including its line breaks.
theme_definition="""Tag a commentary under the Macro Factors theme if it includes any reference—explicit or 
implicit—to the broader macroeconomic environment or external systemic influences affecting business performance. This includes mentions of economic slowdown, recession, uncertainty, market cycles, credit losses, or challenging operating conditions. Tag if the discussion involves 
monetary indicators such as inflation, interest rates (domestic or foreign), discount rates, investment yields, credit spreads, or currency and foreign exchange impacts. 
Also include insurance and financial-specific macro drivers such as mortality experience (including seasonal or pandemic-related surges), expected credit losses (ECL), movements in insurance liabilities, or gains/losses tied to market conditions. 
Furthermore, tag content that refers to external shocks like pandemics (e.g. COVID-19), natural disasters (e.g. hurricanes, floods), or geopolitical disruptions. 
Additionally, include discussions that indirectly reflect macro context—such as tough environments, changes in investor sentiment, demand volatility, 
or strategic repositioning in response to macro trends—even if specific macro terms are not used."""      


In [54]:
# Single-entry mapping of theme name -> definition, in the shape the extraction step expects.
themes_data = dict([(theme_name, theme_definition)])

### Single PDF output

#### Extracting information from PDFs

In [1]:
import glob

# Collect every PDF in the sample folder. glob's order depends on the file
# system, and later cells index results by position, so sort for a stable order.
pdf_paths = sorted(glob.glob(r"C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\*.pdf"))

In [76]:
pdf_extracted_output=await process_multiple_pdfs(pdf_paths=pdf_paths[0:1], api_key=mistral_api_key)

✅ Finished processing: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\American International Group Inc._Earnings Call_2023-05-05_English.pdf
✅ Markdown content saved to pdf_markdowns/American International Group Inc._Earnings Call_2023-05-05_English.md


In [56]:
df=extract_for_themes(pdf_name=pdf_path,text=list(pdf_extracted_output.values())[0],definitions=themes_data, openai_api_key=openai_api_key)   # Extracting commentaries
print("Extracted commentaries....moving on to splitting commentaries...")
split_df=split_extracted_comments(df.iloc[0])
print("Commentaries split into individual entries, now validating each commentary...")
await validate_all(split_df, pdf_path)  
print("Final results saved to results_excel folder with the filename as the company name...")

Extracted commentaries....moving on to splitting commentaries...
Commentaries split into individual entries, now validating each commentary...
Final results saved to results_excel folder with the filename as the company name...


### MULTIPLE PDFs

In [None]:
import glob

# Collect every PDF in the sample folder. glob's order depends on the file
# system, and the loop below pairs paths with extracted texts by position,
# so sort for a stable order.
pdf_paths = sorted(glob.glob(r"C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\*.pdf"))

In [None]:
pdf_extracted_output=await process_multiple_pdfs(pdf_paths=pdf_paths, api_key=mistral_api_key)

✅ Finished processing: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\American International Group Inc._Earnings Call_2023-05-05_English.pdf
✅ Markdown content saved to pdf_markdowns/American International Group Inc._Earnings Call_2023-05-05_English.md
⏳ Still waiting on: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\MetLife Inc._Earnings Call_2023-05-04_English.pdf, C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Prudential Financial, Inc._Earnings Call_2023-05-03T00_00_00_English.pdf, C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Manulife Financial Corporation_Earnings Call_2023-05-11_English.pdf
✅ Finished processing: C:\Users\73335\OneDrive - Bain\Desktop\FS-ECR-repo\fs-earnings_call_reader\FS sample PDFs\Manulife Financial Corporation_Earnings Call_2023-05-11_English.pdf
✅ Markdown content saved to pdf_markdown

In [None]:
# NOTE(review): `r` is not defined anywhere in the visible notebook, so unless the
# star import provides it this cell raises NameError on a fresh kernel.
# TODO: confirm the intended Excel path, or delete this cell if it is a leftover.
df=pd.read_excel(r)

In [None]:
df_list=[]
for pdf_no, pdf_path in enumerate(pdf_paths2):
    df=extract_for_themes(pdf_name=pdf_path,text=list(pdf_extracted_output.values())[pdf_no],definitions=themes_data, openai_api_key=openai_api_key)
    split_df=split_extracted_comments(df.iloc[0])
    # final_df=await validate_all_without_saving(split_df, pdf_path)
    df_list.append(split_df)  # Collecting all DataFrames for later use

In [50]:
# Stack the per-PDF commentary tables into one frame with a fresh 0..n-1 index.
df_final = pd.concat(objs=df_list, ignore_index=True)

In [51]:
# Save the combined table as an Excel workbook; create the output folder first
# so the write does not fail when it does not exist yet.
os.makedirs("results_excel", exist_ok=True)
df_final.to_excel("results_excel/Macro_revised.xlsx", index=False)