In [261]:
import re
import pdfplumber
from pathlib import Path
from typing import List, Dict
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import os
from dotenv import load_dotenv
from collections import defaultdict
import json
# Load environment variables
load_dotenv()

#initialize Parser
json_parser = JsonOutputParser()


In [262]:
PDF_PATH = Path("saksoft_q4.pdf")

In [263]:
def extract_lines(pdf_path: Path) -> List[str]:
    lines = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1.5, y_tolerance=3)
            if text:
                 # split on hard-returns, strip extra white-space
                lines += [ln.strip() for ln in text.splitlines()
                          if ln.strip()]
    return lines

In [264]:
def parse_roles(lines: List[str]) -> Dict[str, set]:
    roles = {"management": set(), "moderator": set()}
    current_role = None

    for line in lines:
        line = line.strip()

        if line.upper().startswith("MANAGEMENT:"):
            current_role = "management"
            line = line[len("MANAGEMENT:"):].strip()
        elif line.upper().startswith("MODERATOR:"):
            current_role = "moderator"
            line = line[len("MODERATOR:"):].strip()
        elif re.match(r"^[A-Z ]+:$", line):  # new block starts, like "PARTICIPANT:"
            current_role = None

        if current_role:
            # Append this line to current buffer and extract name
            name_match = re.match(r"([A-Z][A-Z\. ]+[A-Z])\s*[–-]", line)
            if name_match:
                roles[current_role].add(name_match.group(1).strip())

    return roles

In [265]:
lines = extract_lines(PDF_PATH)

In [266]:
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b"
    # other params...
)

In [267]:
prompt = ChatPromptTemplate(["""
Here is the first page of an earnings call transcript:

{text}

From this, extract the list of **management members** and the **moderator**. 
For each person, provide:

- Full name
- Title
- Role (management/moderator)
- Company (if mentioned)

Respond in JSON format as a list of objects like:
[
  {{
    "name": "John Doe",
    "title": "Chief Financial Officer",
    "role": "management",
    "company": "XYZ Corp"
  }}
]
"""])

In [268]:
chain =prompt | llm | json_parser

In [269]:
joined_lines = "\n".join(lines[:200])

In [270]:
joined_lines

'Saksoft Limited\nCIN: L72200TN1999PLC054429\nGlobal Infocity Park, Block A, 2nd\nFloor, # 40, Dr.M.G.R. Salai,\nKadanchavadi Perungudi, Chennai –\n600 096.\nP: +91-44-2454 3500 F: +91-44-2454 3510\nEmail: info@saksoft.com\nMay 30, 2025\nTo\nThe Listing/Compliance Department The Listing/Compliance\nThe National Stock Exchange of India Department BSE Limited\nLimited FloorNo.25, Phiroze\n“Exchange Plaza” JeejeebhoyTowers, Dalal Street,\nBandra Kurla Complex Mumbai – 400 001\nBandra (E), Mumbai – 400 051\nStock Code: 590051\nStock Code: SAKSOFT\nDear Sir /Madam,\nSub: Transcript of the earnings conference call for the quarter and year ended March\n31, 2025\nPursuant to Regulation 30 of the SEBI (Listing Obligations and Disclosure Requirements)\nRegulations, 2015, please find enclosed the transcript of the earnings conference call for the\nquarter and year ended March 31, 2025.\nThe above information is available on the website of the Company-\nhttps://www.saksoft.com/investor/presentatio

In [271]:
json_str = chain.invoke({"text": joined_lines})  # Example with first line{

In [272]:
result = defaultdict(list)
for person in json_str:
    result[person["role"]].append(person['name'])

# Convert defaultdict to normal dict
result = dict(result)

In [None]:
def clean_name(name: str) -> str:
    name = name.upper()
    name = re.sub(r"\b(MR|MS|MRS|DR)\.?\s+", "", name)
    name = re.sub(r"\s{2,}", " ", name)
    return name.strip()

def build_chunks(all_lines: List[str], roles: Dict[str, List[str]]):
    intro = []
    qna_chunks = []

    # Normalize all names in roles
    management = set(clean_name(name) for name in roles.get("management", []))
    moderator = set(clean_name(name) for name in roles.get("moderator", []))

    current_q = None
    current_a = []
    collecting_intro = True
    expecting_answer = False

    last_speaker = None
    last_role = None

    speaker_pattern = re.compile(r"^([A-Z][A-Za-z .&']+):\s*(.*)")

    for line in all_lines:
        line = line.strip()
        if not line:
            continue

        match = speaker_pattern.match(line)

        if match:
            speaker, text = match.groups()
            speaker = speaker.strip()
            text = text.strip()

            norm_spk = clean_name(speaker)

            if norm_spk in moderator:
                continue

            # -- Introduction section --
            if collecting_intro:
                if norm_spk not in management:
                    collecting_intro = False
                    current_q = {
                        "question_speaker": speaker,
                        "question_text": text
                    }
                    current_a = []
                    expecting_answer = True
                else:
                    intro.append(line)
                continue

            # -- Q&A section --
            if norm_spk not in management:
                # New participant question
                if current_q:
                    qna_chunks.append({
                        **current_q,
                        "answer_speaker": ", ".join({spk for spk, _ in current_a}),
                        "answer_text": " ".join(txt for _, txt in current_a)
                    })
                current_q = {
                    "question_speaker": speaker,
                    "question_text": text
                }
                current_a = []
                expecting_answer = True
            else:
                # Management answer
                if expecting_answer:
                    current_a.append((speaker, text))
                    expecting_answer = False  # first answer line received
                else:
                    # May be follow-up answer
                    current_a.append((speaker, text))

            last_speaker = speaker
            last_role = "management" if norm_spk in management else "participant"
        else:
            # Line has no speaker — continuation
            if collecting_intro:
                intro.append(line)
            elif expecting_answer and current_q:
                # Continuation of question
                current_q["question_text"] += " " + line
            elif current_a:
                # Continuation of answer
                last_spk, prev = current_a[-1]
                current_a[-1] = (last_spk, prev + " " + line)

    # Final flush
    if current_q:
        qna_chunks.append({
            **current_q,
            "answer_speaker": ", ".join({spk for spk, _ in current_a}),
            "answer_text": " ".join(txt for _, txt in current_a)
        })

    return {
        "introduction": " ".join(intro).strip(),
        "qna": qna_chunks,
        "roles": roles
    }


In [274]:
build_chunks(lines, result)

{'introduction': 'Saksoft Limited',
 'qna': [{'question_speaker': 'CIN',
   'question_text': 'L72200TN1999PLC054429 Global Infocity Park, Block A, 2nd Floor, # 40, Dr.M.G.R. Salai, Kadanchavadi Perungudi, Chennai – 600 096. P: +91-44-2454 3500 F: +91-44-2454 3510',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Email',
   'question_text': 'info@saksoft.com May 30, 2025 To The Listing/Compliance Department The Listing/Compliance The National Stock Exchange of India Department BSE Limited Limited FloorNo.25, Phiroze “Exchange Plaza” JeejeebhoyTowers, Dalal Street, Bandra Kurla Complex Mumbai – 400 001 Bandra (E), Mumbai – 400 051',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Stock Code',
   'question_text': '590051',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Stock Code',
   'question_text': 'SAKSOFT Dear Sir /Madam,',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Sub',
   'questio

In [275]:
build_chunks(lines, result)

{'introduction': 'Saksoft Limited',
 'qna': [{'question_speaker': 'CIN',
   'question_text': 'L72200TN1999PLC054429 Global Infocity Park, Block A, 2nd Floor, # 40, Dr.M.G.R. Salai, Kadanchavadi Perungudi, Chennai – 600 096. P: +91-44-2454 3500 F: +91-44-2454 3510',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Email',
   'question_text': 'info@saksoft.com May 30, 2025 To The Listing/Compliance Department The Listing/Compliance The National Stock Exchange of India Department BSE Limited Limited FloorNo.25, Phiroze “Exchange Plaza” JeejeebhoyTowers, Dalal Street, Bandra Kurla Complex Mumbai – 400 001 Bandra (E), Mumbai – 400 051',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Stock Code',
   'question_text': '590051',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Stock Code',
   'question_text': 'SAKSOFT Dear Sir /Madam,',
   'answer_speaker': '',
   'answer_text': ''},
  {'question_speaker': 'Sub',
   'questio