In [18]:
import os
import re
import csv
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer





In [19]:
def extract_pages_from_pdf(pdf_path):
    """Materialise the page layouts of a PDF into a list.

    Args:
        pdf_path: Path of the PDF, passed straight to pdfminer's
            ``extract_pages``.

    Returns:
        A list holding every item produced by ``extract_pages(pdf_path)``,
        in the order they were produced.
    """
    return [layout for layout in extract_pages(pdf_path)]


In [11]:
def extract_qa_order_from_header(page):
    """Read the ordered Q/A speaker list announced at the top of a page.

    Text containers are scanned in order. Every container whose text
    mentions ``'Q -'`` or ``'A -'`` is treated as header material. Scanning
    stops at the first header container that also contains ``'{BIO'``, or at
    the first container without a Q/A mention once header material has been
    seen.

    Args:
        page: Iterable of layout elements; only ``LTTextContainer``
            instances are inspected.

    Returns:
        A list of ``{'label': ..., 'speaker': ...}`` dicts in header order.
        ``label`` is ``'Q'`` or ``'A'`` for lines shaped like
        ``"Q - Name"``, and ``'Operator'`` for other header lines that
        contain the word ``Operator``.
    """
    header_chunks = []
    for item in page:
        if not isinstance(item, LTTextContainer):
            continue
        chunk = item.get_text()
        if 'Q -' in chunk or 'A -' in chunk:
            header_chunks.append(chunk)
            # A '{BIO' marker inside a header chunk ends the header scan.
            if '{BIO' in chunk:
                break
        elif header_chunks:
            # First chunk without a Q/A mention after the header: stop.
            break

    # Turn each header line into a (label, speaker) record.
    speaker_order = []
    for row in ''.join(header_chunks).strip().split('\n'):
        found = re.match(r'(Q|A)\s*-\s*(.+)', row)
        if found:
            label, name = found.groups()
            speaker_order.append({'label': label, 'speaker': name.strip()})
        elif 'Operator' in row:
            speaker_order.append({'label': 'Operator', 'speaker': 'Operator'})
    return speaker_order


In [12]:
def split_speech_segments(page):
    """Cut a page's text into alternating speaker markers and text runs.

    The text of every ``LTTextContainer`` on the page is concatenated, then
    split on ``{BIO <digits> <GO>}`` markers. The markers themselves are
    kept as separate items in the result.

    Args:
        page: Iterable of layout elements.

    Returns:
        A list of whitespace-stripped, non-empty strings in page order.
    """
    full_text = ''.join(
        item.get_text() for item in page if isinstance(item, LTTextContainer)
    )
    # The capturing group makes re.split keep each marker as its own piece.
    pieces = re.split(r'(\{BIO \d+ <GO>\})', full_text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]


In [13]:
def _group_segments_by_speaker(segments):
    """Fold marker/text segments into ``{'speaker', 'speech'}`` records."""
    grouped = []
    speaker = ''
    speech = ''
    for segment in segments:
        if re.match(r'\{BIO (\d+) <GO>\}', segment):
            # A marker closes the running speech. Records missing either a
            # speaker or any speech text are dropped rather than emitted.
            if speech and speaker:
                grouped.append({'speaker': speaker, 'speech': speech})
                speech = ''
            speaker = ''
        elif not speaker:
            # First text after a marker: its first line is taken as the
            # speaker's name, the remainder as the start of the speech.
            first_line, _, remainder = segment.partition('\n')
            speaker = first_line.strip()
            speech = remainder.strip()
        else:
            # Further text before the next marker continues the same speech.
            speech += ' ' + segment

    # Flush the speech that was still open when the segments ran out.
    if speech and speaker:
        grouped.append({'speaker': speaker, 'speech': speech})
    return grouped


def _label_speeches(grouped, qa_order):
    """Attach header labels to grouped speeches, walking ``qa_order`` in step."""
    labelled = []
    cursor = 0  # Next header entry that has not been matched yet
    for record in grouped:
        label = 'Unknown'
        if cursor < len(qa_order):
            expected = qa_order[cursor]
            # The header name only has to start with the name found in the
            # body. On a mismatch the cursor stays where it is.
            if expected['speaker'].startswith(record['speaker']):
                label = expected['label']
                cursor += 1
        labelled.append({
            'label': label,
            'speaker': record['speaker'],
            'speech': record['speech'],
        })
    return labelled


def map_speeches_to_qa_labels(segments, qa_order):
    """Label each speech on a page using the header's speaker order.

    Args:
        segments: Strings alternating between ``{BIO <digits> <GO>}``
            markers and text runs, as returned by ``split_speech_segments``.
        qa_order: List of ``{'label', 'speaker'}`` dicts giving the expected
            speaker sequence.

    Returns:
        A list of ``{'label', 'speaker', 'speech'}`` dicts, one per speech
        that has both a speaker line and speech text. A speech gets the
        label of the current ``qa_order`` entry when that entry's speaker
        starts with the speech's speaker; otherwise it is labelled
        ``'Unknown'`` and the ``qa_order`` position does not advance.
    """
    grouped = _group_segments_by_speaker(segments)
    return _label_speeches(grouped, qa_order)


In [14]:
def collect_qa_pairs(qa_pairs_page):
    """Pair each question on a page with the answers that follow it.

    Args:
        qa_pairs_page: Iterable of ``{'label', 'speaker', 'speech'}`` dicts
            in speaking order. Only entries labelled ``'Q'`` or ``'A'`` are
            used; any other label is ignored.

    Returns:
        A list of ``{'question', 'question_speaker', 'answer'}`` dicts.
        ``answer`` is the space-joined text of every ``'A'`` entry between
        a question and the next one. Questions that receive no answer are
        omitted.
    """
    collected_pairs = []
    answers = []
    last_question = ''
    last_question_speaker = ''
    for qa in qa_pairs_page:
        if qa['label'] == 'Q':
            if last_question and answers:
                # Pair the previous question with the answers gathered for it
                collected_pairs.append({
                    'question': last_question,
                    'question_speaker': last_question_speaker,
                    'answer': ' '.join(answers),
                })
            # Always start the new question with an empty answer list, so
            # answers heard before any question (e.g. a continuation at the
            # top of a page) are not attributed to a later question.
            answers = []
            last_question = qa['speech']
            last_question_speaker = qa['speaker']
        elif qa['label'] == 'A':
            answers.append(qa['speech'])

    # Handle the last Q&A pair
    if last_question and answers:
        collected_pairs.append({
            'question': last_question,
            'question_speaker': last_question_speaker,
            'answer': ' '.join(answers),
        })
    return collected_pairs


In [15]:
def process_pdf(pdf_path):
    """Run the full extraction pipeline over every page of one PDF.

    Each page is handled independently: its header gives the expected
    speaker order, its text is split into speech segments, the segments are
    labelled, and the labelled speeches are paired into questions and
    answers. Pages whose header yields no Q/A order are skipped.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A flat list of the Q&A pair dicts from all pages, in page order.
    """
    results = []
    for layout in extract_pages_from_pdf(pdf_path):
        header_order = extract_qa_order_from_header(layout)
        if header_order:
            page_segments = split_speech_segments(layout)
            labelled = map_speeches_to_qa_labels(page_segments, header_order)
            results += collect_qa_pairs(labelled)
    return results


In [16]:
def save_qa_pairs_to_csv(qa_pairs, csv_file_path):
    """Write Q&A pairs to a UTF-8 CSV file with a header row.

    Args:
        qa_pairs: Iterable of dicts with the keys ``'question_speaker'``,
            ``'question'`` and ``'answer'``.
        csv_file_path: Destination path; an existing file is overwritten.

    The output columns are ``Question Speaker``, ``Question`` and
    ``Answer``, in that order.
    """
    columns = ['Question Speaker', 'Question', 'Answer']
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        # Rename the internal keys to the human-readable column titles.
        sheet.writerows(
            {
                'Question Speaker': entry['question_speaker'],
                'Question': entry['question'],
                'Answer': entry['answer'],
            }
            for entry in qa_pairs
        )


In [20]:
# Folder holding the transcript PDFs. Set the EARNINGS_DATA_DIR environment
# variable to run this notebook on another machine; the fallback is the
# folder the notebook was originally written against.
DATA_DIR = os.environ.get(
    'EARNINGS_DATA_DIR',
    '/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/BB_Docs_20240909_172015',
)
PDF_NAME = '20121105_Entergy_Corp-_Earnings_Call_2012-11-05_SD000000002694979962.pdf'

pdf_path = os.path.join(DATA_DIR, PDF_NAME)
csv_file_path = 'earnings_questions_2012_11_05.csv'

# Process the PDF and collect Q&A pairs
qa_pairs = process_pdf(pdf_path)

# Save the Q&A pairs to CSV
save_qa_pairs_to_csv(qa_pairs, csv_file_path)
