In [31]:
import os
import re
import csv
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.layout import LTTextContainer





In [68]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF with transcript boilerplate lines removed.

    The raw text is obtained from ``extract_text(pdf_path)``. Every
    ``{BIO ...}`` tag that opens and closes on the same line is deleted,
    then the text is split on newlines, each line is stripped of
    surrounding whitespace, and lines recognised as boilerplate are
    dropped. Blank lines are kept.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``extract_text``.

    Returns:
        The surviving stripped lines joined by newline characters.
    """

    def is_boilerplate(entry):
        """Tell whether a stripped line is one of the known noise patterns."""
        # Page banner, e.g. "Page 3 of 17 ..."
        if re.match(r'^Page \d+ of \d+', entry):
            return True
        # Any line carrying the "FINAL TRANSCRIPT" stamp
        if 'FINAL TRANSCRIPT' in entry:
            return True
        # Company header line, e.g. "Entergy Corp (ETR US Equity)"
        if re.match(r'^Entergy Corp', entry):
            return True
        # A line that is nothing but an ISO-style date, e.g. "2012-11-05"
        if re.match(r'^\d{4}-\d{2}-\d{2}$', entry):
            return True
        # Fires only when a single line starts with "Operator" and, later on
        # that same line, has a Q or A followed by optional whitespace and a
        # hyphen (all compared case-insensitively). A line holding just
        # "Operator" is therefore kept.
        if re.match(r'Operator.*?(?=Q\s*-|A\s*-)', entry, flags=re.IGNORECASE | re.DOTALL):
            return True
        return False

    document = re.sub(r'\{BIO.*?\}', '', extract_text(pdf_path))
    stripped_entries = (piece.strip() for piece in document.split('\n'))
    return '\n'.join(entry for entry in stripped_entries if not is_boilerplate(entry))


In [72]:
def extract_qa_pairs(text):
    """Parse transcript text into a list of question/answer dictionaries.

    Parsing starts at the first occurrence of ``'Questions And Answers'``,
    falling back to the first ``'Q&A'``, and finally to the beginning of the
    text (a notice is printed in that last case).

    A line matching ``Q`` + optional whitespace + ``-`` opens a question and
    a line matching ``A`` + optional whitespace + ``-`` opens an answer. The
    marker line itself is skipped (presumably it carries the speaker's name);
    the lines that follow, up to the next line starting with ``'Q -'`` or
    ``'A -'``, are stripped and joined with single spaces to form the body.
    Consecutive ``'A -'`` blocks are merged into one answer.

    Args:
        text: Full transcript text with newline-separated lines.

    Returns:
        A list of ``{'Question': str, 'Answer': str}`` dictionaries in
        document order. A question that is followed by another question, or
        by the end of the text, is emitted with an empty answer. An answer
        that appears while no question is pending is discarded instead of
        being prepended to the next pair's answer.
    """
    qa_start = text.find('Questions And Answers')
    if qa_start == -1:
        qa_start = text.find('Q&A')
    if qa_start == -1:
        qa_start = 0
        print("No 'Questions and Answers' section found in the text, starting at beginning")

    lines = text[qa_start:].split('\n')
    total = len(lines)

    def gather_block(start):
        """Collect stripped lines from ``start`` up to the next marker line.

        Returns the collected lines joined with single spaces, and the index
        of the line that stopped the scan (``total`` if the text ran out).
        """
        # NOTE(review): block boundaries need the exact prefixes 'Q -' / 'A -',
        # while the main loop below also accepts spacing variants such as
        # 'Q-'. This mirrors the previous behaviour; confirm which form the
        # transcripts really use before unifying the two checks.
        collected = []
        pos = start
        while pos < total:
            candidate = lines[pos].strip()
            if candidate.startswith(('Q -', 'A -')):
                break
            collected.append(candidate)
            pos += 1
        return ' '.join(collected), pos

    qa_pairs = []
    current_question = ''
    i = 0
    while i < total:
        line = lines[i].strip()
        if re.match(r'^Q\s*-', line):
            # A question still pending at this point never got an answer.
            if current_question:
                qa_pairs.append({'Question': current_question.strip(), 'Answer': ''})
            current_question, i = gather_block(i + 1)
        elif re.match(r'^A\s*-', line):
            # Each answer starts from a clean slate so that text from an
            # earlier, unmatched answer cannot leak into this one.
            body, i = gather_block(i + 1)
            answer_parts = [body]
            # Several speakers may answer the same question back to back.
            while i < total and lines[i].strip().startswith('A -'):
                body, i = gather_block(i + 1)
                answer_parts.append(body)
            if current_question:
                qa_pairs.append({
                    'Question': current_question.strip(),
                    'Answer': ' '.join(answer_parts).strip(),
                })
                current_question = ''
            # With no pending question the answer has nothing to attach to
            # and is dropped.
        else:
            i += 1

    # A trailing question with no answer is still reported.
    if current_question:
        qa_pairs.append({'Question': current_question.strip(), 'Answer': ''})
    return qa_pairs

In [45]:
def write_qa_pairs_to_csv(qa_pairs, csv_filename):
    """Save question/answer pairs as a two-column UTF-8 CSV file.

    The target file is created or overwritten. It starts with a header row
    (``Question``, ``Answer``) followed by one row per pair, in order.

    Args:
        qa_pairs: Iterable of mappings, each providing ``'Question'`` and
            ``'Answer'`` keys; any other keys are ignored.
        csv_filename: Path of the CSV file to write.
    """
    columns = ['Question', 'Answer']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        # Rebuild each row from the two expected keys only, so extra keys on
        # a pair never reach the writer.
        sheet.writerows(
            {'Question': entry['Question'], 'Answer': entry['Answer']}
            for entry in qa_pairs
        )



In [73]:
# Default location of the earnings-call PDFs; pass another directory to
# main() to run the pipeline somewhere else.
DEFAULT_PDF_DIR = '/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/BB_Docs_20240909_172015'


def main(pdf_dir=DEFAULT_PDF_DIR):
    """Extract Q&A pairs from every PDF in ``pdf_dir`` and write them to CSV.

    For each directory entry whose name ends in ``.pdf`` the text is
    extracted, split into question/answer pairs, and written to a CSV file
    in the current working directory. The CSV is named
    ``earnings_qa_<date>.csv`` when the PDF filename contains a
    ``YYYY-M-D`` style date, and ``earnigs_report_qa_date_unknown.csv``
    otherwise. Entries that are not PDFs are skipped silently.

    Args:
        pdf_dir: Directory to scan. Defaults to ``DEFAULT_PDF_DIR`` so that
            calling ``main()`` with no arguments behaves as before.
    """
    for pdf_filename in os.listdir(pdf_dir):
        # Skip non-PDF entries before logging, so the progress output only
        # mentions files that are actually parsed.
        if not pdf_filename.endswith('.pdf'):
            continue
        print(f'Parsing {pdf_filename} ...')

        date_match = re.search(r'(\d{4}-\d{1,2}-\d{1,2})', pdf_filename)
        if date_match:
            csv_filename = f'earnings_qa_{date_match.group(1)}.csv'
        else:
            # Spelling kept as is so already-produced files keep their name.
            # NOTE(review): every undated PDF maps to this one file, and the
            # CSV writer opens it in 'w' mode, so a later undated PDF
            # replaces the output of an earlier one.
            csv_filename = 'earnigs_report_qa_date_unknown.csv'

        pdf_path = os.path.join(pdf_dir, pdf_filename)
        text = extract_text_from_pdf(pdf_path)
        qa_pairs = extract_qa_pairs(text)
        if qa_pairs:
            write_qa_pairs_to_csv(qa_pairs, csv_filename)
            print(f'Extracted {len(qa_pairs)} question-answer pairs.')
            print(f'QA pairs have been written to {csv_filename}')
        else:
            print('No question-answer pairs found.')

    print('all pdfs have been parsed')


In [74]:
# Kick off the batch run: parse each PDF in the directory configured in main() and write the CSV files.
main()

Parsing 20121105_Entergy_Corp-_Earnings_Call_2012-11-05_SD000000002694979962.pdf ...
Extracted 21 question-answer pairs.
QA pairs have been written to earnings_qa_2012-11-05.csv
Parsing 20151102_Entergy_Corp-_Earnings_Call_2015-11-02_FS000000002237236499.pdf ...
Extracted 67 question-answer pairs.
QA pairs have been written to earnings_qa_2015-11-02.csv
Parsing 20180223_Entergy_Corp-_Earnings_Call_2018-2-23_DN000000002401920690.pdf ...
Extracted 50 question-answer pairs.
QA pairs have been written to earnings_qa_2018-2-23.csv
Parsing 20150428_Entergy_Corp-_Earnings_Call_2015-4-28_FS000000002207134300.pdf ...
Extracted 38 question-answer pairs.
QA pairs have been written to earnings_qa_2015-4-28.csv
Parsing 20211103_Entergy_Corp-_Earnings_Call_2021-11-03_RT000000002967196509.pdf ...
Extracted 23 question-answer pairs.
QA pairs have been written to earnings_qa_2021-11-03.csv
Parsing 20190501_Entergy_Corp-_Earnings_Call_2019-5-01_DN000000002629939073.pdf ...
Extracted 29 question-answer p