## Script to parse Earnings reports Q&A --> CSV

In [1]:
import os
import re
import csv
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from pdfminer.layout import LTTextContainer

In [3]:
def extract_text_from_pdf(pdf_path):
    """Extract a PDF's text and drop transcript boilerplate lines.

    The text is obtained with pdfminer's ``extract_text``. ``{BIO ...}`` tags
    are removed first (only when the whole tag sits on one line, since ``.``
    does not cross newlines in that pattern). The text is then split on
    newlines, every line is stripped of surrounding whitespace, and lines
    recognised as headers, footers or operator text are discarded.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The remaining (stripped) lines joined with newline characters.
    """
    raw_text = re.sub(r'\{BIO.*?\}', '', extract_text(pdf_path))

    def is_boilerplate(stripped):
        # "Page X of Y ..." headers/footers
        if re.match(r'^Page \d+ of \d+', stripped):
            return True
        # Any line mentioning "FINAL TRANSCRIPT"
        if 'FINAL TRANSCRIPT' in stripped:
            return True
        # Company banner such as "Entergy Corp (ETR US Equity)"
        if re.match(r'^Entergy Corp', stripped):
            return True
        # A line that is nothing but an ISO date, e.g. "2012-11-05"
        if re.match(r'^\d{4}-\d{2}-\d{2}$', stripped):
            return True
        # Operator text. Each line is tested on its own, so this only drops a
        # line that begins with "Operator" (any case) and has a "Q -" / "A -"
        # style marker later on that same line.
        if re.match(r'Operator.*?(?=Q\s*-|A\s*-)', stripped, flags=re.IGNORECASE | re.DOTALL):
            return True
        return False

    stripped_lines = (raw_line.strip() for raw_line in raw_text.split('\n'))
    return '\n'.join(kept for kept in stripped_lines if not is_boilerplate(kept))


In [4]:
def extract_qa_pairs(text):
    """Pair up the questions and answers found in a cleaned transcript.

    Parsing starts at the first occurrence of ``'Questions And Answers'``, or
    failing that ``'Q&A'``; if neither is present a notice is printed and the
    whole text is parsed. From there, a line whose stripped form starts with
    ``'Q -'`` or ``'A -'`` opens a new block. The marker line itself (it
    carries the speaker name in these transcripts) is discarded, and every
    following line up to the next marker is the block's text, joined with
    single spaces.

    Pairing rules:
      * Consecutive ``A -`` blocks after a question are merged into one answer.
      * A question followed directly by another question is emitted with an
        empty answer.
      * Answer text with no pending question (before the first question, or
        after a question whose text is blank) is discarded rather than being
        carried into the next question's answer.

    Args:
        text: Transcript text, one logical line per ``'\\n'``-separated line.

    Returns:
        A list of ``{'Question': str, 'Answer': str}`` dicts in document order.
    """
    # Locate the start of the Q&A section.
    section_start = text.find('Questions And Answers')
    if section_start == -1:
        section_start = text.find('Q&A')
    if section_start == -1:
        section_start = 0
        print("No 'Questions and Answers' section found in the text, starting at beginning")

    # Pass 1: cut the section into (kind, body_lines) blocks. One predicate
    # decides what a marker is, both for opening a block and for ending the
    # previous one. The literal "Q -"/"A -" prefix is used (not a looser
    # "A\s*-" match) so body lines such as "A- rating ..." stay body text.
    blocks = []
    for raw_line in text[section_start:].split('\n'):
        line = raw_line.strip()
        if line.startswith(('Q -', 'A -')):
            blocks.append((line[0], []))
        elif blocks:
            blocks[-1][1].append(line)
        # Lines before the first marker belong to no block and are ignored.

    # Pass 2: fold the blocks into question/answer pairs.
    qa_pairs = []
    question = ''
    answer_parts = []
    for kind, body_lines in blocks:
        block_text = ' '.join(body_lines)
        if kind == 'Q':
            if question:
                qa_pairs.append({'Question': question, 'Answer': ' '.join(answer_parts).strip()})
            question = block_text.strip()
            # Start the new question with a clean slate so stray answer text
            # collected without a pending question cannot leak into it.
            answer_parts = []
        else:
            answer_parts.append(block_text)

    # Flush the last question (answered or not).
    if question:
        qa_pairs.append({'Question': question, 'Answer': ' '.join(answer_parts).strip()})
    return qa_pairs

In [5]:
def write_qa_pairs_to_csv(qa_pairs, csv_filename):
    """Write question/answer dicts to a two-column CSV file.

    The file is (over)written as UTF-8 with a ``Question,Answer`` header row
    followed by one row per pair, in order.

    Args:
        qa_pairs: Iterable of dicts, each providing ``'Question'`` and
            ``'Answer'`` keys. Any other keys are ignored.
        csv_filename: Path of the CSV file to create.

    Raises:
        KeyError: If a pair lacks ``'Question'`` or ``'Answer'``.
    """
    columns = ['Question', 'Answer']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        # Project every pair onto the two known columns so extra keys never
        # reach the DictWriter.
        csv_writer.writerows(
            {column: pair[column] for column in columns} for pair in qa_pairs
        )



In [7]:
def main(base_dir=None, output_dir=None):
    """Convert every transcript PDF under ``base_dir`` into a Q&A CSV.

    ``base_dir`` is expected to contain one sub-folder per company, each
    holding that company's transcript PDFs. For every PDF the text is
    extracted with ``extract_text_from_pdf``, split into pairs with
    ``extract_qa_pairs`` and, when at least one pair was found, written with
    ``write_qa_pairs_to_csv`` into ``output_dir``.

    Output file names:
      * ``{company}_earnings_qa_{date}.csv`` when the PDF name contains a
        ``YYYY-M-D`` style date.
      * ``{company}_earnings_report_qa_date_unknown_{pdf stem}.csv`` otherwise;
        the PDF's own name is included so several undated PDFs of one company
        do not overwrite each other.

    Args:
        base_dir: Root folder of the transcripts. Defaults to the original,
            machine-specific location used by this notebook.
        output_dir: Folder that receives the CSVs (created if missing).
            Defaults to ``base_dir`` with ``'_transcripts'`` replaced by
            ``'_csvs'``; if that replacement changes nothing, ``'_csvs'`` is
            appended instead so the output never lands inside the input tree.
    """
    if base_dir is None:
        # NOTE: machine-specific default kept for backward compatibility; pass
        # base_dir explicitly when running anywhere else.
        base_dir = '/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/UTY Transcripts-selected'

    if output_dir is None:
        output_dir = base_dir.replace('_transcripts', '_csvs')
        if output_dir == base_dir:
            # The name had no '_transcripts' part: build a sibling folder.
            # normpath drops any trailing separator so this is "<dir>_csvs",
            # not a child called "_csvs".
            output_dir = os.path.normpath(base_dir) + '_csvs'
    os.makedirs(output_dir, exist_ok=True)

    # Sorted listings make the processing order (and the log) reproducible.
    for company_name in sorted(os.listdir(base_dir)):
        company_path = os.path.join(base_dir, company_name)
        if not os.path.isdir(company_path):
            continue
        print(f'Processing company: {company_name}')

        for pdf_filename in sorted(os.listdir(company_path)):
            # Accept ".pdf" in any letter case.
            if not pdf_filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(company_path, pdf_filename)
            print(f'Parsing {pdf_filename} from {company_name} ...')

            # Name the CSV after the date embedded in the PDF file name.
            date_match = re.search(r'(\d{4}-\d{1,2}-\d{1,2})', pdf_filename)
            if date_match:
                date = date_match.group(1)
                csv_filename = f'{company_name}_earnings_qa_{date}.csv'
            else:
                pdf_stem = os.path.splitext(pdf_filename)[0]
                csv_filename = f'{company_name}_earnings_report_qa_date_unknown_{pdf_stem}.csv'
            csv_filepath = os.path.join(output_dir, csv_filename)

            # Parse the PDF and extract the QA pairs.
            text = extract_text_from_pdf(pdf_path)
            qa_pairs = extract_qa_pairs(text)

            # Only create a CSV when something was actually extracted.
            if qa_pairs:
                write_qa_pairs_to_csv(qa_pairs, csv_filepath)
                print(f'Extracted {len(qa_pairs)} question-answer pairs.')
                print(f'QA pairs have been written to {csv_filepath}')
            else:
                print(f'No question-answer pairs found in {pdf_filename}.')

    print('All PDFs have been parsed.')



In [8]:
# Run the batch conversion: walk the company folders and write a CSV for each
# transcript PDF that yields at least one question-answer pair.
main()


Parsing 20190425_American_Electric_Power_Co_Inc-_Earnings_Call_2019-4-25_DN000000002627073627.pdf ...
Extracted 43 question-answer pairs.
QA pairs have been written to earnings_qa_2019-4-25.csv
Parsing 20190725_American_Electric_Power_Co_Inc-_Earnings_Call_2019-7-25_DN000000002675713968.pdf ...
Extracted 32 question-answer pairs.
QA pairs have been written to earnings_qa_2019-7-25.csv
Parsing 20151022_American_Electric_Power_Co_Inc-_Earnings_Call_2015-10-22_FS000000002240299703.pdf ...
Extracted 66 question-answer pairs.
QA pairs have been written to earnings_qa_2015-10-22.csv
Parsing 20140725_American_Electric_Power_Co_Inc-_Earnings_Call_2014-7-25_FS000000002162696195.pdf ...
No 'Questions and Answers' section found in the text, starting at beginning
No question-answer pairs found.
Parsing 20160728_American_Electric_Power_Co_Inc-_Earnings_Call_2016-7-28_FS000000002294251800.pdf ...
No 'Questions and Answers' section found in the text, starting at beginning
No question-answer pairs fou