In [1]:
import os
import random

import PyPDF2
from typing import Optional

from openai import OpenAI
import ast
import concurrent.futures

OPENAI_API_KEY = "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"

In [2]:
def validate_pdf(file_path: str) -> bool:
    if not os.path.exists(file_path):
        print(f"Error: File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        print("Error: File is not a PDF")
        return False
    return True

In [3]:
def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:
    if not validate_pdf(file_path):
        return None
    
    try:
        with open(file_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get total number of pages
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")
            
            extracted_text = []
            total_chars = 0
            
            # Iterate through all pages
            for page_num in range(num_pages):
                # Extract text from page
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")
            
            final_text = '\n'.join(extracted_text)
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text
            
    except PyPDF2.PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

In [4]:
def create_sentence_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at sentence boundaries close to the target chunk size.
    """
    import re
    
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        sentence_length = len(sentence) + 1  # +1 for the space
        if current_length + sentence_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks


In [5]:
def process_chunk(chunk, sys_prompt, model="gpt-4o-mini", temperature=0.0):
    client = OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": sys_prompt},
            {
                "role": "user",
                "content": chunk
            }
        ]
    )

    return completion.choices[0].message.content

In [6]:
def text_to_speech_audio_openai(text: str, voice_id: str = "alloy") -> bytes:
    speeds = [1.05, 1.1, 1.15, 1.2, 1.25]

    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.audio.speech.create(
        model="tts-1",
        voice=voice_id,
        input=text,
        speed=random.choice(speeds)
    )
    return response.content


In [7]:
def clean_text(chunks):
    sys_prompt = """
        You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

        The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

        Remove any legal or financial disclaimer sections. Some examples are sections with text similar to the following text:
            - "does not provide individually tailored investment advice"
            - "facts and views presented in Morgan Stanley Research may not reflect information"
            - "this document is not intended as an offer or solicitation for the purchase or sale of any financial instrument"
            - "Morgan Stanley Research may not be reprinted, sold, or redistributed without written consent"
            - "responsible for the preparation of Morgan Stanley Research have received compensation"
            - "Morgan Stanley Research has been published in accordance with our conflict management policy"
            - "Morgan Stanley Research is not a product of the research departments of Morgan Stanley Smith Barney LLC"
            - "Morgan Stanley Smith Barney LLC is a registered broker-dealer and a member of FINRA and SIPC"    
            - "valuation methodology and risks associated with any recommendation"
            - "The securities, instruments, or strategies discussed may not be suitable for all investors

        Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

        Please be smart with what you remove and be creative ok?

        Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

        Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

        PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES

        REMEMBER TO NOT REMOVE THE AUTHORS NAME OR TITLE OF DOCUMENT FROM THE TEXT

        ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
        Here is the text:
        """

    processed_text = [""] * len(chunks)

    def process_and_append(index, chunk):
        return index, process_chunk(chunk, sys_prompt, model="gpt-4o-mini", temperature=0.0)

    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        future_to_chunk = {executor.submit(process_and_append, i, chunk): i for i, chunk in enumerate(chunks)}
        for future in concurrent.futures.as_completed(future_to_chunk):
            index, result = future.result()
            print(f"Completed processing chunk {index}.")
            processed_text[index] = result

    processed_text = "\n".join(processed_text)

    return processed_text

In [8]:
pdf_path = '/Users/joneickmeier/Downloads/ARZHANG20241030204819622.pdf'
#pdf_path = '/Users/joneickmeier/Documents/Papers Library/43bfb9b8_6485_11ef_9750_f050912173bd_604.pdf'
#pdf_path = '/Users/joneickmeier/Documents/EM.FM_.WhitePaper.Assets.pdf'

# Extract the filename from the path and remove the .pdf extension
filename = os.path.basename(pdf_path).replace('.pdf', '')

In [9]:
extracted_text = extract_text_from_pdf(pdf_path, max_chars=1000000)
chunks = create_sentence_bounded_chunks(extracted_text, target_chunk_size=4000)

Processing PDF with 33 pages...
Processed page 1/33
Processed page 2/33
Processed page 3/33
Processed page 4/33
Processed page 5/33
Processed page 6/33
Processed page 7/33
Processed page 8/33
Processed page 9/33
Processed page 10/33
Processed page 11/33
Processed page 12/33
Processed page 13/33
Processed page 14/33
Processed page 15/33
Processed page 16/33
Processed page 17/33
Processed page 18/33
Processed page 19/33
Processed page 20/33
Processed page 21/33
Processed page 22/33
Processed page 23/33
Processed page 24/33
Processed page 25/33
Processed page 26/33
Processed page 27/33
Processed page 28/33
Processed page 29/33
Processed page 30/33
Processed page 31/33
Processed page 32/33
Processed page 33/33

Extraction complete! Total characters: 66109


In [10]:
processed_text = clean_text(chunks)

Completed processing chunk 15.
Completed processing chunk 13.
Completed processing chunk 11.
Completed processing chunk 6.
Completed processing chunk 1.
Completed processing chunk 5.
Completed processing chunk 3.
Completed processing chunk 8.
Completed processing chunk 7.
Completed processing chunk 9.
Completed processing chunk 16.
Completed processing chunk 14.
Completed processing chunk 12.
Completed processing chunk 0.
Completed processing chunk 17.
Completed processing chunk 4.
Completed processing chunk 2.
Completed processing chunk 10.


In [11]:
if len(processed_text) > 150000:
    summarize_pmt = """
        You are an expert text analyzer and summarizer. Your task is to process a very long document exceeding 50,000 tokens and 
        create a series of smaller documents that collectively preserve all the detailed content of the original.

        Instructions:

        Divide the Document:

        Break down the original document into logical sections or chapters, ensuring that each part is within a manageable token limit (e.g., 2,000-3,000 tokens).
        Maintain the original sequence and structure to preserve the flow of information.
        Summarize Each Section:

        For each section, produce a detailed summary that includes all key points, arguments, data, and nuances presented in that part.
        Ensure that no critical information is omitted in the summary.
        Preserve Detail and Clarity:

        Use clear and precise language to convey complex ideas effectively.
        Retain any important technical terms, definitions, and explanations as presented in the original text.
        Maintain Consistency:

        Keep the tone, style, and perspective consistent throughout all summarized sections.
        Ensure that references to figures, tables, or citations are accurately reflected.
        Output Format:

        Present each summarized section sequentially, labeled appropriately (e.g., "Section 1 Summary," "Section 2 Summary," etc.).
        Do not include any analysis or personal opinions—focus solely on conveying the original content.
        Goal:

        The end result should be a collection of smaller documents that, when combined, represent a complete and detailed summary of the original long document without any loss of information.
    """
    processed_text = process_chunk(processed_text, summarize_pmt, model="gpt-4o-mini", temperature=0.0)


In [12]:
sys_prompt = """
You are a distinguished professor delivering a seminar to an audience of fellow professors. 
You have won several teaching awards for your engaging and clear presentations. 
Your students love your lectures and you are a master at explaining complex topics in a way that is easy to understand.

Your presentation is at a high technical level, covering advanced and complex topics in your field of expertise. 

We are in an alternate universe where actually you write every down every line that you speak. Please create a detailed script of your seminar that includes:

An engaging introduction that outlines the significance of the topic.
In-depth explanations of complex concepts, theories, or methodologies.
Relevant examples, case studies, or research findings that illustrate key points.
A thoughtful conclusion that summarizes the main ideas and suggests future directions or implications.
Use appropriate academic language and incorporate technical terminology relevant to advanced studies in the discipline. Ensure the content reflects a deep understanding of the subject matter and is suitable for an expert audience.

Remember to not add any markdown formatting, stop adding special characters that markdown capatilisation etc likes.

TRY NOT TO LEAVE OUT ANYTHING IMPORTANT.

THERE SHOULD BE NO HEADINGS, SUBHEADINGS, OR ANYTHING LIKE THAT

YOUR PRESENTATION MUST BE WRITEN OUT IN FULL, NO MARKDOWN FORMATTING, NO SPECIAL CHARACTERS, NO ACKNOWLEDGEMENTS ABOUT MY QUESTIONS, JUST THE TEXT OF THE SCRIPT
"""

In [13]:
script = process_chunk(processed_text, sys_prompt, model="gpt-4o-mini", temperature=0.1)

In [14]:
sys_prompt = """
You are a highly engaging and creative screenwriting editor tasked with revising a seminar script delivered 
by a distinguished professor to an audience of graduate students and fellow professors. 

Your goal is to enhance the script's engagement and captivate the audience while preserving its high technical level and depth.

Please edit the script to:

Infuse compelling storytelling elements and vivid language.
Incorporate analogies, metaphors, or anecdotes to illustrate complex concepts.
Enhance the flow and pacing for better audience engagement.
Maintain appropriate academic language and technical terminology.
Ensure the revised script brings the subject matter to life, making it more relatable 
and engaging for the audience without compromising its technical integrity.

YOU MAY ALSO WANT TO SHORTEN THE SCRIPT IF IT IS TOO LONG. ANYTHING OVER 2000 WORDS IS TOO LONG.
"""

In [15]:
final_script = process_chunk(script, sys_prompt, model="gpt-4o", temperature=0.1)

In [16]:
question_prompt = """
You are a group of highly intelligent graduate students who have just attended a seminar delivered by a 
distinguished professor on advanced topics discussed in the seminar. Based on the lecture, generate a 
list of 5 insightful and challenging questions that delve deeper into the subject matter. 

These questions should reflect a deep understanding of the content and encourage further discussion and exploration. 

Ensure that the questions are open-ended and pertain to complex aspects of the topic covered.

Please output the questions in a structured format that's easy for code to process, such as a JSON array. Format your response as follows:

{
  "questions": [
    "Question 1",
    "Question 2",
    "Question 3",
    "Question 4",
    "Question 5"
  ]
}

Each question should be a string within the array. Do not include any additional text outside of the JSON structure. """

In [17]:
questions = process_chunk(final_script, question_prompt, model="gpt-4o-mini", temperature=0.1)
questions = ast.literal_eval(questions)

In [18]:
ty = ['Thank you for asking!',
      'I appreciate your question!', 
      'Thanks for bringing that up!', 
      'Great question, thanks!', 
      'Thank you for your curiosity!', 
      'I’m glad you asked!', 
      'That’s a fantastic question, thank you!', 
      'I appreciate you reaching out with your question!', 
      'Thanks for your insightful question!',
      'Thank you for engaging with such a great question!'
    ]

conclusions = [
    'Does anyone have any further questions?',
    'Is there anything else we should discuss?',
    'Are there any other questions?',
    'Are there any other inquiries that need addressing?',
    'Is there anything else you\'re curious about?',
    'Would anyone like to ask anything more?',
    'Is there another topic or question you\'d like to explore?',
    'Any other questions or clarifications needed?',
    'Feel free to ask if there\'s anything else on your mind.',
    'Do you have any other questions or thoughts?',
]

In [19]:
def process_answer(index, question):
    print(f"Processing question {question}")
    answer_prompt = f"""
    You are a highly engaging professor who has just given a seminar presentation to a group of professors and graduate students.

    Based on the lecture and the provided raw source information, provide a detailed and insightful answer to the following question:

    {question}

    Instructions for your answer:

    - Ensure the answer is comprehensive but concise (less than 100 words).
    - Structure your response well, reflecting a deep understanding of the content.
    - Encourage further discussion and exploration of the topic.
    - If the question is not related to the topic, politely mention that.
    - If you are unsure about the answer, acknowledge it.

    The script of the seminar is:
    {final_script}

    The raw source information is:
    {processed_text}
    """

    chosen_ty = random.choice(ty)
    ty.remove(chosen_ty)
    answer = chosen_ty + ' '
    answer += process_chunk(final_script, answer_prompt, model="gpt-4o", temperature=0.25) + ' '
    
    chosen_conclusion = random.choice(conclusions)
    conclusions.remove(chosen_conclusion)
    answer += chosen_conclusion + ' '

    return {'question': question, 'answer': answer}

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_answer, i, question) for i, question in enumerate(questions["questions"], 1)]
    answers = [future.result() for future in concurrent.futures.as_completed(futures)]
    answers.sort(key=lambda x: questions["questions"].index(x['question']))


Processing question How do the recent trends in refinancing between public and private markets reflect the broader economic conditions, and what implications might this have for future borrower behavior?
Processing question In what ways might the narrowing gap in origination spreads influence the competitive landscape of direct lending versus public credit, particularly in terms of borrower covenants and lender risk appetite?
Processing question Considering the sector-specific nuances in direct lending, how should lenders adjust their strategies to mitigate risks associated with rising gross leverage in sectors like Energy and Healthcare?
Processing question What role do equity injections from sponsors play in the default landscape, and how might this practice evolve as market conditions continue to fluctuate?
Processing question As private credit continues to grow, what potential regulatory challenges could arise, and how might these impact the strategies of private debt funds and the

In [21]:
voices = [
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer",
]
original_voice_id = "alloy"

In [22]:
final_script_content = []
paragraphs = final_script.split('\n\n')
for paragraph in paragraphs:
    final_script_content.append(('professor', paragraph))

In [23]:
combined_audio = b""

def process_chunk_parallel(index, speaker, text):
    print(f"Processing chunk {index} of {len(final_script_content)}")
    return index, text_to_speech_audio_openai(text, original_voice_id)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_chunk_parallel, i, speaker, text) for i, (speaker, text) in enumerate(final_script_content, 1)]
    results = [future.result() for future in concurrent.futures.as_completed(futures)]
    results.sort()  # Ensure the results are in the correct order
    for index, audio in results:
        combined_audio += audio

text = "QUESTIONS?"
combined_audio += text_to_speech_audio_openai(text, original_voice_id)

def process_answer_parallel(index, answer_dict):
    print(f"Processing answer {index} of {len(answers)}")
    voice_id = voices[index-1]
    question = answer_dict['question']
    answer = answer_dict['answer']
    
    audio = text_to_speech_audio_openai(question, voice_id)
    audio += text_to_speech_audio_openai(answer, original_voice_id)
    
    return index, audio

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_answer_parallel, i, answer_dict) for i, answer_dict in enumerate(answers, 1)]
    results = [future.result() for future in concurrent.futures.as_completed(futures)]
    results.sort()  # Ensure the results are in the correct order
    for index, audio in results:
        combined_audio += audio

text = "IF THERE ARE NO OTHER QUESTIONS, I WOULD LIKE TO THANK THE AUDIENCE FOR THEIR ATTENTION TODAY. IF YOU THINK OF QUESTIONS LATER, PLEASE FEEL FREE TO SEND ME A NOTE. HAVE A GREAT DAY"
combined_audio += text_to_speech_audio_openai(text, original_voice_id)

with open(f"./audio/seminar_qa_{filename}.mp3", "wb") as f:
    f.write(combined_audio)


Processing chunk 1 of 12
Processing chunk 2 of 12
Processing chunk 3 of 12
Processing chunk 4 of 12
Processing chunk 5 of 12
Processing chunk 6 of 12
Processing chunk 7 of 12
Processing chunk 8 of 12
Processing chunk 9 of 12
Processing chunk 10 of 12
Processing chunk 11 of 12
Processing chunk 12 of 12
Processing answer 1 of 5Processing answer 2 of 5

Processing answer 3 of 5
Processing answer 4 of 5
Processing answer 5 of 5


In [24]:
client = OpenAI(api_key=OPENAI_API_KEY)
audio_file = open(f"./audio/seminar_qa_{filename}.mp3", "rb")
transcript = client.audio.transcriptions.create(
  file=audio_file,
  model="whisper-1",
  response_format="verbose_json",
  timestamp_granularities=["segment"]
)

print(transcript.words)

None


In [25]:
transcript.segments

[TranscriptionSegment(id=0, avg_logprob=-0.21875, compression_ratio=1.5565749406814575, end=1.7200000286102295, no_speech_prob=0.004673466086387634, seek=0, start=0.0, temperature=0.0, text=' Good afternoon, esteemed colleagues', tokens=[50364, 2205, 6499, 11, 4065, 15485, 7734, 50450]),
 TranscriptionSegment(id=1, avg_logprob=-0.21875, compression_ratio=1.5565749406814575, end=4.460000038146973, no_speech_prob=0.004673466086387634, seek=0, start=1.7200000286102295, temperature=0.0, text=' and fellow seekers of financial wisdom.', tokens=[50450, 293, 7177, 47915, 295, 4669, 10712, 13, 50587]),
 TranscriptionSegment(id=2, avg_logprob=-0.21875, compression_ratio=1.5565749406814575, end=6.039999961853027, no_speech_prob=0.004673466086387634, seek=0, start=4.460000038146973, temperature=0.0, text=' Today, we embark on a journey', tokens=[50587, 2692, 11, 321, 29832, 322, 257, 4671, 50666]),
 TranscriptionSegment(id=3, avg_logprob=-0.21875, compression_ratio=1.5565749406814575, end=9.0, no_