In [81]:
import pytesseract
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor
import pdfminer.high_level
import nest_asyncio

nest_asyncio.apply()

## Text Based PDF (Non-Scanned) to Text Conversion

In [None]:
# Load PDF from .input/ directory
pdf_path = 'input/mml-book.pdf'

# Extract text from PDF if PDF is not image-based
text = pdfminer.high_level.extract_text(pdf_path)

#save text to a file
with open('output/text.txt', 'w') as f:
    f.write(text)

## Image Based PDF (Scanned) to Text Conversion

In [None]:

# Function to process a single image
def process_image(image, page_num):
    text = pytesseract.image_to_string(image)
    # Append the text to the output file
    with open('output/mml-book.txt', 'a') as f:
        f.write(text)
    print(f"Page {page_num} processed")

# Process for image-based PDF
images = convert_from_path(pdf_path)

# Use ThreadPoolExecutor to parallelize the processing running 4 threads at a time
with ThreadPoolExecutor(max_workers=4) as executor:
    # We must use asynchronous processing to avoid blocking the main thread while waiting for the threads to complete
    futures = [executor.submit(process_image, image, i+1) for i, image in enumerate(images)]
    
# Ensure all threads are completed
for future in futures:
    future.result()


Page 2 processed
Page 1 processed
Page 6 processed
Page 3 processed
Page 4 processed
Page 5 processed
Page 7 processed
Page 10 processed
Page 8 processed
Page 9 processed
Page 14 processed
Page 11 processed
Page 13 processed
Page 12 processed
Page 15 processed
Page 16 processed
Page 17 processed
Page 20 processed
Page 19 processed
Page 18 processed
Page 22 processed
Page 23 processed
Page 21 processed
Page 24 processed
Page 25 processed
Page 26 processed
Page 27 processed
Page 28 processed
Page 29 processed
Page 30 processed
Page 31 processed
Page 32 processed
Page 33 processed
Page 34 processed
Page 35 processed
Page 36 processed
Page 37 processed
Page 39 processed
Page 38 processed
Page 40 processed
Page 42 processed
Page 41 processed
Page 43 processed
Page 44 processed
Page 45 processed
Page 48 processed
Page 47 processed
Page 46 processed
Page 49 processed
Page 51 processed
Page 50 processed
Page 52 processed
Page 53 processed
Page 56 processed
Page 54 processed
Page 55 processed
P

In [24]:
#read and print the first 1000 characters of the output file
image_extracted_text = open('output/mml-book.txt', 'r').read(1000)
print(image_extracted_text)

MATICS For
ARNING

Mare Peter Deisenroth..
A. Aldo Faisal

Contents

Foreword 1

Part 1 Mathematical Foundations 9
1 Introduction and Motivation 11
1.1 Finding Words for Intuitions 12
1.2 Two Ways to Read This Book 13
1.3. Exercises and Feedback 16
2 Linear Algebra 17
2.1 Systems of Linear Equations 19
2.2 Matrices 22
2.3 Solving Systems of Linear Equations 27
2.4 Vector Spaces 35
2.5 Linear Independence 40
2.6 Basis and Rank 44
2.7 Linear Mappings 48
2.8 Affine Spaces 61
2.9 Further Reading 63

Exercises 64
3 Analytic Geometry 70
3.1. Norms 71
3.2 Inner Products 72
3.3. Lengths and Distances 75
3.4 Angles and Orthogonality 76
3.5 Orthonormal Basis 78
3.6 Orthogonal Complement 79
3.7 Inner Product of Functions 80
3.8 Orthogonal Projections 81
3.9 Rotations 91
3.10 Further Reading 94

Exercises 96
4 Matrix Decompositions 98
4.1 Determinant and Trace 99

i
This material will be published by Cambridge University Press as Mathematics for Machine Learn-
ing by Marc Peter Deisenroth, A. Aldo

## Knowledge Extraction Using GPT

GPT can be used to extract knowledge from text. The text can be in the form of a PDF, image, or any other format. In this notebook, we will use GPT to extract knowledge from a PDF file. The PDF file can be text-based or image-based. The prompt indicates the type of information we want to extract, and the format of the response.

In [None]:
from openai import AsyncAzureOpenAI
from azure.identity import AzureCliCredential, get_bearer_token_provider
import aiofiles
import asyncio
import csv
import nest_asyncio
import time

# When running in Jupyter Notebook, we need to apply the nest_asyncio patch
nest_asyncio.apply()

In [None]:
credential = AzureCliCredential(tenant_id="")

client = AsyncAzureOpenAI(azure_endpoint=" ", api_version = '2025-01-01-preview', azure_ad_token_provider=get_bearer_token_provider(credential))

In [None]:

# System instruction for GPT
system_prompt = '''Extract any math vocabulary, theorems, mathematical concepts, or jargon in the text.
Return your findings in a CSV format with the following columns:

Vocabulary Definition/Description,

Draw the discription or definition from the text. Try to keep the definition as concise as possible and as close to the original text as possible.

Each row should represent a separate instance found in the text.
Do not add markdown or any other formatting—just raw CSV text.Do not include headers in the system response. Be sure to keep formate csv compliant. If 
the original text has commas, escape them with double quotes. For example, if the original text is "Hello, world!", the CSV should be "Hello, \"world!\"".
Do not include empty vocabulary or definitions. Each row should have a vocabulary and a definition. No null or empty string values.'''

# Read text file
with open('output/mml-book.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Set up batch processing
batch_size = 1000  # Reduce batch size to stay within GPT context limits
batches = [text[i:i+batch_size] for i in range(0, len(text), batch_size)]

# Initialize output CSV file with headers
with open('output/math_vocab.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Math Vocabulary", "Theorems", "Mathematical Concepts", "Jargon"])

async def process_batch(i, batch):
    """Processes a single batch and writes output asynchronously."""
    try:
        response = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": batch}
            ],
            model="gpt-4o-mini"
        )

        response_content = response.choices[0].message.content.strip()

        # Write output asynchronously do not add headers
        async with aiofiles.open('output/math_vocab.csv', 'a', newline='', encoding='utf-8') as f:
            await f.write(response_content + "\n")
            time.sleep(1) # Sleep for 1 second to avoid rate limiting, increase or decrease as needed
        print(f"✅ Batch {i+1} processed")

    except Exception as e:
        print(f"❌ Error processing batch {i+1}: {e}")

async def main():
    """Run all batch processes concurrently."""
    tasks = [process_batch(i, batch) for i, batch in enumerate(batches)]
    await asyncio.gather(*tasks)

# Run asyncio loop properly
if __name__ == "__main__":
    asyncio.run(main())

✅ Batch 16 processed
✅ Batch 21 processed
✅ Batch 36 processed
✅ Batch 83 processed
✅ Batch 15 processed
✅ Batch 108 processed
✅ Batch 32 processed
✅ Batch 8 processed
✅ Batch 7 processed
✅ Batch 29 processed
✅ Batch 22 processed
✅ Batch 57 processed
✅ Batch 20 processed
✅ Batch 14 processed
✅ Batch 30 processed
✅ Batch 12 processed
✅ Batch 11 processed
✅ Batch 28 processed
✅ Batch 5 processed
✅ Batch 23 processed
✅ Batch 9 processed
✅ Batch 35 processed
✅ Batch 53 processed
✅ Batch 39 processed
✅ Batch 60 processed
✅ Batch 86 processed
✅ Batch 41 processed
✅ Batch 131 processed
✅ Batch 26 processed
✅ Batch 45 processed
✅ Batch 48 processed
✅ Batch 31 processed
✅ Batch 34 processed
✅ Batch 56 processed
✅ Batch 68 processed
✅ Batch 25 processed
✅ Batch 44 processed
✅ Batch 62 processed
✅ Batch 46 processed
✅ Batch 102 processed
✅ Batch 6 processed
✅ Batch 82 processed
✅ Batch 58 processed
✅ Batch 38 processed
✅ Batch 87 processed
✅ Batch 76 processed
✅ Batch 85 processed
✅ Batch 42 proc