#PyMupdf4llm for PDF extraction with markdown format

In [3]:
import pymupdf4llm

# Convert the PDF document to Markdown
md_text = pymupdf4llm.to_markdown("english_handbook_pdf\Module Manual SoSe25 Electrical Engineering and Embedded Systems M.Eng_.pdf")

# If you need to save the Markdown text to a file
import pathlib
pathlib.Path("output.md").write_bytes(md_text.encode('utf-8'))


Processing english_handbook_pdf\Module Manual SoSe25 Electrical Engineering and Embedded Systems M.Eng_.pdf...
[                                        ] (0/41[                                        ] ( 1/41)

  md_text = pymupdf4llm.to_markdown("english_handbook_pdf\Module Manual SoSe25 Electrical Engineering and Embedded Systems M.Eng_.pdf")




51210

In [None]:
import pymupdf4llm
import pandas as pd
import os
from together import Together  # Import Together AI
import re
from dotenv import load_dotenv

#loading environment 
load_dotenv()
TOGETHER_API_KEY = os.getenv("API_KEY") #Create Own API key from Together AI 

INPUT_DIRECTORY = "english_handbook_pdf"  # Folder containing PDFs
RESULTS_DIRECTORY = "results"
MAX_TOKENS_FOR_CONTENT = 4000
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K"
SAFETY_MODEL = "meta-llama/Meta-Llama-Guard-3-8B"

# Initialize Together AI client
client = Together(api_key=TOGETHER_API_KEY)

# Extract text from the PDF
def extract_pdf_content(file_path):
    """Extracts text from all pages of a PDF."""
    """Extracts text from a PDF file and converts it to Markdown format."""
    content = pymupdf4llm.to_markdown(file_path)
    return content.strip()

def segment_text_with_table_detection(text, max_tokens, overlap_ratio=0.3):
    """Segment text while ensuring tables are not split."""
    table_keywords = ["Module Number", "Course Title", "ECTS", "Duration", "Assessment", "Workload", "Professor Name/Module Manager"]
    words = text.split()
    segments = []
    step = int(max_tokens * (1 - overlap_ratio))

    inside_table = False
    current_segment = []
    current_token_count = 0

    for word in words:
        if any(keyword in word for keyword in table_keywords):
            inside_table = True  # Detect table start
        elif inside_table and word.strip() == "":
            inside_table = False  # Detect table end

        if current_token_count + len(word.split()) <= max_tokens or inside_table:
            current_segment.append(word)
            current_token_count += len(word.split())
        else:
            segments.append(" ".join(current_segment))
            current_segment = [word]
            current_token_count = len(word.split())

    if current_segment:
        segments.append(" ".join(current_segment))

    return segments


# List all PDF files in the directory
pdf_files = [file for file in os.listdir(INPUT_DIRECTORY) if file.endswith(('.pdf', '.PDF'))]

# DataFrame to store metadata
all_metadata = pd.DataFrame()

# Loop through each PDF file and process
for pdf_file in pdf_files:
    pdf_content = extract_pdf_content(os.path.join(INPUT_DIRECTORY, pdf_file))

    # Segment the PDF content into chunks
    segments = segment_text_with_table_detection(pdf_content, MAX_TOKENS_FOR_CONTENT, overlap_ratio=0.4)



    # Process each segment using Together AI's Llama model
    for segment in reversed(segments):
        messages = [
            {"role": "system", "content": "You are an AI assistant that extracts structured tables from unstructured text. Your task is to detect and reconstruct tables even when they are not clearly formatted."},
            {"role": "user", "content": f"""
            Extract **all tables** from the following text. Some tables span multiple pages, while others fit on a single page. You must detect **both** types.

            **How to Identify Tables in This Text:**
            - A table **contains structured information**, often using columns like **Module Number, Course Title, ECTS, Duration, Professor name**.
            - A table **may not have a clear separator** (like `|`), but it follows a pattern: **columns are grouped together** and followed by structured data.
            - If a list contains **numbers, credits, or structured course details**, assume it is a table.

            **Instructions:**
            - Identify **all tables**, even those that appear in a single-page or span multiple pages.
            - If a table is **not formatted properly**, reconstruct it into a structured table.
            - Ensure **no tables are skipped**.

            **Extracted Text:**
            {segment}

            **Output Format:**
            ```
            Table 1:
            | Column1 | Column2 | Column3 | ...
            | --- | --- | --- | ...
            | Data 1  | Data 2 | Data 3 | ...

            Table 2:
            | Column1 | Column2 | Column3 | ...
            | --- | --- | --- | ...
            ```
            """}
        ]


        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=4048,
            temperature=0.0,
            top_p=0.7,
            top_k=50,
            repetition_penalty=1,
            stop=["<|eot_id|>", "<|eom_id|>"],
            safety_model=SAFETY_MODEL
        )

        metadata_text = response.choices[0].message.content.strip()

        # Convert tabular text to DataFrame (Automatically detect column headers)
        # Convert tabular text to DataFrame (Detect headers dynamically)
    rows = [line.split("|") for line in metadata_text.split("\n") if "|" in line]

    # Print raw AI response for debugging
    print("\n🔹 Raw AI Response:\n")
    print(metadata_text)

    if rows:
        headers = [col.strip() for col in rows[0]]  # Extract headers from the first row
        data_rows = rows[1:]  # Exclude headers from data

        # Ensure column count matches
        max_cols = len(headers)
        cleaned_data_rows = [row[:max_cols] if len(row) >= max_cols else row + [""] * (max_cols - len(row)) for row in data_rows]

        # Create DataFrame
        df = pd.DataFrame(cleaned_data_rows, columns=headers)
        df.insert(0, "File Name", pdf_file)  # Add File Name column
        all_metadata = pd.concat([all_metadata, df], ignore_index=True)  # Append data

# Create "results" directory if it doesn't exist
if not os.path.exists(RESULTS_DIRECTORY):
    os.makedirs(RESULTS_DIRECTORY)

# # Export to CSV and XLSX
# csv_path = os.path.join(RESULTS_DIRECTORY, "course_metadata.csv")
# xlsx_path = os.path.join(RESULTS_DIRECTORY, "course_metadata.xlsx")
# all_metadata.to_csv(csv_path, index=False)
# all_metadata.to_excel(xlsx_path, index=False)

# Define the Markdown file path
# Define the Markdown file path
markdown_path = os.path.join(RESULTS_DIRECTORY, "course_metadata.md")

# Convert DataFrame to Markdown format
def dataframe_to_markdown(df):
    """Convert a DataFrame to a Markdown-formatted table without the 'File Name' column."""
    if df.empty:
        return "No data extracted."
    
    # Remove 'File Name' column if it exists
    if "File Name" in df.columns:
        df = df.drop(columns=["File Name"])

    # Convert DataFrame to Markdown format
    md_table = df.to_markdown(index=False)
    return md_table

# Save the extracted tables in Markdown format
with open(markdown_path, "w", encoding="utf-8") as md_file:
    md_file.write("# Extracted Course Metadata\n\n")
    md_file.write(dataframe_to_markdown(all_metadata))

# Display the extracted Markdown-formatted data
print("\n✅ Extracted Metadata in Markdown format:\n")
print(dataframe_to_markdown(all_metadata))

print(f"\n✅ Metadata extraction completed! Results saved at:\nMarkdown: {markdown_path}")



# Display the extracted metadata with **all detected columns**
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Extracted Course Metadata", dataframe=all_metadata)

from IPython.display import display
display(all_metadata)

# Print extracted metadata to console
print("\n✅ Extracted Metadata Table:\n")
print(all_metadata)

# print(f"\n✅ Metadata extraction completed! Results saved at:\nCSV: {csv_path}\nXLSX: {xlsx_path}")


Processing english_handbook_pdf\Module_Handbook_BIDA_eng_emden_leer.pdf...

🔹 Raw AI Response:

Here are the extracted tables from the text:

**Table 1: Module Handbook Master’s Study Program Business Intelligence and Data Analytics (M.Eng.)**

| Module Number | Course Title | ECTS | Duration | Professor name |
| --- | --- | --- | --- | --- |
| 1 | Introduction to Data Sciences | 5 | 3 Semester | Prof. Dr. Joachim Schwarz |
| 2 | Communication & Culture | 5 | 3 Semester | Prof. Maria Krüger-Basener |
| 3 | Computer Sciences | 5 | 3 Semester | Prof. Dr. Rüdiger Götting |
| 4 | Machine Learning | 5 | 3 Semester | Prof. Dr. Elmar Wings |
| 5 | Data Management | 5 | 3 Semester | Dr. Tirazheh Zare Garizy |
| 6 | Controlling | 5 | 3 Semester | Prof. Dr. Carsten Wilken |
| 7 | Business Analytics | 5 | 3 Semester | Prof. Dr. Elmar Wings |
| 8 | Project T / Project B | 5 | 3 Semester | Prof. Dr. Elmar Wings |
| 9 | Master Thesis and Colloquium | 30 | 3 Semester | Professor / lecturer of Master 

Unnamed: 0,File Name,Unnamed: 2,Module Number,Course Title,ECTS,Duration,Professor name,Unnamed: 8
0,Module_Handbook_BIDA_eng_emden_leer.pdf,,---,---,---,---,---,
1,Module_Handbook_BIDA_eng_emden_leer.pdf,,1,Introduction to Data Sciences,5,3 Semester,Prof. Dr. Joachim Schwarz,
2,Module_Handbook_BIDA_eng_emden_leer.pdf,,2,Communication & Culture,5,3 Semester,Prof. Maria Krüger-Basener,
3,Module_Handbook_BIDA_eng_emden_leer.pdf,,3,Computer Sciences,5,3 Semester,Prof. Dr. Rüdiger Götting,
4,Module_Handbook_BIDA_eng_emden_leer.pdf,,4,Machine Learning,5,3 Semester,Prof. Dr. Elmar Wings,
...,...,...,...,...,...,...,...,...
73,Module_Handbook_BIDA_eng_emden_leer.pdf,,3.6 Quality Management,Prof. Dr. Monika Blattmeier,Seminar form lectures presentations and paper...,Mandatory elective module,60,90
74,Module_Handbook_BIDA_eng_emden_leer.pdf,,3.7 Sustainable Innovation Management,Prof. Dr.-Eng. Armando W. Colombo,Lecture,Mandatory elective module,60,90
75,Module_Handbook_BIDA_eng_emden_leer.pdf,,3.8 Advanced Project Management,Prof. Dr. Andreas Haja,"Lecture, group discussion, case studies",Mandatory elective module,60,90
76,Module_Handbook_BIDA_eng_emden_leer.pdf,,3.9 Current Topic T,Lecturer of the study program,Solving of a problem independently under the ...,Mandatory elective module,30,120



✅ Extracted Metadata Table:

                                  File Name     \
0   Module_Handbook_BIDA_eng_emden_leer.pdf      
1   Module_Handbook_BIDA_eng_emden_leer.pdf      
2   Module_Handbook_BIDA_eng_emden_leer.pdf      
3   Module_Handbook_BIDA_eng_emden_leer.pdf      
4   Module_Handbook_BIDA_eng_emden_leer.pdf      
..                                      ... ..   
73  Module_Handbook_BIDA_eng_emden_leer.pdf      
74  Module_Handbook_BIDA_eng_emden_leer.pdf      
75  Module_Handbook_BIDA_eng_emden_leer.pdf      
76  Module_Handbook_BIDA_eng_emden_leer.pdf      
77  Module_Handbook_BIDA_eng_emden_leer.pdf      

                              Module Number  \
0                                      ---    
1                                        1    
2                                        2    
3                                        3    
4                                        4    
..                                      ...   
73                  3.6 Quality Manageme

Trying to merge: