In [1]:
pip install pymupdf tqdm pytesseract pdf2image

Note: you may need to restart the kernel to use updated packages.


In [2]:
import concurrent.futures
import csv
import os
import threading

import pymupdf
import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm

In [3]:
# Path of the Tesseract executable that pytesseract should invoke. The TESSERACT_CMD
# environment variable overrides the machine-specific default below.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'E:\Tesseract-OCR\tesseract.exe'
)

# The PyMuPDF documentation warns against using the library from several threads at
# once. This lock serializes the PyMuPDF section so the function can be called from a
# thread pool; the OCR fallback runs outside the lock and can still overlap.
_PYMUPDF_LOCK = threading.Lock()


def convert_pdf_to_txt(pdf_file,
                       output_folder='E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\TXTs',
                       min_text_length=200):
    """Write the text of one PDF to a UTF-8 ``.txt`` file, falling back to OCR.

    The embedded text layer is read with PyMuPDF first. If the stripped text is
    shorter than ``min_text_length`` characters, every page is rendered with
    ``convert_from_path`` and recognised with ``pytesseract.image_to_string``, and
    the OCR output replaces the extracted text.

    Args:
        pdf_file: Path of the PDF to convert.
        output_folder: Folder that receives the ``.txt`` file; created if missing.
        min_text_length: Minimum number of non-whitespace-trimmed characters the
            text layer must contain for OCR to be skipped.

    Returns:
        A status string: ``"Converted <pdf> to <txt>"`` on success, or
        ``"Error converting <pdf>: <exception>"`` if extraction, OCR or writing
        raised. Errors are reported rather than raised so one bad file does not
        abort a batch run.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure the folder exists

    pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]
    txt_file = os.path.join(output_folder, pdf_stem + '.txt')

    try:
        # Attempt to extract text without OCR (one thread at a time, see lock above)
        with _PYMUPDF_LOCK:
            with pymupdf.open(pdf_file) as pdf:
                text = "".join(page.get_text() for page in pdf)

        # Too little text usually means a scanned document: replace it with OCR output
        if len(text.strip()) < min_text_length:
            text = "".join(
                pytesseract.image_to_string(image)
                for image in convert_from_path(pdf_file)
            )

        # Save the text content to a file
        with open(txt_file, 'w', encoding='utf-8') as file:
            file.write(text)

        return f"Converted {pdf_file} to {txt_file}"
    except Exception as e:
        return f"Error converting {pdf_file}: {e}"

# Folder containing PDFs
pdf_folder = 'E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\PDFs'

# Match the extension case-insensitively so files named "*.PDF" are not silently
# skipped, and sort so the processing order does not depend on os.listdir's ordering.
pdf_files = sorted(
    os.path.join(pdf_folder, name)
    for name in os.listdir(pdf_folder)
    if name.lower().endswith('.pdf')
)

# Convert PDFs to text files using ThreadPoolExecutor; executor.map yields the
# status strings in the same order as pdf_files.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    results = list(tqdm(executor.map(convert_pdf_to_txt, pdf_files), total=len(pdf_files)))

# Print the results for feedback
for result in results:
    print(result)

  0%|          | 0/270 [00:00<?, ?it/s]

100%|██████████| 270/270 [01:59<00:00,  2.25it/s]

Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._11_2022.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._11_2022.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._17_q_2023.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._17_q_2023.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._19_2022.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._19_2022.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._2015_2022.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._2015_2022.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._2016_2022.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._2016_2022.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._21_2021.pdf to E:\SCP Judgements\Small Corpus (Metadata incl.)\TXTs\c.a._21_2021.txt
Converted E:\SCP Judgements\Small Corpus (Metadata incl.)\PDFs\c.a._27




In [4]:
def get_metadata_mapping(metadata_csv_path):
    """Read the metadata CSV and map each cleaned PDF title to its metadata.

    The key is the row's "PDF Title" with every '%20' replaced by a space. If the
    same cleaned title occurs more than once, the last row wins.

    Args:
        metadata_csv_path: Path of a CSV file with a header row containing the
            columns "PDF Title", "Case Subject", "Case No", "Case Title",
            "Author Judge" and "Judgment Date".

    Returns:
        dict mapping cleaned PDF title -> dict of the five metadata columns.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    metadata_fields = ("Case Subject", "Case No", "Case Title", "Author Judge", "Judgment Date")
    metadata_mapping = {}
    # 'utf-8-sig' drops a leading byte-order mark (otherwise it would become part of
    # the first column name); newline='' is what the csv module requires so that
    # line breaks inside quoted fields are preserved.
    with open(metadata_csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Clean up the PDF Title by replacing '%20' with spaces
            pdf_title = row["PDF Title"].replace('%20', ' ')
            metadata_mapping[pdf_title] = {field: row[field] for field in metadata_fields}
    return metadata_mapping

def convert_txt_to_dict(txt_file, metadata):
    """Build the record for one judgment from its TXT file and its metadata.

    Returns a dict with three keys: "title" (the TXT file's base name with its
    extension swapped for ".pdf"), "content" (the full UTF-8 text of the file) and
    "metadata" (the object passed in, unchanged).
    """
    stem, _extension = os.path.splitext(os.path.basename(txt_file))

    with open(txt_file, 'r', encoding='utf-8') as handle:
        body = handle.read()

    return {
        "title": stem + ".pdf",
        "content": body,
        "metadata": metadata,
    }

# Define paths
txt_folder = 'E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\TXTs'
metadata_csv_path = 'E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\Metadata\\judgements_metadata.csv'
output_file = 'E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\CSV\\SCP_Judgements_with_Metadata.csv'

# Metadata columns copied into the output, in output order
METADATA_FIELDS = ["Case Subject", "Case No", "Case Title", "Author Judge", "Judgment Date"]

# Ensure output folder exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Load metadata from CSV
metadata_mapping = get_metadata_mapping(metadata_csv_path)

# Collect document data (sorted so the output row order is deterministic)
documents = []
txt_files = sorted(os.path.join(txt_folder, f) for f in os.listdir(txt_folder) if f.endswith('.txt'))

for txt_file in tqdm(txt_files):
    # Swap only the trailing extension. str.replace('.txt', '.pdf') would also rewrite
    # a ".txt" occurring earlier in the name and break the metadata lookup.
    txt_filename = os.path.splitext(os.path.basename(txt_file))[0] + '.pdf'
    # Get metadata by matching the TXT file name to the metadata CSV's cleaned PDF Title column
    if txt_filename in metadata_mapping:
        metadata = metadata_mapping[txt_filename]
        document_data = convert_txt_to_dict(txt_file, metadata)
        documents.append(document_data)
    else:
        print(f"No metadata found for {txt_filename}")

# Save the combined data to a new CSV file
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["title", *METADATA_FIELDS, "content"])  # Header

    for doc in documents:
        writer.writerow([
            doc["title"],
            *(doc["metadata"][field] for field in METADATA_FIELDS),
            doc["content"],
        ])

print(f"Saved all document data to {output_file}")

100%|██████████| 270/270 [00:02<00:00, 96.76it/s] 


Saved all document data to E:\SCP Judgements\Small Corpus (Metadata incl.)\CSV\SCP_Judgements_with_Metadata.csv
