# In [None]:
import os
import re
import fitz
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one single-line string.

    Each page's text has its newlines replaced by spaces, and the pages are
    concatenated with no extra separator.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The concatenated text. If an exception is raised while opening or
        reading the document, the error is printed and whatever text was
        collected before the failure is returned (possibly an empty string).
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for page in document:
                page_texts.append(page.get_text().replace('\n', ' '))
    except Exception as error:
        # Best-effort: report the problem and keep the pages read so far.
        print(f"Error extracting text from {pdf_path}: {error}")
    return "".join(page_texts)

def test_extract_text_from_pdf(pdf_path):
    """Print the text extracted from ``pdf_path`` for manual inspection.

    The extraction runs first, so any error message it prints appears before
    the header line.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    extracted = extract_text_from_pdf(pdf_path)
    print(f"Odczytany tekst z pliku {pdf_path}:", extracted, sep="\n")

def extract_text_after_keywords(pdf_path, keywords):
    """Return the PDF text from the first matching keyword onwards.

    Keywords are tried in the order given; the first one that occurs in the
    text wins, regardless of where in the text it appears. The returned
    string starts at that keyword's first occurrence (keyword included) and
    has surrounding whitespace stripped.

    Args:
        pdf_path: Path of the PDF file to read.
        keywords: Iterable of strings to search for, in priority order.

    Returns:
        The stripped tail of the text, or ``None`` when no keyword occurs in
        the text or the stripped tail is empty.
    """
    full_text = extract_text_from_pdf(pdf_path)

    for marker in keywords:
        position = full_text.find(marker)
        if position == -1:
            continue
        # Only the first keyword that matches is considered.
        tail = full_text[position:].strip()
        return tail or None

    return None

def extract_numbers_with_dot_or_space_dot(text):
    """Return the consecutive numbering ``"1."``, ``"2."``, ... found in ``text``.

    A number is recognised when it is a run of digits immediately followed by
    a dot, preceded by whitespace and followed by whitespace or the end of
    the text. Duplicates are ignored.

    Args:
        text: The string to scan.

    Returns:
        ``["1"]`` (without a dot) when the number 1 is not among the
        recognised numbers. Otherwise a list of strings ``"1."``, ``"2."``,
        ... covering the unbroken run of integers that starts at 1; the run
        stops at the first missing integer.
    """
    found = {
        int(token[:-1])  # drop the trailing dot
        for token in re.findall(r'(?<=\s)\d+\.(?=\s|$)', text)
    }

    if 1 not in found:
        return ["1"]

    # Walk upwards from 1 instead of iterating over the sorted values, so a
    # stray match below 1 (e.g. "0.") cannot cut the run short before it
    # has even started.
    sequence = []
    expected = 1
    while expected in found:
        sequence.append(f"{expected}.")
        expected += 1
    return sequence

def process_pdfs_in_folder(folder_path, keywords):
    """Extract the claim numbering from every PDF file in a folder.

    For each file whose name ends in ``.pdf`` (compared case-insensitively),
    the text following the first matching keyword is scanned for a
    consecutive numbering. Files in which no keyword is found get the
    default value ``["1"]``.

    Args:
        folder_path: Directory whose entries are listed (not recursive).
        keywords: Keywords passed on to ``extract_text_after_keywords``.

    Returns:
        A list with one dict per PDF, holding the file name under ``"Plik"``
        and the list of numbers under ``"Wyodrębnione liczby"``, ordered by
        file name.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort them so repeated
    # runs produce the rows in the same order.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so that files such as
        # "DOC.PDF" are not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        print(f"Przetwarzanie pliku: {filename}")

        extracted_text = extract_text_after_keywords(pdf_path, keywords)
        if extracted_text:
            numbers_with_dot = extract_numbers_with_dot_or_space_dot(extracted_text)
        else:
            # No keyword found: fall back to the same default the number
            # extractor uses when it cannot find a "1.".
            numbers_with_dot = ["1"]

        results.append({"Plik": filename, "Wyodrębnione liczby": numbers_with_dot})

    return results

def save_results_to_excel(results, folder_path):
    """Write ``results`` to ``results.xlsx`` inside ``folder_path``.

    If the workbook already exists, its rows are read back and the new rows
    are appended after them; otherwise the workbook is created from the new
    rows alone. The file is written without the index column and a
    confirmation message is printed.

    Args:
        results: Row data accepted by ``pd.DataFrame`` (here, a list of dicts).
        folder_path: Directory in which ``results.xlsx`` is read and written.
    """
    output_path = os.path.join(folder_path, "results.xlsx")
    new_rows = pd.DataFrame(results)

    if not os.path.exists(output_path):
        table = new_rows
    else:
        previous_rows = pd.read_excel(output_path)
        table = pd.concat([previous_rows, new_rows], ignore_index=True)

    table.to_excel(output_path, index=False)
    print(f"Zapisano dane do pliku: {output_path}")

# Folder that is scanned for PDF files; results.xlsx is written here as well.
folder_path = "D:/AA Praktyki/Farmaceutyczne_patenty"

# Default keywords, tried in this order. The letter-spaced variants are
# listed as separate entries after the plain headings.
keywords = [
    "Zastrzeżenia patentowe",
    "Zastrzeżenie patentowe",
    " Z a s t r z e ż e n i a  p a t e n t o w e",
    " Z a s t r z e ż e n i e  p a t e n t o w e",
]

results = process_pdfs_in_folder(folder_path, keywords)
save_results_to_excel(results, folder_path)
