In [None]:
import os
import markdown
import json
import yaml
from xml.etree import ElementTree as ET
import csv
import PyPDF2

def test_llm(data, model):
    # Dummy function to test LLM
    return "Success"

class PDFConverter:
    """Convert the text of every PDF in a directory into several text formats.

    Each PDF is read page by page with PyPDF2; the extracted text is then
    written out as Markdown, JSON, YAML, XML and CSV, one sub-directory per
    format below ``output_dir``.
    """

    def __init__(self, input_dir, output_dir):
        """Store the source and destination directories.

        Args:
            input_dir: Directory that is scanned for PDF files.
            output_dir: Root directory that receives the per-format folders.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_pdf_files(self):
        """Return the sorted names of the PDF files directly in ``input_dir``.

        The extension is matched case-insensitively (``.pdf``, ``.PDF``),
        entries that are not regular files are skipped, and the result is
        sorted so the processing order does not depend on the OS listing order.

        Returns:
            A list of file names (not full paths).
        """
        return sorted(
            name
            for name in os.listdir(self.input_dir)
            if name.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(self.input_dir, name))
        )

    def extract_text_from_pdf(self, pdf_path):
        """Read a PDF and return its text keyed by page label.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            A dict mapping ``"Page 1"``, ``"Page 2"``, ... (in page order) to
            the text extracted from that page.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` turns a falsy extraction result (e.g. None) into an
            # empty string so every converter can rely on getting a str.
            return {
                f'Page {number}': page.extract_text() or ""
                for number, page in enumerate(reader.pages, start=1)
            }

    def convert_to_markdown(self, pages):
        """Render ``pages`` as Markdown with one ``##`` heading per page.

        Args:
            pages: Mapping of page label to page text.

        Returns:
            The Markdown document as a string; sections are separated by a
            blank line.
        """
        return "\n\n".join(f"## {page_title}\n\n{text}" for page_title, text in pages.items())

    def convert_to_json(self, pages):
        """Serialize ``pages`` as a JSON object indented by four spaces."""
        return json.dumps(pages, indent=4)

    def convert_to_yaml(self, pages):
        """Serialize ``pages`` as a YAML mapping, keeping the page order.

        ``sort_keys=False`` is required because PyYAML sorts mapping keys by
        default, which would place "Page 10" before "Page 2".
        """
        return yaml.dump(pages, sort_keys=False)

    def convert_to_csv(self, pages):
        """Render ``pages`` as CSV with one ``label,text`` row per page.

        Fields are quoted by the ``csv`` module when they contain commas,
        quotes or line breaks, so arbitrary page text round-trips through a
        CSV reader. A ``None`` text is written as an empty field.

        Args:
            pages: Mapping of page label to page text.

        Returns:
            The CSV text with rows separated by ``"\\n"`` and no trailing
            newline; an empty string when ``pages`` is empty.
        """
        import io  # local import: only this method needs an in-memory buffer

        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerows(pages.items())
        # Drop the terminator after the last row so simple inputs produce the
        # same text as a plain "\n".join of the rows. A field that itself ends
        # in a newline is quoted, so only the row terminator is removed.
        return buffer.getvalue().rstrip("\n")

    def convert_to_xml(self, pages):
        """Render ``pages`` as an XML document.

        The root element is ``<Document>``; each page becomes a ``<Page>``
        element whose ``name`` attribute is the page label and whose
        ``<Content>`` child holds the page text.

        Returns:
            The XML as a ``str`` (no XML declaration).
        """
        root = ET.Element("Document")
        for page_title, text in pages.items():
            page_element = ET.SubElement(root, "Page", name=page_title)
            content = ET.SubElement(page_element, "Content")
            content.text = text
        return ET.tostring(root, encoding='unicode', method='xml')

    def save_file(self, directory, filename, content):
        """Write ``content`` to ``directory/filename``, creating the directory.

        The file is always written as UTF-8 so that non-ASCII text from a PDF
        does not fail on platforms whose default encoding cannot represent it.

        Args:
            directory: Target directory; created (with parents) if missing.
            filename: Name of the file inside ``directory``.
            content: Text to write.
        """
        full_path = os.path.join(directory, filename)
        os.makedirs(directory, exist_ok=True)
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(content)

    def process_files(self):
        """Convert every PDF in ``input_dir`` and save all output formats.

        For each PDF the text is extracted once and then written to
        ``output_dir/<Format>/<pdf name without extension>.<ext>`` for the
        Markdown, JSON, YAML, XML and CSV formats, in that order.
        """
        # (sub-directory, file extension, converter) for each output format.
        formats = (
            ("Markdown", "md", self.convert_to_markdown),
            ("JSON", "json", self.convert_to_json),
            ("YAML", "yaml", self.convert_to_yaml),
            ("XML", "xml", self.convert_to_xml),
            ("CSV", "csv", self.convert_to_csv),
        )
        for pdf_file in self.get_pdf_files():
            pdf_path = os.path.join(self.input_dir, pdf_file)
            filename = os.path.splitext(pdf_file)[0]
            pages = self.extract_text_from_pdf(pdf_path)

            for folder, extension, convert in formats:
                self.save_file(
                    os.path.join(self.output_dir, folder),
                    f"{filename}.{extension}",
                    convert(pages),
                )

def main():
    """Convert every PDF found in ``PDF/`` and write the results to ``output/``."""
    PDFConverter("PDF/", "output/").process_files()

if __name__ == "__main__":
    # The conversion run is currently disabled (the call is commented out);
    # uncomment it to run the conversion when this file is executed directly.
    # main()
    pass  # keeps the otherwise comment-only `if` body syntactically valid