In [18]:
import os
import fitz  # PyMuPDF
import pandas as pd

In [24]:
# Function to read PDF and convert to text
def read_pdf(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order, or "" if opening
        or reading the file raised an exception (the error is printed).
    """
    try:
        with fitz.open(file_path) as doc:
            # Collect the per-page text first, then glue it together once.
            page_texts = [doc[idx].get_text() for idx in range(doc.page_count)]
            return "".join(page_texts)
    except Exception as err:
        # Best-effort: report the failure and let the caller treat it as empty.
        print(f"Error reading {file_path}: {err}")
        return ""

In [20]:
# Understand the Data (analyze PDF files)
def analyze_pdf_structure(directory):
    """Walk a directory tree and extract the text of every PDF found in it.

    Args:
        directory: Root directory to search recursively.

    Returns:
        dict mapping each PDF's path to its extracted text. PDFs for which
        read_pdf returns an empty string are reported and left out. An empty
        dict is returned when the directory does not exist or holds no PDFs.
    """
    file_structure = {}
    # os.walk silently yields nothing for a missing path, so report it
    # explicitly instead of leaving the caller with an unexplained empty result.
    if not os.path.isdir(directory):
        print(f"Directory not found: {directory}")
        return file_structure
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            content = read_pdf(file_path)
            if content:
                file_structure[file_path] = content
            else:
                print(f"No content found in {file_path}")
    return file_structure

In [21]:
# Load and Preprocess the Text
def load_and_preprocess(pdf_structure):
    """Split each document's text into lines and tabulate the result.

    Args:
        pdf_structure: Mapping of file path -> extracted text.

    Returns:
        DataFrame with columns file_path, lines (list of str) and line_count,
        one row per mapping entry; None (after printing a notice) when the
        mapping has no entries.
    """
    split_docs = ((path, text.split('\n')) for path, text in pdf_structure.items())
    records = [
        {"file_path": path, "lines": doc_lines, "line_count": len(doc_lines)}
        for path, doc_lines in split_docs
    ]
    if not records:
        print("No data to preprocess")
        return None
    return pd.DataFrame(records)


In [25]:
# Chunk the text data
def chunk_code(preprocessed_code, chunk_size=50):
    """Split each document's lines into consecutive fixed-size chunks.

    Args:
        preprocessed_code: DataFrame with a 'file_path' column and a 'lines'
            column holding a list of strings per row. None or an empty frame
            yields an empty result.
        chunk_size: Maximum number of lines per chunk; must be positive.

    Returns:
        DataFrame with columns file_path, chunk_start, chunk_end and
        chunk_content. chunk_start is inclusive and chunk_end exclusive
        (line offsets within the document); chunk_content is the chunk's
        lines joined with newlines. The columns are present even when there
        are no chunks, so downstream column access keeps working.

    Raises:
        ValueError: If chunk_size is not positive.
    """
    columns = ["file_path", "chunk_start", "chunk_end", "chunk_content"]
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no chunks at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if preprocessed_code is None or preprocessed_code.empty:
        return pd.DataFrame(columns=columns)

    code_chunks = []
    # Only two columns are needed, so iterate them directly rather than
    # materialising a Series per row with iterrows().
    for file_path, lines in zip(preprocessed_code["file_path"], preprocessed_code["lines"]):
        for start in range(0, len(lines), chunk_size):
            chunk = lines[start:start + chunk_size]
            code_chunks.append({
                "file_path": file_path,
                "chunk_start": start,
                # The last chunk may be shorter than chunk_size.
                "chunk_end": start + len(chunk),
                "chunk_content": "\n".join(chunk)
            })
    return pd.DataFrame(code_chunks, columns=columns)

In [26]:
# Example usage
# The data directory can be overridden with the PDF_DATA_DIR environment
# variable so the notebook is not tied to one machine's absolute path; the
# original location remains the default.
pdf_directory = os.environ.get(
    "PDF_DATA_DIR",
    "/Users/ashwinikumar/AI_Bootcamp/Student_AI_repos/final_project/data",
)
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# Debug statement to check pdf_structure: report how many files were read
# rather than dumping the full text of every PDF into the cell output.
print(f"PDF Structure: {len(pdf_structure)} file(s) with text found under {pdf_directory}")

preprocessed_pdf = load_and_preprocess(pdf_structure)

# Debug statement to check preprocessed_pdf
print(f"Preprocessed PDF: {preprocessed_pdf}")

# load_and_preprocess returns None when no PDF text was found, so only chunk
# when there is something to chunk.
if preprocessed_pdf is not None:
    pdf_chunks = chunk_code(preprocessed_pdf)
    print(pdf_chunks.head())
else:
    print("Preprocessed PDF data is None")

Preprocessed PDF: None
Preprocessed PDF data is None
