-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
249 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import PyPDF2 | ||
import sqlite3 | ||
import csv | ||
import os | ||
|
||
class PDFQuestionParser: | ||
def __init__(self, pdf_path, database_name): | ||
self.pdf_path = pdf_path | ||
self.database_name = database_name | ||
self.qa_data = [] | ||
|
||
def extract_qa_from_pdf(self): | ||
with open(self.pdf_path, "rb") as pdf_file: | ||
pdf_reader = PyPDF2.PdfFileReader(pdf_file) | ||
num_pages = pdf_reader.numPages | ||
current_question = "" | ||
current_answers = {} | ||
current_answer_prefix = "" | ||
for page_num in range(num_pages): | ||
page = pdf_reader.getPage(page_num) | ||
text = page.extractText() | ||
lines = text.split('\n') | ||
for line in lines: | ||
if line.startswith("QUESTION:"): | ||
if current_question: | ||
self.qa_data.append((current_question, current_answers)) | ||
current_question = line[10:].strip() | ||
current_answers = {} | ||
elif current_question and line.startswith("Answer:"): | ||
current_answer_prefix = line[7:8] | ||
current_answer = line[9:].strip() | ||
current_answers[current_answer_prefix] = current_answer | ||
elif current_answer_prefix and line.startswith(current_answer_prefix): | ||
current_answer += " " + line.strip() | ||
if current_question: | ||
self.qa_data.append((current_question, current_answers)) | ||
|
||
def create_database(self): | ||
conn = sqlite3.connect(self.database_name) | ||
cursor = conn.cursor() | ||
|
||
cursor.execute(""" | ||
CREATE TABLE IF NOT EXISTS questions ( | ||
id INTEGER PRIMARY KEY, | ||
question TEXT, | ||
answer_a TEXT, | ||
answer_b TEXT, | ||
answer_c TEXT, | ||
answer_d TEXT | ||
) | ||
""") | ||
|
||
conn.commit() | ||
conn.close() | ||
|
||
def insert_qa_into_database(self): | ||
conn = sqlite3.connect(self.database_name) | ||
cursor = conn.cursor() | ||
|
||
for question, answers in self.qa_data: | ||
cursor.execute( | ||
"INSERT INTO questions (question, answer_a, answer_b, answer_c, answer_d) VALUES (?, ?, ?, ?, ?)", | ||
(question, answers.get("a", ""), answers.get("b", ""), answers.get("c", ""), answers.get("d", "")) | ||
) | ||
|
||
conn.commit() | ||
conn.close() | ||
|
||
def export_database_to_csv(self, csv_file_path): | ||
conn = sqlite3.connect(self.database_name) | ||
cursor = conn.cursor() | ||
|
||
cursor.execute("SELECT * FROM questions") | ||
rows = cursor.fetchall() | ||
|
||
conn.close() | ||
|
||
with open(csv_file_path, "w", newline="") as csv_file: | ||
csv_writer = csv.writer(csv_file) | ||
csv_writer.writerow(["ID", "Question", "Answer A", "Answer B", "Answer C", "Answer D"]) | ||
|
||
for row in rows: | ||
csv_writer.writerow(row) | ||
|
||
def display_qa(self): | ||
conn = sqlite3.connect(self.database_name) | ||
cursor = conn.cursor() | ||
|
||
cursor.execute("SELECT * FROM questions") | ||
rows = cursor.fetchall() | ||
|
||
conn.close() | ||
|
||
for row in rows: | ||
_, question, answer_a, answer_b, answer_c, answer_d = row | ||
print("Question:", question) | ||
print("Answers:") | ||
print("A. ", answer_a) | ||
print("B. ", answer_b) | ||
print("C. ", answer_c) | ||
print("D. ", answer_d) | ||
input("Press Enter to continue...") | ||
|
||
def run(self): | ||
while True: | ||
print("Menu:") | ||
print("1. Extract and store QA from PDF") | ||
print("2. Count items in the database and store count in a file") | ||
print("3. Export database to CSV") | ||
print("4. Display QA from the database") | ||
print("5. Exit") | ||
|
||
choice = input("Enter your choice: ") | ||
|
||
if choice == "1": | ||
self.extract_qa_from_pdf() | ||
self.create_database() | ||
self.insert_qa_into_database() | ||
print("Questions and answers extracted and stored in the database.") | ||
elif choice == "2": | ||
# Implementation for counting items and storing count | ||
elif choice == "3": | ||
csv_file_path = input("Enter the path for the CSV export: ") | ||
csv_file_path = os.path.normpath(csv_file_path) | ||
self.export_database_to_csv(csv_file_path) | ||
print(f"Database exported to '{csv_file_path}'.") | ||
elif choice == "4": | ||
self.display_qa() | ||
elif choice == "5": | ||
print("Exiting the application.") | ||
break | ||
else: | ||
print("Invalid choice. Please choose again.") | ||
|
||
def main(): | ||
print("PDF Parsing and Scanning Application") | ||
|
||
pdf_path = input("Enter the path to the PDF file: ") | ||
pdf_path = os.path.normpath(pdf_path) | ||
if not os.path.exists(pdf_path): | ||
print("Error: The provided PDF file does not exist.") | ||
return | ||
|
||
database_name = "qa_database.db" | ||
|
||
parser = PDFQuestionParser(pdf_path, database_name) | ||
|
||
if os.path.exists(database_name): | ||
database_size = os.path.getsize(database_name) | ||
print(f"Database '{database_name}' found, size: {database_size} bytes") | ||
else: | ||
print("No database found. Proceeding to application.") | ||
|
||
parser.run() | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
jknox@rbsul22.54195:1692568345 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import os | ||
import sqlite3 | ||
import PyPDF4 | ||
|
||
def create_database(database_name): | ||
conn = sqlite3.connect(database_name) | ||
cursor = conn.cursor() | ||
|
||
cursor.execute(""" | ||
CREATE TABLE IF NOT EXISTS questions ( | ||
id INTEGER PRIMARY KEY, | ||
question_text TEXT | ||
) | ||
""") | ||
|
||
conn.commit() | ||
conn.close() | ||
|
||
def insert_question_into_database(database_name, question_text): | ||
conn = sqlite3.connect(database_name) | ||
cursor = conn.cursor() | ||
|
||
cursor.execute("INSERT INTO questions (question_text) VALUES (?)", (question_text,)) | ||
|
||
conn.commit() | ||
conn.close() | ||
|
||
def find_questions_in_directory(directory, database_name): | ||
pdf_files = [file for file in os.listdir(directory) if file.lower().endswith('.pdf')] | ||
|
||
for pdf_file in pdf_files: | ||
pdf_path = os.path.join(directory, pdf_file) | ||
with open(pdf_path, 'rb') as pdf_file: | ||
pdf_reader = PyPDF4.PdfFileReader(pdf_file) | ||
for page_num in range(len(pdf_reader.pages)): | ||
text = pdf_reader.pages[page_num].extractText() | ||
print(".", end='', flush=True) | ||
if "QUESTION:" in text: | ||
question_text = text.split("QUESTION:")[1].split("A.")[0].strip() | ||
insert_question_into_database(database_name, question_text) | ||
print("Q: " + question_text) | ||
break | ||
|
||
def main(): | ||
data_directory = os.path.expanduser("~/data/src/parse/") | ||
database_name = "question_database.db" | ||
|
||
print(f"Checking PDFs in {data_directory} for questions labeled as 'QUESTION:'") | ||
|
||
create_database(database_name) | ||
find_questions_in_directory(data_directory, database_name) | ||
|
||
print("Questions stored in the database.") | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
import PyPDF4 | ||
|
||
def find_questions_in_directory(directory): | ||
pdf_files = [file for file in os.listdir(directory) if file.lower().endswith('.pdf')] | ||
question_files = [] | ||
|
||
for pdf_file in pdf_files: | ||
pdf_path = os.path.join(directory, pdf_file) | ||
with open(pdf_path, 'rb') as pdf_file: | ||
pdf_reader = PyPDF4.PdfReader(pdf_file) | ||
for page_num in range(len(pdf_reader.pages)): | ||
text = pdf_reader.pages[page_num].extract_text() | ||
if "QUESTION:" in text: | ||
question_files.append(pdf_file.name) | ||
break | ||
|
||
return question_files | ||
|
||
def main(): | ||
data_directory = os.path.expanduser("~/data/src/parse/") | ||
|
||
print("Checking PDFs for questions labeled as 'QUESTION:'") | ||
|
||
question_files = find_questions_in_directory(data_directory) | ||
|
||
if question_files: | ||
print("Question files found:") | ||
for file in question_files: | ||
print(file) | ||
else: | ||
print("No question files found.") | ||
|
||
if __name__ == "__main__": | ||
main() |