In [8]:
import json
from bs4 import BeautifulSoup
import re
import shutil
import os

In [9]:
# Folders: HTML quiz exports waiting to be processed, and where they go afterwards.
to_process_directory = "../Study questions to process"
processed_directory = "../Study Questions processed"

# JSON database files: the raw extraction output and the refined output.
default_raw_db_path = "../study_quiz_raw_questions_db.json"
default_processed_db_path = "../study_quiz_processed_questions_db.json"

# Create an empty raw database on first run so that later reads find a valid file.
if not os.path.exists(default_raw_db_path):
    with open(default_raw_db_path, 'w', encoding='utf-8') as f:
        json.dump({"quizzes": []}, f)


In [10]:
def clean_text(text):
    """Return ``text`` with tag-like markup removed and whitespace collapsed.

    Steps, in order:
      1. Anything matching ``<...>`` is deleted.
      2. The entities ``&nbsp;``, ``&lt;`` and ``&gt;`` are replaced by a
         space, ``<`` and ``>`` respectively (after tag removal, so decoded
         angle brackets are kept).
      3. Every run of whitespace, newlines included, becomes one space.
      4. Leading and trailing whitespace is stripped.

    A falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""

    # Strip tags first so that entities decoded below are not mistaken for tags.
    without_tags = re.sub(r'<[^>]+>', '', text)

    decoded = without_tags
    for entity, replacement in (('&nbsp;', ' '), ('&lt;', '<'), ('&gt;', '>')):
        decoded = decoded.replace(entity, replacement)

    # A single whitespace collapse also covers newlines and newline+indent runs.
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

def _extract_question_text(q_holder):
    """Return the cleaned question text found inside one ``question_holder`` div."""
    q_text = q_holder.find("div", class_="original_question_text")
    if q_text and q_text.text.strip():
        return clean_text(q_text.text)

    q_display = q_holder.find("div", class_="display_question")
    if not q_display:
        return ""
    # Sometimes the question text is in aria-label; otherwise fall back to the div body.
    return clean_text(q_display.get("aria-label", "")) or clean_text(q_display.text)


def _extract_answer(ans_div):
    """Return the ``text`` / ``is_correct`` / ``comment`` dict for one ``answer`` div."""
    ans_text_div = ans_div.find("div", class_="answer_text")
    comment_div = ans_div.find("div", class_="quiz_comment")
    classes = ans_div.get("class", [])

    # An answer counts as correct when flagged by class or by a "correct" arrow span.
    is_correct = (
        "correct_answer" in classes
        or "correct" in classes
        or ans_div.find("span", class_="answer_arrow correct")
    )
    return {
        "text": clean_text(ans_text_div.text) if ans_text_div else "",
        "is_correct": bool(is_correct),
        "comment": clean_text(comment_div.text) if comment_div else None,
    }


def _find_general_comment(q_holder):
    """Return the first ``quiz_comment`` div that is not inside an answer div, or None."""
    for comment_div in q_holder.find_all("div", class_="quiz_comment"):
        if comment_div.find_parent("div", class_="answer") is None:
            return comment_div
    return None


def extract_quiz_to_json(html_path, title_prefix="Joshua Cheng's Quiz History: "):
    """Parse a saved quiz HTML page into a plain dict.

    Args:
        html_path: Path of the HTML file, read as UTF-8.
        title_prefix: Text removed from the page ``<title>`` to obtain the quiz
            title. Pass ``""`` or ``None`` to keep the title unchanged.

    Returns:
        ``{"quiz_title": str, "questions": [...]}`` where each question is a
        dict with keys ``question``, ``answers`` (list of dicts with ``text``,
        ``is_correct``, ``comment``), ``correct_answer`` and
        ``general_comment``.

    Notes:
        * ``correct_answer`` holds the text of the last answer flagged as
          correct, or ``None`` when no answer is flagged.
        * ``general_comment`` only considers ``quiz_comment`` divs that are not
          nested inside an ``answer`` div, so a per-answer comment is no longer
          reported as the question-level comment.
    """
    with open(html_path, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Quiz title comes from the <title> tag, minus the configured prefix.
    title_tag = soup.find("title")
    quiz_title = clean_text(title_tag.text) if title_tag else ""
    if title_prefix:
        quiz_title = quiz_title.replace(title_prefix, "")
    quiz_title = quiz_title.strip()

    questions = []
    for q_holder in soup.find_all("div", class_="question_holder"):
        answers = [
            _extract_answer(ans_div)
            for ans_div in q_holder.find_all("div", class_="answer")
        ]

        # Last flagged answer wins, matching the single-string shape of the field.
        correct_answer = None
        for ans in answers:
            if ans["is_correct"]:
                correct_answer = ans["text"]

        general_div = _find_general_comment(q_holder)
        questions.append({
            "question": _extract_question_text(q_holder),
            "answers": answers,
            "correct_answer": correct_answer,
            "general_comment": clean_text(general_div.text) if general_div is not None else None,
        })

    return {
        "quiz_title": quiz_title,
        "questions": questions,
    }

def moved_processed_file(file):
    """Move ``file`` from ``to_process_directory`` into ``processed_directory``.

    Args:
        file: File name (not a full path) located in ``to_process_directory``.

    The destination folder is created when missing; without that the move
    fails on a fresh checkout where the processed folder does not exist yet.
    """
    os.makedirs(processed_directory, exist_ok=True)
    source_path = os.path.join(to_process_directory, file)
    destination_path = os.path.join(processed_directory, file)
    shutil.move(source_path, destination_path)

def get_processed_quizes(db_path=default_raw_db_path, debug=0):
    """Return the titles of all quizzes stored in the JSON database at ``db_path``.

    Args:
        db_path: Path of the JSON database file, read as UTF-8.
        debug: When truthy, the title list is also printed.

    Returns:
        List of ``quiz_title`` values, in file order; empty when the file has
        no ``"quizzes"`` key.
    """
    with open(db_path, 'r', encoding='utf-8') as f:
        db = json.load(f)

    quizzes = []
    for quiz in db.get("quizzes", []):
        quizzes.append(quiz["quiz_title"])

    if debug:
        print(f"Processed quizzes: {quizzes}")
    return quizzes

def insert_json_quiz_into_db(quiz_data, db_path=default_raw_db_path):
    """Append ``quiz_data`` to the JSON database, renaming it if its title is taken.

    Args:
        quiz_data: Dict with at least a ``"quiz_title"`` key. Its title is
            updated in place when a numeric suffix has to be added.
        db_path: Path of the JSON database file; created with an empty quiz
            list when it does not exist.

    Returns:
        ``True`` once the quiz has been written (there is no skip path).

    If the title already exists, ``_2``, ``_3``, ... is appended until the
    title is unused, so inserting the same quiz twice stores two entries.
    """
    if not os.path.exists(db_path):
        with open(db_path, 'w', encoding='utf-8') as f:
            json.dump({"quizzes": []}, f)

    # Read current database
    with open(db_path, 'r', encoding='utf-8') as f:
        db = json.load(f)

    # setdefault: a database file without a "quizzes" key is treated as empty
    # for both the title check and the append, instead of raising KeyError.
    quizzes = db.setdefault("quizzes", [])

    # Check for existing titles and increment suffix if needed
    base_title = quiz_data["quiz_title"]
    existing_titles = [quiz["quiz_title"] for quiz in quizzes]
    new_title = base_title
    suffix = 2
    while new_title in existing_titles:
        new_title = f"{base_title}_{suffix}"
        suffix += 1
    quiz_data["quiz_title"] = new_title

    # Add new quiz to database
    quizzes.append(quiz_data)

    # Write updated database
    with open(db_path, 'w', encoding='utf-8') as f:
        json.dump(db, f, indent=2, ensure_ascii=False)

    print(f"Added quiz '{new_title}' to database.")
    return True

def process_file(file_name, source_directory=None):
    """Extract one quiz HTML file, store it in the raw database, then archive it.

    Args:
        file_name: Name of the HTML file (not a full path).
        source_directory: Folder that contains ``file_name``. Defaults to
            ``to_process_directory`` when omitted.

    After a successful insert the file is moved into ``processed_directory``
    (created if needed). The skip branch only runs if
    ``insert_json_quiz_into_db`` returns a falsy value.
    """
    if source_directory is None:
        source_directory = to_process_directory

    file_path = os.path.join(source_directory, file_name)
    quiz_data = extract_quiz_to_json(file_path)

    if insert_json_quiz_into_db(quiz_data):
        os.makedirs(processed_directory, exist_ok=True)
        shutil.move(file_path, os.path.join(processed_directory, file_name))
    else:
        print(f"Skipped {file_name}, quiz already in database.")


def process_directory(folder_path=to_process_directory):
    """Run ``process_file`` on every ``.html`` file found in ``folder_path``.

    Args:
        folder_path: Folder to scan. The same folder is passed on to
            ``process_file`` so files are read from where they were listed.

    Files are handled in sorted name order so that the insert order does not
    depend on the arbitrary order returned by ``os.listdir``.
    """
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".html"):
            process_file(file, folder_path)

In [11]:
# List the quiz titles currently stored in the raw database (debug=1 also prints them).
get_processed_quizes(debug=1)

Processed quizzes: []


[]

In [12]:
# Process every .html file in the to-process folder: extract it, insert it into
# the raw database, then move the file to the processed folder.
process_directory()

Added quiz 'Quiz 1 (due Thursday, Sept. 11 at 11:59 PM): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'Quiz 2 (due Thursday, Sept. 11 at 11:59 PM): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1a level 1a (course resources and policies): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1a level 1b (course resources and policies): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1a level 1c (course resources and policies): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1a level 1d (course resources and policies): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1a level 2 (course resources and policies): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ topic 1b level 1a (Chapter 1): BIOL 1009 (070-082) General Biology (Fall 2025)' to database.
Added quiz 'SQ 

In [13]:
def refine_quiz_db(raw_db):
    """Build the refined quiz list from a raw database dict.

    Each distinct (stripped) question text gets an integer id, numbered from 1
    in order of first appearance across all quizzes. A question is marked
    ``unique`` only at its first appearance; later repeats reuse the id with
    ``unique`` set to ``False``.

    Args:
        raw_db: Dict with a ``"quizzes"`` list; each quiz has ``quiz_title``
            and ``questions``; each question has ``question`` and ``answers``.

    Returns:
        List of dicts with keys ``quiz_title``, ``questions`` (each holding
        ``question``, ``unique``, ``id``, ``options``) and ``unique_questions``
        (number of questions first seen in that quiz).
    """
    ids_by_text = {}
    next_id = 1
    refined = []

    for raw_quiz in raw_db["quizzes"]:
        title = raw_quiz["quiz_title"]
        refined_questions = []
        first_seen_here = 0

        for raw_question in raw_quiz["questions"]:
            text = raw_question["question"].strip()

            # Register the text the first time it is encountered.
            is_new = text not in ids_by_text
            if is_new:
                ids_by_text[text] = next_id
                next_id += 1
                first_seen_here += 1

            choices = [
                {
                    "text": answer["text"],
                    "is_correct": answer["is_correct"],
                    "comment": answer.get("comment"),
                }
                for answer in raw_question["answers"]
            ]

            refined_questions.append({
                "question": text,
                "unique": is_new,
                "id": ids_by_text[text],
                "options": choices,
            })

        refined.append({
            "quiz_title": title,
            "questions": refined_questions,
            "unique_questions": first_seen_here,
        })

    return refined

# Load the raw database, refine it, and write the result to the processed database file.
with open(default_raw_db_path, "r", encoding="utf-8") as f:
    raw_db = json.load(f)

refined_db = refine_quiz_db(raw_db)

with open(default_processed_db_path, mode="w", encoding="utf-8") as f:
    json.dump(refined_db, f, indent=2, ensure_ascii=False)

In [None]:
def print_quiz_titles_and_unique_questions_table(db_path="study_quiz_processed_questions_db.json"):
    """Print a two-column table of quiz titles and their unique-question counts.

    Args:
        db_path: Path of the processed database JSON file (a list of quiz
            dicts), read as UTF-8. The default is relative to the current
            working directory.
            NOTE(review): the default lacks the ``../`` prefix used by
            ``default_processed_db_path`` -- confirm which is intended.

    Titles are cut to 57 characters and padded to 60. A missing title prints
    as ``(no title)`` and a missing count as ``0``.
    """
    with open(db_path, encoding="utf-8") as db_file:
        quizzes = json.load(db_file)

    print(f"{'Quiz Title':<60} | {'Unique Questions':<15}")
    print("-" * 80)

    for entry in quizzes:
        shown_title = entry.get('quiz_title', '(no title)')[:57]
        unique_total = entry.get('unique_questions', 0)
        print(f"{shown_title:<60} | {unique_total:<15}")

# Print the summary table for the processed database file.
print_quiz_titles_and_unique_questions_table(db_path=default_processed_db_path)

Quiz Title                                                   | Unique Questions
--------------------------------------------------------------------------------
Quiz 1 (due Thursday, Sept. 11 at 11:59 PM): BIOL 1009 (0    | 28             
Quiz 2 (due Thursday, Sept. 11 at 11:59 PM): BIOL 1009 (0    | 25             
SQ topic 1a level 1a (course resources and policies): BIO    | 0              
SQ topic 1a level 1b (course resources and policies): BIO    | 0              
SQ topic 1a level 1c (course resources and policies): BIO    | 0              
SQ topic 1a level 1d (course resources and policies): BIO    | 0              
SQ topic 1a level 2 (course resources and policies): BIOL    | 0              
SQ topic 1b level 1a (Chapter 1): BIOL 1009 (070-082) Gen    | 7              
SQ topic 1b level 1b (Chapter 1): BIOL 1009 (070-082) Gen    | 5              
SQ topic 1b level 1c (Chapter 1): BIOL 1009 (070-082) Gen    | 1              
SQ topic 1b level 1d (Chapter 1): BIOL 1009 (070-

: 