In [None]:
%pip install python-docx

In [None]:
import os

# Folder that is scanned for the source .docx files
input_folder = 'C:\\REFACTOR\\docx_input'

# Folder that receives the cleaned CSV exports
output_folder = 'C:\\REFACTOR\\docx_output'

# Make sure both folders exist (exist_ok=True makes this a no-op when they already do)
for folder in (input_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

# Echo the configured locations
print(f"Input folder: {input_folder}")
print(f"Output folder: {output_folder}")


In [None]:
import glob
import os

# Collect the .docx files sitting directly inside input_folder
# (input_folder comes from the setup cell; the pattern is not recursive).
docx_pattern = os.path.join(input_folder, '*.docx')
docx_files = glob.glob(docx_pattern)

# Report what was found, one path per line
print(f"Found {len(docx_files)} DOCX files:")
for docx_file in docx_files:
    print(docx_file)

In [None]:
# Skeleton of the per-file loop: for now it only announces each file.
# Real processing is added by the later cells.
if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
else:
    print("No DOCX files found. Nothing to process.")

In [None]:
from docx import Document

# Smoke test: try to open every discovered file with python-docx and report the outcome.
if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        try:
            document = Document(docx_file)
            print(f"Successfully opened {docx_file}")
        except Exception as e:
            # Keep going with the remaining files; just report the one that failed
            print(f"Error opening {docx_file}: {e}")
else:
    print("No DOCX files found. Nothing to process.")

In [None]:
from docx import Document

# Rebuild the file list from input_folder (same pattern as the discovery cell)
docx_files = glob.glob(os.path.join(input_folder, '*.docx'))

# For each file, report whether any paragraph mentions "COURSE OUTLINE".
if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        try:
            document = Document(docx_file)

            # Case-insensitive search; stops at the first matching paragraph
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            if course_outline_found:
                print(f"Found 'COURSE OUTLINE' section in {docx_file}")
            else:
                print(f"'COURSE OUTLINE' section not found in {docx_file}")

        except Exception as e:
            print(f"Error processing {docx_file}: {e}")
else:
    print("No DOCX files found. Cannot process tables.")

In [None]:
import re # Need re for cleaning in a later step, but including it here for the overall process flow
from docx import Document

# Uses docx_files from the discovery cell.
# Exploratory pass: print the header row of every table and flag the "main" course
# table, i.e. one whose first row mentions at least two of the target column names.

if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        try:
            document = Document(docx_file)
            # Header keywords to look for (partial, case-insensitive matches; "CO aligned to" is excluded)
            target_table_names = ["Learning Outcomes", "Deliverables", "Assessment", "Instructional Materials"]
            extracted_tables = {}  # name -> table object

            # Does any paragraph mention the heading? (stops at the first hit)
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            if not course_outline_found:
                print(f"'COURSE OUTLINE' section not found in {docx_file}")
            else:
                print(f"Found 'COURSE OUTLINE' section in {docx_file}")
                print(f"  Examining {len(document.tables)} tables in document...")

                for i, table in enumerate(document.tables):
                    # Nothing to inspect without a first row that has cells
                    if not (table.rows and table.rows[0].cells):
                        continue

                    # Text of every header cell, used to recognise the table layout
                    first_row_all_text = [cell.text.strip() for cell in table.rows[0].cells]
                    full_first_row = " | ".join(first_row_all_text)
                    print(f"    Table {i+1} first row: {full_first_row}")

                    # Target names that appear in at least one header cell (each listed once)
                    targets_found_in_table = [
                        target_name
                        for target_name in target_table_names
                        if any(target_name.lower() in cell_text.lower() for cell_text in first_row_all_text)
                    ]

                    if len(targets_found_in_table) >= 2:  # If table contains 2+ target columns
                        print(f"      *** MAIN TABLE FOUND: Contains {targets_found_in_table}")
                        extracted_tables["Main_Course_Table"] = table
                        # Per-target aliases pointing at the same table, kept for compatibility
                        extracted_tables.update(dict.fromkeys(targets_found_in_table, table))

                if extracted_tables:
                    print(f"Extracted {len(extracted_tables)} target table references from {docx_file}.")
                    print(f"  Found tables: {list(extracted_tables.keys())}")
                else:
                    print(f"No target tables found in the 'COURSE OUTLINE' section of {docx_file}")

        except Exception as e:
            print(f"Error processing {docx_file}: {e}")

        # Debugging aid: only the first file is examined, then the loop stops
        break
else:
    print("No DOCX files found. Cannot process tables.")

In [None]:
from docx import Document

# Uses docx_files from the discovery cell.
# For documents that mention "COURSE OUTLINE", gather the document's tables so that
# later steps can filter them.
# NOTE(review): every entry of document.tables is collected -- nothing here checks that
# a table is positioned after the heading, despite the variable name and the messages.

if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        try:
            document = Document(docx_file)
            tables_after_course_outline = []  # tables gathered for this file

            # Case-insensitive heading search; stops at the first matching paragraph
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            if course_outline_found:
                print(f"Found 'COURSE OUTLINE' section in {docx_file}")

                # Keep every table; a running count is printed as each one is added
                for table in document.tables:
                    tables_after_course_outline.append(table)
                    print(f"  Found a table after 'COURSE OUTLINE'. Total tables found so far: {len(tables_after_course_outline)}")

                if tables_after_course_outline:
                    print(f"Found {len(tables_after_course_outline)} tables after 'COURSE OUTLINE' section in {docx_file}.")
                else:
                    print(f"No tables found after the 'COURSE OUTLINE' section in {docx_file}")
            else:
                print(f"'COURSE OUTLINE' section not found in {docx_file}")

        except Exception as e:
            print(f"Error processing {docx_file}: {e}")
else:
    print("No DOCX files found. Cannot process tables.")

In [None]:
import re
import os
from docx import Document

# Uses docx_files from the discovery cell.
# Dry run of the regex cleaning: tables whose first cell is exactly one of the target
# names are walked cell by cell and cleaned. The cleaned text is not stored or printed
# in this cell.

# Bullet glyphs that are rewritten as dashes
bullet_pattern = re.compile(r'[\u2022\u2023\u2043\u00B7]')
# Everything except letters, digits, whitespace, periods, commas and hyphens becomes a comma
special_char_pattern = re.compile(r'[^a-zA-Z0-9\s.,-]')

if docx_files:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        try:
            document = Document(docx_file)
            target_table_names = ["Learning Outcomes", "Deliverables Outcomes", "Assessment"]
            extracted_tables = {}

            # --- Locate the "COURSE OUTLINE" heading ---
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            # --- Pick tables by the exact text of their first cell ---
            if course_outline_found:
                for table in document.tables:
                    if not (table.rows and table.rows[0].cells):
                        continue
                    first_cell_text = table.rows[0].cells[0].text.strip()
                    # Only the first table seen for a given name is kept
                    if first_cell_text in target_table_names and first_cell_text not in extracted_tables:
                        extracted_tables[first_cell_text] = table

            if not course_outline_found:
                print(f"'COURSE OUTLINE' section not found in {os.path.basename(docx_file)}")
            elif not extracted_tables:
                print(f"No target tables found in the 'COURSE OUTLINE' section of {os.path.basename(docx_file)}")
            else:
                # --- Walk every cell (header rows included) and clean its text ---
                print("  Processing table data and applying regex cleaning:")
                for table_name, table in extracted_tables.items():
                    print(f"    Processing table: {table_name}")
                    for row in table.rows:
                        for cell in row.cells:
                            cell_text = cell.text
                            cleaned_text_bullets = bullet_pattern.sub('-', cell_text)
                            # Result is intentionally unused here; storing it is a later step
                            cleaned_text_final = special_char_pattern.sub(',', cleaned_text_bullets)

        except Exception as e:
            print(f"Error processing {os.path.basename(docx_file)}: {e}")
else:
    print("No DOCX files found. Cannot process tables.")

In [None]:
import re
import os
from docx import Document
# Requires docx_files from the discovery cell.
# For every document: find the main course-outline table, clean each cell, and
# collect one dict per data row (header row excluded) into all_files_combined_data.

# Combined cleaned rows from all files
all_files_combined_data = []

if not docx_files:
    print("No DOCX files found. Cannot process tables.")
else:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        current_file_extracted_data = []  # rows extracted from the current file only
        try:
            document = Document(docx_file)
            target_table_names = ["Learning Outcomes", "Deliverables", "Assessment", "Instructional Materials"]
            extracted_tables = {}

            # --- Locate the "COURSE OUTLINE" heading (case-insensitive, first hit wins) ---
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            # --- Identify the main table: first table whose header row mentions 2+ targets ---
            if course_outline_found:
                for table in document.tables:
                    if not (table.rows and table.rows[0].cells):
                        continue
                    first_row_all_text = [cell.text.strip() for cell in table.rows[0].cells]
                    targets_found_in_table = [
                        target_name
                        for target_name in target_table_names
                        if any(target_name.lower() in cell_text.lower() for cell_text in first_row_all_text)
                    ]
                    if len(targets_found_in_table) >= 2:
                        extracted_tables["Main_Course_Table"] = table
                        break  # Found the main table, stop looking

            if not course_outline_found:
                print(f"'COURSE OUTLINE' section not found in {os.path.basename(docx_file)}")
            elif not extracted_tables:
                print(f"No target tables found in the 'COURSE OUTLINE' section of {os.path.basename(docx_file)}")
            else:
                # --- Iterate through table data, clean it with regex, and store it ---
                table = extracted_tables["Main_Course_Table"]
                print(f"  Processing main course table with {len(table.rows)} rows")

                for i, row in enumerate(table.rows):
                    # Metadata columns shared by every extracted row
                    row_data = {
                        "File": os.path.basename(docx_file),  # original filename
                        "Table": "Course_Outline",
                        "Row": i + 1  # 1-based row number within the table
                    }
                    cleaned_cell_contents = []
                    for j, cell in enumerate(row.cells):
                        cell_text = cell.text

                        # First column of a data row is the Week value
                        is_week_value = j == 0 and i > 0
                        if is_week_value:
                            # Turn numeric ranges into comma lists ("4-5" / "4 – 5" -> "4,5") so
                            # Excel does not read them as dates. This has to happen BEFORE the
                            # generic cleaning below, which replaces every dash with a space.
                            cell_text = re.sub(r'(?<=\d)\s*[-\u2010-\u2015]\s*(?=\d)', ',', cell_text)

                        # Bullet glyphs become spaces
                        cleaned_text_bullets = re.sub(r'[\u2022\u2023\u2043\u00B7]', ' ', cell_text)

                        # Drop characters that can trigger Excel formula parsing (=, +, @, -)
                        cleaned_text_safe = re.sub(r'[=+@-]', ' ', cleaned_text_bullets)

                        # Keep only letters, digits, whitespace, periods and commas
                        cleaned_text_final = re.sub(r'[^a-zA-Z0-9\s.,]', ' ', cleaned_text_safe)

                        # Collapse runs of whitespace and trim
                        cleaned_text_final = re.sub(r'\s+', ' ', cleaned_text_final).strip()

                        if is_week_value:
                            # Drop the period that follows a leading week number (e.g. "1." -> "1")
                            cleaned_text_final = re.sub(r'^(\d+)\.', r'\1', cleaned_text_final)

                        cleaned_cell_contents.append(cleaned_text_final)

                    if i == 0:
                        # Header row: remember the (cleaned) column names for the data rows
                        column_headers = cleaned_cell_contents
                    else:
                        for k, cleaned_content in enumerate(cleaned_cell_contents):
                            if k < len(column_headers):
                                # Column key derived from the header text (non-word chars -> "_")
                                column_name = re.sub(r'[^a-zA-Z0-9_]', '_', column_headers[k].strip())
                                if not column_name:
                                    column_name = f"Column_{k+1}"

                                # Skip the "CO aligned to" column
                                if "CO" not in column_headers[k] or "aligned" not in column_headers[k]:
                                    row_data[column_name] = cleaned_content
                            else:
                                # More cells than headers: fall back to a positional name
                                row_data[f"Column_{k+1}"] = cleaned_content

                        # Only data rows are stored (the header row is skipped)
                        current_file_extracted_data.append(row_data)

            # File processed without error: add its rows to the combined list
            all_files_combined_data.extend(current_file_extracted_data)

        except Exception as e:
            print(f"Error processing {os.path.basename(docx_file)}: {e}")

In [None]:
# Summary of the extraction cell: report how many rows ended up in
# all_files_combined_data. The list is filled inside the per-file loop of the
# extraction cell; nothing is computed here.

# Treat a missing variable the same as an empty list
extracted_row_count = len(all_files_combined_data) if 'all_files_combined_data' in locals() else 0

if extracted_row_count:
    print(f"\nFinished processing all files. Total combined rows extracted: {len(all_files_combined_data)}")
else:
    print("\nFinished processing all files. No combined data was extracted.")

# Next step: write all_files_combined_data to a CSV file.

In [None]:
# responsible for saving the combined data to a CSV file

import pandas as pd
import os
import re # Need re for safe filename creation

# Needs all_files_combined_data (extraction cell) and, ideally, output_folder (setup cell).
# Writes every extracted row to one CSV file.

if all_files_combined_data:
    print("Saving combined cleaned data to a single CSV file.")
    try:
        # One DataFrame row per extracted dict
        df_combined = pd.DataFrame(all_files_combined_data)

        output_filename = "combined_cleaned_extracted_data.csv"

        # Use output_folder when it is defined and exists; otherwise fall back to the
        # current working directory.
        folder_is_usable = 'output_folder' in locals() and os.path.exists(output_folder)
        if folder_is_usable:
            output_filepath = os.path.join(output_folder, output_filename)
        else:
            print(f"Warning: output_folder not defined or does not exist. Saving to current directory: {output_filename}")
            output_filepath = output_filename

        # index=False keeps the DataFrame index out of the file
        df_combined.to_csv(output_filepath, index=False)
        print(f"Successfully saved the combined cleaned data to '{output_filepath}'")

    except Exception as e:
        print(f"An error occurred while saving the combined CSV: {e}")
else:
    print("No data was extracted. The combined CSV file will not be created.")

In [None]:
import pandas as pd
import os
import re
from docx import Document

# Requires docx_files from the discovery cell; output_folder from the setup cell is optional.
# For every document: extract and clean the main course-outline table (same rules as the
# combined extraction cell), then save that file's rows to their own CSV.

if not docx_files:
    print("No DOCX files found. Cannot save individual CSVs.")
else:
    for docx_file in docx_files:
        print(f"Processing file: {docx_file}")
        current_file_extracted_data = []  # rows extracted from the current file only

        # --- Data extraction and cleaning for the current file ---
        try:
            document = Document(docx_file)
            target_table_names = ["Learning Outcomes", "Deliverables", "Assessment", "Instructional Materials"]
            extracted_tables = {}

            # Locate the "COURSE OUTLINE" heading (case-insensitive, first hit wins)
            course_outline_found = any(
                "COURSE OUTLINE" in paragraph.text.strip().upper()
                for paragraph in document.paragraphs
            )

            # Main table = first table whose header row mentions 2+ target names
            if course_outline_found:
                for table in document.tables:
                    if not (table.rows and table.rows[0].cells):
                        continue
                    first_row_all_text = [cell.text.strip() for cell in table.rows[0].cells]
                    targets_found_in_table = [
                        target_name
                        for target_name in target_table_names
                        if any(target_name.lower() in cell_text.lower() for cell_text in first_row_all_text)
                    ]
                    if len(targets_found_in_table) >= 2:
                        extracted_tables["Main_Course_Table"] = table
                        break  # Found the main table, stop looking

            if not course_outline_found or not extracted_tables:
                print(f"  No 'COURSE OUTLINE' section or target tables found in {os.path.basename(docx_file)}. Skipping saving.")
            else:
                table = extracted_tables["Main_Course_Table"]
                print(f"  Processing main course table with {len(table.rows)} rows")

                for i, row in enumerate(table.rows):
                    # Metadata columns shared by every extracted row
                    row_data = {
                        "File": os.path.basename(docx_file),
                        "Table": "Course_Outline",
                        "Row": i + 1
                    }
                    cleaned_cell_contents = []
                    for j, cell in enumerate(row.cells):
                        cell_text = cell.text

                        # First column of a data row is the Week value
                        is_week_value = j == 0 and i > 0
                        if is_week_value:
                            # Turn numeric ranges into comma lists ("4-5" / "4 – 5" -> "4,5") so
                            # Excel does not read them as dates. This has to happen BEFORE the
                            # generic cleaning below, which replaces every dash with a space.
                            cell_text = re.sub(r'(?<=\d)\s*[-\u2010-\u2015]\s*(?=\d)', ',', cell_text)

                        # Bullet glyphs become spaces
                        cleaned_text_bullets = re.sub(r'[\u2022\u2023\u2043\u00B7]', ' ', cell_text)

                        # Drop characters that can trigger Excel formula parsing (=, +, @, -)
                        cleaned_text_safe = re.sub(r'[=+@-]', ' ', cleaned_text_bullets)

                        # Keep only letters, digits, whitespace, periods and commas
                        cleaned_text_final = re.sub(r'[^a-zA-Z0-9\s.,]', ' ', cleaned_text_safe)

                        # Collapse runs of whitespace and trim
                        cleaned_text_final = re.sub(r'\s+', ' ', cleaned_text_final).strip()

                        if is_week_value:
                            # Drop the period that follows a leading week number (e.g. "1." -> "1")
                            cleaned_text_final = re.sub(r'^(\d+)\.', r'\1', cleaned_text_final)

                        cleaned_cell_contents.append(cleaned_text_final)

                    if i == 0:
                        # Header row: remember the (cleaned) column names for the data rows
                        column_headers = cleaned_cell_contents
                    else:
                        for k, cleaned_content in enumerate(cleaned_cell_contents):
                            if k < len(column_headers):
                                # Column key derived from the header text (non-word chars -> "_")
                                column_name = re.sub(r'[^a-zA-Z0-9_]', '_', column_headers[k].strip())
                                if not column_name:
                                    column_name = f"Column_{k+1}"

                                # Skip the "CO aligned to" column
                                if "CO" not in column_headers[k] or "aligned" not in column_headers[k]:
                                    row_data[column_name] = cleaned_content
                            else:
                                # More cells than headers: fall back to a positional name
                                row_data[f"Column_{k+1}"] = cleaned_content

                        # Only data rows are stored (the header row is skipped)
                        current_file_extracted_data.append(row_data)
        except Exception as e:
            print(f"Error during data extraction and cleaning for {os.path.basename(docx_file)}: {e}")

        # --- Saving the data for the current file ---
        if current_file_extracted_data:
            print(f"  Data extracted for {os.path.basename(docx_file)}. Preparing to save CSV.")
            try:
                df_current_file = pd.DataFrame(current_file_extracted_data)

                # Build a filesystem-safe CSV name from the DOCX base name
                original_filename_base = os.path.splitext(os.path.basename(docx_file))[0]
                safe_filename_base = re.sub(r'[^\w.-]', '_', original_filename_base)
                output_filename = f"cleaned_extracted_data_{safe_filename_base}.csv"

                # Read output_folder through a lookup so that an undefined variable takes the
                # fallback path instead of raising NameError inside the warning message.
                configured_output_folder = locals().get('output_folder')
                if configured_output_folder is None or not os.path.exists(configured_output_folder):
                    print(f"Warning: output_folder not defined or does not exist ('{configured_output_folder}'). Saving '{output_filename}' to current directory.")
                    output_filepath = output_filename
                else:
                    output_filepath = os.path.join(configured_output_folder, output_filename)

                # index=False keeps the DataFrame index out of the file
                df_current_file.to_csv(output_filepath, index=False)
                print(f"  Successfully saved cleaned data for {os.path.basename(docx_file)} to '{output_filepath}'")

            except Exception as e:
                print(f"Error saving cleaned data for {os.path.basename(docx_file)} to CSV: {e}")
        else:
            print(f"No data extracted for {os.path.basename(docx_file)}. Skipping saving individual CSV.")

In [None]:
# Debug: Check Week column content in a few documents
import re
from docx import Document

# Debug aid: for the first three DOCX files, locate the main course table and
# print the first cell ("Week" column) of its first ten rows.
debug_count = 0

# A table is treated as the main course table when its header row mentions at
# least two of these names (case-insensitive substring match).
target_table_names = ["Learning Outcomes", "Deliverables", "Assessment", "Instructional Materials"]

for debug_count, docx_file in enumerate(docx_files[:3], start=1):
    print(f"\nDEBUG FILE {debug_count}: {os.path.basename(docx_file)}")

    try:
        document = Document(docx_file)

        # Stops scanning at the first paragraph that mentions COURSE OUTLINE.
        course_outline_found = any(
            "COURSE OUTLINE" in paragraph.text.strip().upper()
            for paragraph in document.paragraphs
        )
        if not course_outline_found:
            print("COURSE OUTLINE section not found")
            continue

        for table in document.tables:
            # Tables without a first row or without header cells cannot match.
            if not (table.rows and table.rows[0].cells):
                continue

            first_row_all_text = [cell.text.strip() for cell in table.rows[0].cells]

            # Each target name is recorded at most once, in target order.
            targets_found = [
                target_name
                for target_name in target_table_names
                if any(target_name.lower() in cell_text.lower() for cell_text in first_row_all_text)
            ]
            if len(targets_found) < 2:
                continue

            print(f"Main table found with {len(table.rows)} rows")
            print(f"Headers: {first_row_all_text}")

            print("\nWeek column content (first 10 rows):")
            for i, row in enumerate(table.rows[:10]):
                if not row.cells:
                    continue
                week_content = row.cells[0].text.strip()
                if i == 0:
                    print(f"  Row {i+1} (header): '{week_content}'")
                else:
                    print(f"  Row {i+1}: '{week_content}'")

            # Only the first matching table is reported for each file.
            break

    except Exception as e:
        print(f"Error processing: {e}")

In [None]:
# Debug: Check which documents might not have main tables detected
import re
from docx import Document

# Audit every DOCX file in docx_files: does it mention COURSE OUTLINE, and can
# a main course table be located? Prints one line per file plus a summary.
successful_extractions = 0
failed_extractions = 0
failed_files = []

# A table is treated as the main course table when its header row mentions at
# least two of these names (case-insensitive substring match).
target_table_names = ["Learning Outcomes", "Deliverables", "Assessment", "Instructional Materials"]

print("Checking all documents for main table detection:")
print("=" * 60)

for docx_file in docx_files:
    filename = os.path.basename(docx_file)
    try:
        document = Document(docx_file)
        main_table_found = False

        # Stops scanning at the first paragraph that mentions COURSE OUTLINE.
        course_outline_found = any(
            "COURSE OUTLINE" in paragraph.text.strip().upper()
            for paragraph in document.paragraphs
        )

        if course_outline_found:
            # Check each table for main table characteristics
            for table in document.tables:
                if table.rows and table.rows[0].cells:
                    first_row_all_text = [cell.text.strip() for cell in table.rows[0].cells]

                    # Each target name is recorded at most once.
                    targets_found = [
                        target_name
                        for target_name in target_table_names
                        if any(target_name.lower() in cell_text.lower() for cell_text in first_row_all_text)
                    ]

                    if len(targets_found) >= 2:  # Found main table
                        main_table_found = True
                        successful_extractions += 1
                        print(f"{filename}: Found main table with {len(table.rows)} rows")
                        break

            if not main_table_found:
                failed_extractions += 1
                failed_files.append(docx_file)
                print(f"{filename}: COURSE OUTLINE found but no main table detected")
        else:
            failed_extractions += 1
            failed_files.append(docx_file)
            print(f"{filename}: No COURSE OUTLINE section found")

    except Exception as e:
        # Any error while opening or reading the document counts as a failure
        # for that file; the audit continues with the next one.
        failed_extractions += 1
        failed_files.append(docx_file)
        print(f"{filename}: Error - {e}")

print("=" * 60)
print("Summary:")
print(f"Successful extractions: {successful_extractions}")
print(f"Failed extractions: {failed_extractions}")
print(f"Total files: {len(docx_files)}")

if failed_files:
    # Heading previously read "Filed files" (typo).
    print(f"\nFailed files ({len(failed_files)}):")
    for failed_file in failed_files[:10]:  # Show first 10
        print(f"  - {os.path.basename(failed_file)}")
    if len(failed_files) > 10:
        print(f"  ... and {len(failed_files) - 10} more")

In [None]:
# Check all CSV files in output_folder for characters that can make Excel
# treat a cell as a formula ('=', '+', '-', '@').
import csv
import io
import os

import pandas as pd

formula_triggers = ['=', '+', '-', '@']

print("Checking CSV files for Excel formula triggers...")
print("=" * 50)

# Get all CSV files in the output folder
csv_files_in_output = [f for f in os.listdir(output_folder) if f.endswith('.csv')]

for csv_filename in csv_files_in_output:
    csv_path = os.path.join(output_folder, csv_filename)
    if os.path.exists(csv_path):
        print(f"\n{csv_filename}:")
        try:
            # Read the raw CSV content to check for formula triggers
            with open(csv_path, 'r', encoding='utf-8') as f:
                content = f.read()

            total_triggers = 0
            for trigger in formula_triggers:
                if trigger == '-':
                    # For minus signs, only count those at the beginning of cells (more dangerous).
                    # The text is parsed with csv.reader rather than split on ',' and '\n':
                    # a naive split sees a quoted cell as starting with '"' (so a leading '-'
                    # is missed) and breaks quoted cells apart at embedded commas/newlines
                    # (so mid-cell fragments are counted as if they were cells).
                    dangerous_minus = 0
                    for record in csv.reader(io.StringIO(content)):
                        for cell in record:
                            cell_text = cell.strip()
                            # A lone '-' is not counted.
                            if cell_text.startswith('-') and len(cell_text) > 1:
                                dangerous_minus += 1
                    print(f"   Leading '-': {dangerous_minus}")
                    total_triggers += dangerous_minus
                else:
                    # Coarse count: every occurrence anywhere in the raw text,
                    # not only at the start of a cell.
                    count = content.count(trigger)
                    print(f"   '{trigger}': {count}")
                    total_triggers += count

            if total_triggers == 0:
                print("   Clean - no formula triggers found")
            else:
                print(f"   Warning: {total_triggers} potential triggers found")

            # Check total rows
            df = pd.read_csv(csv_path)
            print(f"   Rows: {len(df)}")

        except Exception as e:
            print(f"   Error: {e}")
    else:
        print(f"{csv_filename}: Not found")

print("\nCheck complete.")