In [None]:
%pip install python-docx

In [None]:
from docx import Document
import pandas as pd
import glob
import os
import re

# === SETTINGS ===
input_folder = "docx_folder"   # source folder with Word files
output_folder = "csv_outputs"  # destination folder for CSVs

# Canonical column name -> header spellings seen in the source documents.
# Matching later compares normalized text (lowercase, alphanumerics only), and
# the variations are tried in the order listed here, so keep the most common
# spellings first. Each variation is listed exactly once.
header_mapping = {
    "Week": [
        "Week",
        "Week No",
        "Week #",
        "Week Number",
        "Week/Date",
        "Weeks",
        "WK",
    ],
    "Learning Outcomes": [
        "Learning Outcomes",
        "Learning Outcome",
        "Learning\nOutcomes",
        "Learning Outcomes (LO) / Learner and Learning Outcomes (LLO)",
        "Learning Outcomes\n(At the end of the session, students are expected to:)",
        "Learning Outcomes\n(At the end of the session, students are expected to :)",
        "Learning \nOutcomes",
    ],
    "Deliverables": [
        "Deliverables Outcomes",
        "Deliverables/Outcomes",
        "Deliverables",
        "Deliverables/\nOutcomes",
        "Deliverables\n/ Outcomes",
        "Deliverables/ Outcomes",
        "Deliverables/\xa0 Outcomes",
        "Deliverables/\xa0\nOutcomes",
        "Deliverable / Outcomes",
        "Deliverables\n/ Outcomes/Rubrics",
    ],
    "Assessments": [
        "Assessments",
        "Assessment",
        "Assessment Task",
        "Assessment\n/ Output",
    ],
}

# Make sure the CSV destination exists before any table is processed.
os.makedirs(output_folder, exist_ok=True)

# === FUNCTIONS ===
def normalize_header(text):
    """
    Reduce header text to a comparison key.

    The text is lowercased and every character other than the ASCII letters
    a-z and the digits 0-9 is dropped (spaces, slashes, newlines, punctuation).
    E.g., 'Deliverables/ Outcomes' -> 'deliverablesoutcomes'
    """
    lowered = text.lower()
    kept = [ch for ch in lowered if 'a' <= ch <= 'z' or '0' <= ch <= '9']
    return ''.join(kept)

# Normalized canonical header -> normalized form of each of its variations
normalized_header_mapping = {}
for canonical_name, variation_list in header_mapping.items():
    normalized_header_mapping[normalize_header(canonical_name)] = list(
        map(normalize_header, variation_list)
    )

# Flat set of every normalized variation across all canonical headers
all_normalized_variations = {
    normalized
    for normalized_list in normalized_header_mapping.values()
    for normalized in normalized_list
}


# === MAIN LOOP ===
# Canonical output columns, in the order they are written to every CSV.
all_canonical_headers = ["Week", "Learning Outcomes", "Deliverables", "Assessments"]

# Normalize every header variation once, instead of once per table.
_normalized_variations = {
    canonical: [normalize_header(variation) for variation in variations]
    for canonical, variations in header_mapping.items()
}


def _headers_match(variation, table_header):
    """Return True when a normalized variation matches a normalized table header.

    A match is an exact match, or one string containing the other provided the
    contained string is longer than 3 characters.
    """
    return (
        variation == table_header
        or (len(variation) > 3 and variation in table_header)
        or (len(table_header) > 3 and table_header in variation)
    )


def _find_column(variations, normalized_headers):
    """Return the index of the first column matching any variation, else None.

    Variations are tried in their listed order; for each variation the table
    columns are scanned left to right.
    """
    for variation in variations:
        for index, table_header in enumerate(normalized_headers):
            if _headers_match(variation, table_header):
                return index
    return None


def _extract_week_numbers(cell_text):
    """Return the week numbers found in a cell as a comma-separated string.

    Stand-alone numbers are captured together with any directly attached
    range/list continuation (e.g. '3-4'). If none are found, any run of digits
    is accepted (e.g. 'Week1'). Duplicates are dropped keeping first-seen order,
    and entries are sorted by their leading number so the output is
    deterministic and ranges sort by where they start.
    """
    numbers = re.findall(r'\b\d+\b(?:[-\.\,]\s*\b\d+\b)*', cell_text)
    if not numbers:
        numbers = re.findall(r'\d+', cell_text)

    unique_numbers = list(dict.fromkeys(numbers))
    # Every entry starts with a digit run, so the leading-number match always succeeds.
    unique_numbers.sort(key=lambda entry: int(re.match(r'\d+', entry).group()))
    return ','.join(unique_numbers)


def _clean_cell_text(cell_text):
    """Collapse all whitespace (including newlines) to single spaces and strip,
    then replace a few known garbled character sequences with ASCII."""
    cell_text = re.sub(r'\s+', ' ', cell_text).strip()
    # NOTE(review): these literals look like mis-decoded punctuation
    # (ellipsis / dashes); confirm they are the sequences that actually occur.
    cell_text = cell_text.replace('â€¦', '...')
    cell_text = cell_text.replace('â€“', '-')
    cell_text = cell_text.replace('â€”', '--')
    return cell_text


def _extract_table_rows(table_number, data_rows, column_map):
    """Return the cleaned data rows of one table in canonical column order.

    table_number: 1-based table position, used only in the debug output.
    data_rows:    the table rows after the header row.
    column_map:   canonical header -> column index in this table.

    Canonical columns that the table does not have are filled with "".
    Rows whose canonical columns are all empty are dropped.
    """
    extracted = []
    for row_idx, row in enumerate(data_rows):
        # Read the row's cells once and reuse them for every column.
        cells = row.cells
        values = {}
        for header, j in column_map.items():
            # Rows can be shorter than the header row; treat missing cells as empty.
            cell_text = cells[j].text.strip() if j < len(cells) else ""

            if header == "Week":
                # row_idx + 2: 1-based numbering, plus the skipped header row.
                print(f"      Table {table_number}, Row {row_idx+2} (Data Row), Week Cell Raw Text: '{cell_text}'") # Debugging print
                cell_text = _extract_week_numbers(cell_text)
                print(f"      Table {table_number}, Row {row_idx+2}, Extracted Week Numbers: '{cell_text}'") # Debugging print

            values[header] = _clean_cell_text(cell_text)

        if any(values.values()):
            extracted.append([values.get(header, "") for header in all_canonical_headers])
    return extracted


for file in sorted(glob.glob(os.path.join(input_folder, "*.docx"))):
    file_name = os.path.basename(file)

    # Word creates '~$name.docx' lock files next to open documents; they match
    # the glob but are not real documents.
    if file_name.startswith("~$"):
        continue

    print(f"\nProcessing DOCX file: {file_name}")

    try:
        doc = Document(file)
    except Exception as e:
        # One unreadable file should not abort the whole batch.
        print(f"  Error opening {file_name}: {e}")
        continue

    file_rows = []          # rows from every relevant table in this file
    extracted_tables = []   # 1-based numbers of the tables that contributed

    for i, table in enumerate(doc.tables):
        rows = list(table.rows)
        if not rows:
            continue  # no header row to inspect

        headers = [cell.text.strip() for cell in rows[0].cells]
        normalized_headers = [normalize_header(h) for h in headers]

        # Map each canonical header to the first table column that matches it.
        column_map = {}
        for canonical_header, variations in _normalized_variations.items():
            column_index = _find_column(variations, normalized_headers)
            if column_index is not None:
                column_map[canonical_header] = column_index

        # A table is considered relevant when at least 2 canonical headers match.
        if len(column_map) < 2:
            continue

        print(f"  Found potentially relevant Table {i+1}.")
        print(f"    Headers in Table {i+1} (first row): {headers}")
        print(f"    Matched canonical headers: {list(column_map)}")

        file_rows.extend(_extract_table_rows(i + 1, rows[1:], column_map))
        extracted_tables.append(i + 1)

    if not extracted_tables:
        print(f"\nNo relevant table with at least 2 of the main headers {list(header_mapping.keys())} found in {file_name}")
        continue

    # Write all relevant tables of this file into ONE CSV. Writing per table to
    # the same path would leave only the last table's rows on disk.
    df = pd.DataFrame(file_rows, columns=all_canonical_headers)
    output_filename = os.path.join(output_folder, os.path.splitext(file_name)[0] + ".csv")
    df.to_csv(output_filename, index=False) # pandas to_csv handles quoting fields with commas/special chars
    for table_number in extracted_tables:
        print(f"    Successfully extracted data from Table {table_number} to {os.path.basename(output_filename)}")


print("\nProcessing complete.")

In [None]:
import pandas as pd
import glob
import os

# Print the current working directory so the relative paths below are easy to debug
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

# Use relative path for the source folder
source_csv_folder = "csv_outputs"

# Check if the folder exists
if not os.path.exists(source_csv_folder):
    print(f"The folder '{source_csv_folder}' does not exist.")
else:
    # List all files in the folder for debugging
    all_files = os.listdir(source_csv_folder)
    print(f"Files in '{source_csv_folder}': {all_files}")

    # Collect the CSV files in sorted order so the combined file is reproducible
    # (glob itself does not guarantee any particular order).
    csv_files = sorted(glob.glob(os.path.join(source_csv_folder, "*.csv")))
    print(f"CSV files found: {csv_files}")

    # Change the output folder for the combined CSV
    output_folder = "combined_csv"
    combined_csv_filename = "combined_syllabi_data.csv"
    combined_csv_filepath = os.path.join(output_folder, combined_csv_filename)

    # Create the output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Check if there are any CSV files to combine
    if not csv_files:
        print(f"No CSV files found in {source_csv_folder} to combine.")
    else:
        # DataFrames that were read successfully
        df_list = []

        for file in csv_files:
            try:
                # Read every column as text and disable default NA detection,
                # so cell text such as "N/A", "NA" or "None" is kept verbatim
                # instead of being turned into a missing value.
                df = pd.read_csv(file, dtype=str, keep_default_na=False)
                df_list.append(df)
                print(f"Read {os.path.basename(file)}")
            except Exception as e:
                # Best effort: report the file and keep combining the rest.
                print(f"Error reading {os.path.basename(file)}: {e}")

        # Concatenate all DataFrames into a single DataFrame
        if df_list:
            combined_df = pd.concat(df_list, ignore_index=True)

            # Save the combined DataFrame to a new CSV file in the specified output folder
            combined_df.to_csv(combined_csv_filepath, index=False)

            # Report the number of files actually combined, not the number found.
            print(f"\nSuccessfully combined {len(df_list)} CSV files into {combined_csv_filename}")
            print(f"Combined CSV saved to: {combined_csv_filepath}")
        else:
            print("No DataFrames were successfully read to combine.")