In [1]:
%pip install python-docx

Note: you may need to restart the kernel to use updated packages.


    extract-msg (<=0.29.*)
                 ~~~~~~~^


In [2]:
from docx import Document
import pandas as pd
import glob
import os
import re

# === SETTINGS ===
# Folder that is scanned for Word (.docx) files.
input_folder = "docx_folder"
# Folder that receives one CSV per processed document.
output_folder = "csv_outputs"

# Canonical column name -> header spellings that may appear in the source
# documents. "Week" is listed so that the week column is recognised as well.
header_mapping = {
    "Week": [
        "Week",
    ],
    "Learning Outcomes": [
        "Learning Outcomes",
        "Learning Outcome",
        "Learning\nOutcomes",
    ],
    "Deliverables Outcomes": [
        "Deliverables Outcomes",
        "Deliverables/Outcomes",
        "Deliverables",
        "Deliverables/\nOutcomes",
        "Deliverables\n/ Outcomes",
    ],
    "Assessments": [
        "Assessments",
        "Assessment",
    ],
}

# Make sure the destination folder exists before anything is written to it.
os.makedirs(output_folder, exist_ok=True)

# === FUNCTIONS ===
def normalize_header(text):
    """
    Reduce a header string to a compact comparison key.

    The text is lowercased and every character other than ASCII letters
    a-z and digits 0-9 (spaces, newlines, slashes, punctuation, ...) is
    dropped, e.g. 'Deliverables/ Outcomes' -> 'deliverablesoutcomes'.
    """
    lowered = text.lower()
    # Collect the runs of allowed characters and glue them back together.
    kept_runs = re.findall(r'[a-z0-9]+', lowered)
    return ''.join(kept_runs)

# Pre-normalized lookup table: normalized canonical header -> list of its
# normalized variations (same order as in header_mapping).
normalized_header_mapping = {}
for canonical_name, raw_variations in header_mapping.items():
    normalized_header_mapping[normalize_header(canonical_name)] = [
        normalize_header(raw_variation) for raw_variation in raw_variations
    ]

# Every normalized variation we are looking for, flattened into one set.
all_normalized_variations = {
    normalized_variation
    for normalized_list in normalized_header_mapping.values()
    for normalized_variation in normalized_list
}


# === MAIN LOOP ===
# For every Word document in input_folder: look at each table, match its first
# row against the canonical headers, and export the matched columns of every
# relevant table (at least 2 canonical headers matched) to a CSV file.
for file in glob.glob(os.path.join(input_folder, "*.docx")):
    file_name = os.path.basename(file)

    # Word keeps a "~$<name>.docx" lock file next to any document that is
    # currently open. It matches the glob but is not a real .docx package, so
    # Document() would fail on it -- skip it instead of aborting the whole run.
    if file_name.startswith("~$"):
        continue

    doc = Document(file)
    file_stem = os.path.splitext(file_name)[0]
    relevant_table_count = 0  # Relevant tables already written for this file

    for i, table in enumerate(doc.tables):
        # A table without rows has no header row to inspect
        if len(table.rows) == 0:
            continue

        headers = [cell.text.strip() for cell in table.rows[0].cells]
        normalized_headers = [normalize_header(h) for h in headers]

        matched_canonical_headers = []  # Canonical names, used as CSV column names
        col_indices = []                # Table column index for each matched name

        for canonical_header in header_mapping:
            # Variations were normalized once up front; no need to redo it per table
            normalized_variations = normalized_header_mapping[normalize_header(canonical_header)]

            # First column whose normalized header contains one of the variations.
            # Variations are tried in their listed order, columns left to right.
            found_variation_index = next(
                (
                    j
                    for normalized_variation in normalized_variations
                    for j, normalized_table_header in enumerate(normalized_headers)
                    if normalized_variation in normalized_table_header
                ),
                -1,
            )

            if found_variation_index != -1:
                col_indices.append(found_variation_index)
                matched_canonical_headers.append(canonical_header)

        # Fewer than 2 matched canonical headers: not a table we are interested in
        if len(matched_canonical_headers) < 2:
            continue

        print(f"\nProcessing file: {os.path.basename(file)}") # Print file name only when a relevant table is found
        print(f"  Found potentially relevant Table {i+1}.")
        print(f"    Headers in Table {i+1} (first row): {headers}")
        print(f"    Matched canonical headers: {matched_canonical_headers}")

        data = []
        for row in table.rows[1:]: # Skip header row
            # row.cells is rebuilt by python-docx on every access, so read it once per row
            cells = row.cells
            row_data = []
            for j, header in zip(col_indices, matched_canonical_headers):
                # Rows can be shorter than the header row; use "" for missing cells
                cell_text = cells[j].text.strip() if j < len(cells) else ""

                # Replace the hyphen with a comma in week ranges (e.g. "1-2" -> "1,2")
                if header == "Week":
                    cell_text = cell_text.replace("-", ",")

                row_data.append(cell_text)
            data.append(row_data)

        # The first relevant table keeps the plain "<document>.csv" name. Any
        # further relevant table of the same document gets a "_table<N>" suffix
        # so that it no longer overwrites the CSV written for an earlier table.
        suffix = "" if relevant_table_count == 0 else f"_table{i+1}"
        output_filename = os.path.join(output_folder, f"{file_stem}{suffix}.csv")

        # Create DataFrame and save to CSV
        df = pd.DataFrame(data, columns=matched_canonical_headers)
        df.to_csv(output_filename, index=False)
        print(f"    Successfully extracted data from Table {i+1} to {os.path.basename(output_filename)}")
        relevant_table_count += 1
        # No break: keep checking the remaining tables of this document

    if relevant_table_count == 0:
        print(f"\nNo relevant table with at least 2 of the main headers {list(header_mapping.keys())} found in {os.path.basename(file)}")


print("\nProcessing complete.")


Processing file: AUTOMAT_SYLLABUS_2ndTerm_2024_ver3.0.docx
  Found potentially relevant Table 7.
    Headers in Table 7 (first row): ['Week', 'Topics', 'Learning Activities', 'Learning Outcomes', 'CO aligned to:', 'Instructional Materials', 'Deliverables/\nOutcomes', 'Assessment']
    Matched canonical headers: ['Week', 'Learning Outcomes', 'Deliverables Outcomes', 'Assessments']
    Successfully extracted data from Table 7 to AUTOMAT_SYLLABUS_2ndTerm_2024_ver3.0.csv

Processing file: CLDCOMP_Syllabus 2023-2024.docx
  Found potentially relevant Table 5.
    Headers in Table 5 (first row): ['Week', 'Topics', 'Learning Activities', 'Learning Outcomes', 'Instructional Materials', 'Deliverables/\nOutcomes', 'Assessment']
    Matched canonical headers: ['Week', 'Learning Outcomes', 'Deliverables Outcomes', 'Assessments']
    Successfully extracted data from Table 5 to CLDCOMP_Syllabus 2023-2024.csv

Processing file: CLDSRV2_Syllabus 2023_2024.docx
  Found potentially relevant Table 5.
    

In [3]:
import pandas as pd
import glob
import os

# Combine every per-document CSV in csv_outputs into one CSV file.

# Print the current working directory so the relative paths below can be checked
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

# Use relative path for the source folder
source_csv_folder = "csv_outputs"

# Check if the folder exists
if not os.path.exists(source_csv_folder):
    print(f"The folder '{source_csv_folder}' does not exist.")
else:
    # List all files in the folder for debugging
    all_files = os.listdir(source_csv_folder)
    print(f"Files in '{source_csv_folder}': {all_files}")

    # Get a list of all CSV files in the source folder. glob returns them in
    # filesystem order, so sort to make the row order of the combined CSV
    # reproducible between runs and machines.
    csv_files = sorted(glob.glob(os.path.join(source_csv_folder, "*.csv")))
    print(f"CSV files found: {csv_files}")

    # Change the output folder for the combined CSV
    output_folder = "combined_csv"
    combined_csv_filename = "combined_syllabi_data.csv"
    combined_csv_filepath = os.path.join(output_folder, combined_csv_filename)

    # Create the output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Check if there are any CSV files to combine
    if not csv_files:
        print(f"No CSV files found in {source_csv_folder} to combine.")
    else:
        # DataFrames that were read successfully, in csv_files order
        df_list = []

        # Read each CSV file into a DataFrame and append to the list
        for file in csv_files:
            try:
                # Keep "Week" as text so values such as "1,2" are not reinterpreted
                df = pd.read_csv(file, dtype={'Week': str})
                df_list.append(df)
                print(f"Read {os.path.basename(file)}")
            except Exception as e:
                # Best effort: report the unreadable file and continue with the rest
                print(f"Error reading {os.path.basename(file)}: {e}")

        # Concatenate all DataFrames into a single DataFrame
        if df_list:
            combined_df = pd.concat(df_list, ignore_index=True)

            # Save the combined DataFrame to a new CSV file in the specified output folder
            combined_df.to_csv(combined_csv_filepath, index=False)

            # Report the number of files actually combined; files that failed
            # to read above are not part of combined_df.
            print(f"\nSuccessfully combined {len(df_list)} CSV files into {combined_csv_filename}")
            print(f"Combined CSV saved to: {combined_csv_filepath}")
        else:
            print("No DataFrames were successfully read to combine.")

Current working directory: c:\REPO\Syllabi-Verification-Py-Model\Data Processing
Files in 'csv_outputs': ['AUTOMAT_SYLLABUS_2ndTerm_2024_ver3.0.csv', 'CLDCOMP_Syllabus 2023-2024.csv', 'CLDSRV2_Syllabus 2023_2024.csv', 'CLDSRV2_Syllabus_2024.csv', 'COMPORG_SYLLABUS_AY2024_2025.csv', 'COMSEC2_Syllabus_2024.csv', 'COMSEC3_Syllabus_2024 - 2025.csv', 'COMSECT_Syllabus_2024.csv', 'Course Syllabus PHYSICS1 Natural Physics 1 for IT.csv', 'Course Syllabus PHYSICS2 Natural Physics 2 for IT.csv', 'CRISKMA Syllabus AY 2024 - 2025.csv', 'CSPROJ2 Course Syllabus AY2024-2025 (KRC)v.03.csv', 'DASTRUC_Syllabus T1 AY 2024-2025 Revise version.csv', 'DATAMA1_SYLLABUS_2024_1st_term_Version.csv', 'DATAMA1_SYLLABUS_Ver3.0_2023.csv', 'DATAMA2_SYLLABUS_1st_Term_2024_version.csv', 'DATAMA2_SYLLABUS_2nd_Term_2024_version3.0.csv', 'DESALGO_Syllabus_2024.csv', 'ENTJAVA (Using C# .NET) Course Syllabus 2023-2024.csv', 'ICTSRV1_Syllabus_2024.csv', 'INFOSEC_Syllabus_2023.csv', 'INFOSEC_Syllabus_T1_AY_2024_2025.csv', '