In [None]:
# Install pandas into the kernel's environment; it is imported as `pd` by the cells below.
%pip install pandas

In [None]:

# NOTE(review): the import cell below uses the standard-library `glob` module,
# not `glob2`; this install looks unnecessary -- confirm before removing it.
%pip install glob2

In [None]:
import pandas as pd
import glob
import os
import re
from pathlib import Path
import logging
from typing import List, Dict, Optional, Tuple

# Setup logging
# Configure the root logger at INFO level with a "timestamp - level - message" format.
# NOTE(review): the cells visible in this notebook report progress with print();
# no logging calls are visible here -- confirm whether this configuration is still needed.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

print("packages are imported successfully")

In [None]:
import os 

INPUT_CSV_DIRECTORY = "csv_outputs"          # directory scanned for input *.csv files
OUTPUT_DIRECTORY = "cleaned_csv"             # destination for the per-file "<name>_cleaned.csv" outputs
COMBINED_OUTPUT_DIR = "combined_csv"         # destination for the single combined CSV

# define headers and their variations (coming from the docx extraction)
# Keys are the canonical column names; each value lists the header spellings that
# should be mapped onto that canonical name.
# The "\\n" and "\\xa0" sequences below are a literal backslash followed by
# "n" / "xa0", not a real newline or non-breaking space.
# NOTE(review): confirm the extracted CSV headers really contain that literal
# escape text rather than the control characters themselves.
# NOTE(review): "Learning\\nOutcomes" appears twice in the "Learning Outcomes" list.
HEADER_MAPPING = {
    "Week": [
        "Week", "Week No", "Week #", "Week Number", "Week/Date"
    ],
    "Learning Outcomes": [
        "Learning Outcomes", "Learning Outcome", "Learning\\nOutcomes",
        "Learning Outcomes (LO) / Learner and Learning Outcomes (LLO)",
        "Learning Outcomes\\n(At the end of the session, students are expected to:)",
        "Learning Outcomes\\n(At the end of the session, students are expected to :)",
        "Learning\\nOutcomes", "Learning \\nOutcomes"
    ],
    "Deliverables": [
        "Deliverables Outcomes", "Deliverables/Outcomes", "Deliverables", 
        "Deliverables/\\nOutcomes", "Deliverables\\n/ Outcomes", "Deliverables/ Outcomes",
        "Deliverables/\\xa0 Outcomes", "Deliverables/\\xa0\\nOutcomes", 
        "Deliverable / Outcomes", "Deliverables\\n/ Outcomes/Rubrics"
    ],
    "Assessments": [
        "Assessments", "Assessment", "Assessment Task", "Assessment\\n/ Output"
    ]
}

# create output directories 
# exist_ok=True makes this cell safe to re-run when the directories already exist.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
os.makedirs(COMBINED_OUTPUT_DIR, exist_ok=True)

print(f"Input directory: {INPUT_CSV_DIRECTORY}")
print(f"Output directory: {OUTPUT_DIRECTORY}")
print(f"Combined output directory: {COMBINED_OUTPUT_DIR}")
print("Configuration completed successfully")

In [None]:
def normalize_header(text):
    """
    Reduce a header string to a comparison key.

    The text is lower-cased and every character other than ASCII ``a``-``z``
    and ``0``-``9`` is dropped (spaces, slashes, punctuation, line breaks, ...).
    """
    lowered = text.lower()
    kept = [ch for ch in lowered if 'a' <= ch <= 'z' or '0' <= ch <= '9']
    return ''.join(kept)

def find_csv_files(directory):
    """
    Return the paths of all ``*.csv`` files directly inside ``directory``.

    The result is sorted so that processing order (and therefore every
    downstream output) is the same on every run; ``glob.glob`` itself makes no
    ordering guarantee. A missing or empty directory yields an empty list.
    """
    csv_pattern = os.path.join(directory, "*.csv")
    return sorted(glob.glob(csv_pattern))

def match_headers_to_canonical(df_headers, header_mapping):
    """
    Match DataFrame headers to canonical headers using normalized substring matching.

    For each canonical header (in mapping order) its variations are tried in
    order; the first column whose normalized header contains the normalized
    variation, or is contained in it, is selected.

    Two guards keep the loose matching from producing bogus results:
      * headers (or variations) that normalize to an empty string are ignored,
        because an empty string is a substring of every other string;
      * a column already claimed by an earlier canonical header is not reused,
        so no column is ever returned twice.

    Returns:
        (matched_canonical_headers, col_indices, final_headers): three parallel
        lists holding, per match, the canonical name, the column position in
        ``df_headers``, and the output header name (same as the canonical name).
    """
    matched_canonical_headers = []
    col_indices = []
    final_headers = []

    # str() so that non-string column labels cannot crash normalization
    normalized_df_headers = [normalize_header(str(h)) for h in df_headers]
    claimed_indices = set()

    for canonical_header, possible_variations in header_mapping.items():
        found_variation_index = -1

        for variation in possible_variations:
            normalized_variation = normalize_header(variation)
            if not normalized_variation:
                continue

            for j, normalized_table_header in enumerate(normalized_df_headers):
                if not normalized_table_header or j in claimed_indices:
                    continue
                if (normalized_variation in normalized_table_header
                        or normalized_table_header in normalized_variation):
                    found_variation_index = j
                    break

            if found_variation_index != -1:
                break

        if found_variation_index != -1:
            claimed_indices.add(found_variation_index)
            col_indices.append(found_variation_index)
            final_headers.append(canonical_header)
            matched_canonical_headers.append(canonical_header)

    return matched_canonical_headers, col_indices, final_headers

def _alpha_label(index):
    """Return the lowercase list label for a zero-based index: a..z, then aa, ab, ..."""
    label = ''
    index += 1
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        label = chr(ord('a') + remainder) + label
    return label


def clean_text_data(df):
    """
    Clean the "Learning Outcomes", "Deliverables" and "Assessments" columns.

    For each of those columns that holds text, the function:
      1. turns missing values into empty strings,
      2. replaces literal escape text (e.g. a backslash followed by "n") and
         common mojibake sequences with plain ASCII equivalents,
      3. collapses repeated spaces and strips the cell,
      4. splits the cell on bullet characters / line breaks / a leading dash and
         re-joins the pieces as "a. ... b. ..." (pieces that already start with
         a letter or number marker are kept as they are),
      5. removes any bullet or dash still left at the start of the cell.

    Other columns are returned untouched. The input frame is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    df_clean = df.copy()

    # Bullet glyphs seen in extracted text; the \u escapes are resolved by the regex engine.
    bullet_chars = r'•\u2022\uf0b7\uf0a7\uf0d8»'
    split_re = re.compile(r'(?:[' + bullet_chars + r']|\n|\r|^\s*[\-–—]\s*)')
    existing_marker_re = re.compile(r'^\s*([a-z]\.|\d+\.)\s', re.IGNORECASE)
    # One flat character class: bullets plus the dash variants.
    leading_marker_pattern = r'^\s*[' + bullet_chars + r'\-—–]\s*'

    # Order matters: the bare 'â€' must come after the longer sequences that start with it.
    # NOTE(review): the first group matches *literal* backslash sequences, not the
    # control characters themselves -- confirm that is what the input contains.
    literal_replacements = [
        ('\\n', ' '),
        ('\\xa0', ' '),      # Non-breaking space
        ('\\u2013', '-'),    # En dash
        ('\\u2014', '--'),   # Em dash
        ('\\u2019', "'"),    # Right single quotation mark (apostrophe)
        ('\\u201c', '"'),    # Left double quotation mark
        ('\\u201d', '"'),    # Right double quotation mark
        ('\\u2026', '...'),  # Horizontal ellipsis
        ('â€¦', '...'),      # Horizontal ellipsis
        ('â€“', '-'),        # En dash
        ('â€”', '--'),       # Em dash
        ('â€™', "'"),        # Apostrophe
        ('â€œ', '"'),        # Left double quote
        ('â€', '"'),         # Right double quote
    ]

    def format_as_alphabetical_list(text):
        """Split ``text`` into list items and label them a., b., c., ..."""
        if not text:
            return ""
        items = [part.strip() for part in split_re.split(text) if part.strip()]
        if not items:
            return text  # nothing but separators; leave for the leading-marker cleanup
        formatted_items = [
            item if existing_marker_re.match(item) else f"{_alpha_label(i)}. {item}"
            for i, item in enumerate(items)
        ]
        return ' '.join(formatted_items)

    for col in df_clean.columns:
        if col not in ["Learning Outcomes", "Deliverables", "Assessments"]:
            continue
        column = df_clean[col]
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            continue

        # Blank out missing values *before* str conversion so no "nan" text is created.
        cleaned = column.where(column.notna(), '').astype(str)

        for old, new in literal_replacements:
            cleaned = cleaned.str.replace(old, new, regex=False)

        cleaned = cleaned.str.replace('  +', ' ', regex=True).str.strip()

        # Whole-cell match only: a substring match would damage words such as "finance".
        cleaned = cleaned.replace(['nan', 'NaN'], '')

        cleaned = cleaned.apply(format_as_alphabetical_list)
        cleaned = cleaned.str.replace(leading_marker_pattern, '', regex=True)

        df_clean[col] = cleaned.str.strip()

    return df_clean

# Progress marker printed once the helper definitions in this cell have been executed.
print("functions defined successfully")

In [None]:
import os
import re
import pandas as pd
import glob

# These repeat the values assigned in the configuration cell earlier in the notebook.
# NOTE(review): keep the two places in sync, or drop this copy if the configuration
# cell is always run first.
INPUT_CSV_DIRECTORY = "csv_outputs"
OUTPUT_DIRECTORY = "cleaned_csv"
COMBINED_OUTPUT_DIR = "combined_csv"

def normalize_header(text):
    """
    Build the comparison key for a header.

    Lower-cases ``text`` and keeps only ASCII letters and digits; spaces,
    slashes, punctuation and any other character are removed.

    A function with the same name is also defined in an earlier cell; this
    later definition is the one in effect after the cells run in order.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyz0123456789")
    return ''.join(filter(allowed.__contains__, text.lower()))

def find_csv_files(directory):
    """
    Return the paths of all ``*.csv`` files directly inside ``directory``.

    The list is sorted: ``glob.glob`` makes no ordering guarantee, and a fixed
    order keeps the processing log and the saved outputs reproducible.
    A missing or empty directory yields an empty list.

    A function with the same name is also defined in an earlier cell; this
    later definition is the one in effect after the cells run in order.
    """
    csv_pattern = os.path.join(directory, "*.csv")
    return sorted(glob.glob(csv_pattern))

def match_headers_to_canonical(df_headers, header_mapping):
    """
    Match DataFrame headers to canonical headers using normalized substring matching.

    For each canonical header (in mapping order) its variations are tried in
    order; the first column whose normalized header contains the normalized
    variation, or is contained in it, is selected.

    Two guards keep the loose matching from producing bogus results:
      * headers (or variations) that normalize to an empty string are ignored,
        because an empty string is a substring of every other string;
      * a column already claimed by an earlier canonical header is not reused,
        so no column is ever returned twice.

    A function with the same name is also defined in an earlier cell; this
    later definition is the one in effect after the cells run in order.

    Returns:
        (matched_canonical_headers, col_indices, final_headers): three parallel
        lists holding, per match, the canonical name, the column position in
        ``df_headers``, and the output header name (same as the canonical name).
    """
    matched_canonical_headers = []
    col_indices = []
    final_headers = []

    # str() so that non-string column labels cannot crash normalization
    normalized_df_headers = [normalize_header(str(h)) for h in df_headers]
    claimed_indices = set()

    for canonical_header, possible_variations in header_mapping.items():
        found_variation_index = -1

        for variation in possible_variations:
            normalized_variation = normalize_header(variation)
            if not normalized_variation:
                continue

            for j, normalized_table_header in enumerate(normalized_df_headers):
                if not normalized_table_header or j in claimed_indices:
                    continue
                if (normalized_variation in normalized_table_header
                        or normalized_table_header in normalized_variation):
                    found_variation_index = j
                    break

            if found_variation_index != -1:
                break

        if found_variation_index != -1:
            claimed_indices.add(found_variation_index)
            col_indices.append(found_variation_index)
            final_headers.append(canonical_header)
            matched_canonical_headers.append(canonical_header)

    return matched_canonical_headers, col_indices, final_headers

def clean_text_data(df):
    """
    Clean every text column of ``df`` and return the result as a new frame.

    For each text column the function, in order:
      1. turns missing values into empty strings and strips the cell,
      2. blanks cells whose whole content is "nan" / "NaN",
      3. replaces literal escape text (e.g. a backslash followed by "n") and
         common mojibake sequences with plain equivalents,
      4. removes one bullet or dash at the start of the cell,
      5. collapses repeated spaces and strips again.

    Non-text columns are returned untouched and the input frame is not modified.

    A function with the same name is also defined in an earlier cell; this
    later definition is the one in effect after the cells run in order.
    NOTE(review): the earlier definition also formats bullets as "a. b. c."
    lists -- confirm which behaviour is wanted.
    """
    df_clean = df.copy()

    # NOTE(review): the first six entries match *literal* backslash sequences,
    # not the control/bullet characters themselves -- confirm that is what the
    # extracted CSVs contain.
    literal_replacements = (
        ('\\n', ' '),
        ('\\xa0', ' '),     # Non-breaking space
        ('\\u2022', ' '),   # Bullet point
        ('\\uf0b7', ' '),   # Another common bullet point character
        ('\\uf0a7', ' '),   # Yet another bullet point character
        ('\\uf0d8', ' '),   # Arrow bullet point
        ('â€¦', '...'),     # Horizontal ellipsis
        ('â€“', '-'),       # En dash
        ('â€”', '--'),      # Em dash
    )
    leading_marker_pattern = r'^\s*[•\u2022\uf0b7\uf0a7\uf0d8»\-—–]\s*'

    for col in df_clean.columns:
        column = df_clean[col]
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            continue

        # Blank out missing values *before* str conversion so no "nan" text is created.
        cleaned = column.where(column.notna(), '').astype(str).str.strip()

        # Whole-cell match only: a substring match would damage words such as
        # "finance" or "maintenance".
        cleaned = cleaned.replace(['nan', 'NaN'], '')

        for old, new in literal_replacements:
            cleaned = cleaned.str.replace(old, new, regex=False)

        cleaned = cleaned.str.replace(leading_marker_pattern, '', regex=True)
        cleaned = cleaned.str.replace('  +', ' ', regex=True)

        df_clean[col] = cleaned.str.strip()

    return df_clean


def _read_csv_with_fallback(file_path):
    """Read ``file_path`` with every column as text, trying several encodings.

    Encodings are tried from strictest to most permissive: utf-8, then cp1252,
    then latin-1 (which accepts any byte sequence, so it must come last).

    Returns:
        (DataFrame, encoding name that succeeded)
    """
    last_error = None
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        try:
            # dtype=str keeps cells exactly as written. With numeric inference a
            # week column containing blanks becomes float, and "1" turns into "1.0".
            return pd.read_csv(file_path, encoding=encoding, dtype=str), encoding
        except UnicodeDecodeError as exc:
            last_error = exc
    raise last_error


def process_single_csv(file_path, header_mapping, add_source_info=True):
    """
    Process a single CSV file and extract the canonical columns.

    Steps: read the file as text, map its headers to canonical names with
    ``match_headers_to_canonical``, clean the text with ``clean_text_data``,
    drop rows where every canonical column is empty, reduce "Week" to a
    comma-separated list of the numbers it contains, normalise dashes in the
    text columns, drop canonical columns that ended up completely empty, and
    optionally add "Source_File" / "Course_Code" columns.

    Args:
        file_path: path of the CSV file to read.
        header_mapping: dict of canonical header -> list of accepted variations.
        add_source_info: when True, add "Source_File" (the file name) and
            "Course_Code" (leading capital letters + digits of the file name,
            or '' when the name does not start that way).

    Returns:
        The cleaned DataFrame (possibly with zero rows), or None when the file
        is empty, cannot be read, or any other error occurs. Errors are printed,
        not raised.
    """
    all_canonical_headers = ["Week", "Learning Outcomes", "Deliverables", "Assessments"]
    text_headers = ["Learning Outcomes", "Deliverables", "Assessments"]

    try:
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")

        # read CSV with encoding handling
        try:
            df, encoding = _read_csv_with_fallback(file_path)
        except UnicodeDecodeError as e:
            print(f"  Error reading CSV with common encodings: {e}")
            return None
        if encoding != 'utf-8':
            print(f"  Used {encoding} encoding")

        if df.empty:
            print(f"  Warning: Empty file")
            return None  # return None for empty files

        print(f"  Original shape: {df.shape}")
        print(f"  Original headers: {list(df.columns)}")

        # match headers to canonical ones
        df_headers = list(df.columns)
        matched_canonical_headers, col_indices, final_headers = match_headers_to_canonical(
            df_headers, header_mapping
        )
        print(f"  Matched canonical headers: {matched_canonical_headers}")

        # Pick the matched columns by position. Each source column and each
        # canonical name is used at most once, so a column that matched two
        # canonical headers cannot produce duplicate labels downstream.
        extracted = {}
        used_indices = set()
        for canonical, idx in zip(final_headers, col_indices):
            if not 0 <= idx < df.shape[1]:
                continue
            if idx in used_indices or canonical in extracted:
                continue
            if canonical not in all_canonical_headers:
                continue
            used_indices.add(idx)
            extracted[canonical] = df.iloc[:, idx]

        # Always expose all four canonical columns; unmatched ones start out empty.
        extracted_df = (
            pd.DataFrame(extracted, index=df.index)
            .reindex(columns=all_canonical_headers)
            .astype(object)
        )

        # Clean the data
        cleaned_df = clean_text_data(extracted_df)

        # Drop rows in which every canonical column is empty after cleaning.
        cleaned_df = cleaned_df.replace('', pd.NA)
        cleaned_df = cleaned_df.dropna(subset=all_canonical_headers, how='all').fillna('')

        # Make every canonical column plain text. Only cells that are exactly
        # "nan"/"NaN" are blanked; a substring match would damage real words.
        for col in all_canonical_headers:
            if col in cleaned_df.columns:
                cleaned_df[col] = cleaned_df[col].astype(str).replace(['nan', 'NaN'], '')

        # "Week": keep only the numbers, comma-separated ("Week 1-3" -> "1,3").
        if "Week" in cleaned_df.columns:
            cleaned_df["Week"] = cleaned_df["Week"].apply(
                lambda value: ','.join(re.findall(r'\d+', str(value)))
            )

        # Text columns: unify dash variants and strip one leading dash/bullet.
        for col in text_headers:
            if col in cleaned_df.columns:
                cleaned_df[col] = cleaned_df[col].str.replace(r'[\u2013\u2014\u2012]', '-', regex=True)
                cleaned_df[col] = cleaned_df[col].str.replace(r'^\s*[\-•»]\s*', '', regex=True)

        # Drop columns that are entirely empty (or whitespace) after cleaning.
        cols_to_drop = [
            col for col in cleaned_df.columns
            if cleaned_df[col].astype(str).str.strip().eq('').all()
        ]
        if cols_to_drop:
            print(f"  Dropping empty columns: {cols_to_drop}")
            cleaned_df = cleaned_df.drop(columns=cols_to_drop)

        # add source information if requested
        if add_source_info:
            cleaned_df['Source_File'] = filename

            # extract course code from filename
            course_match = re.match(r'^([A-Z]+[0-9]*)', filename)
            cleaned_df['Course_Code'] = course_match.group(1) if course_match else ''

        # reset index
        cleaned_df = cleaned_df.reset_index(drop=True)

        print(f"  Final shape: {cleaned_df.shape}")
        print(f"  Successfully processed")

        return cleaned_df

    except Exception as e:
        # Best-effort batch processing: report the problem and let the caller
        # record this file as failed instead of stopping the whole run.
        print(f"  Error processing {os.path.basename(file_path)}: {str(e)}")
        return None


# process all CSV files
csv_files = find_csv_files(INPUT_CSV_DIRECTORY)
print(f"Starting to process {len(csv_files)} CSV files...")
print("=" * 60)

# processed_files: every file that did not fail (including files that yielded no rows)
# failed_files:    files for which process_single_csv returned None
# all_dataframes:  only the non-empty results, so it is NOT index-aligned with processed_files
processed_files, failed_files, all_dataframes = [], [], []

for file_path in csv_files:
    # HEADER_MAPPING comes from the configuration cell
    cleaned_df = process_single_csv(file_path, HEADER_MAPPING, add_source_info=True)

    if cleaned_df is None:
        # empty file or an error inside process_single_csv
        failed_files.append(file_path)
        print(f"  Failed to process")
        continue

    processed_files.append(file_path)
    if cleaned_df.empty:
        # processed without error, but nothing survived cleaning
        print(f"  Success: No rows extracted after cleaning")
    else:
        all_dataframes.append(cleaned_df)
        print(f"  Success: {len(cleaned_df)} rows extracted")


print("=" * 60)
print(f"  Processing Summary:")
print(f"  Total files: {len(csv_files)}")
print(f"  Successfully processed: {len(processed_files)}")
print(f"  Failed: {len(failed_files)}")
# guard against division by zero when no CSV files were found
success_rate = len(processed_files) / len(csv_files) * 100 if len(csv_files) > 0 else 0
print(f"  Success rate: {success_rate:.1f}%")

if failed_files:
    print(f"\nFailed files:")
    for failed_file in failed_files:
        print(f"  - {os.path.basename(failed_file)}")

    # show the header line of each failed file to help diagnose the failure
    print("\nheaders:")
    for failed_file in failed_files:
        failed_name = os.path.basename(failed_file)
        try:
            # only the first line is read, so a large or malformed file is not loaded in full
            with open(failed_file, 'r', encoding='utf-8') as f:
                headers = f.readline().strip().split(',')  # simple split, assumes a comma delimiter
            print(f"  - {failed_name} headers: {headers}")
        except Exception as e:
            print(f"  - {failed_name} error reading headers: {e}")


print("\nIndividual processing completed")

In [None]:
def save_individual_files(dataframes_list, original_files_list, output_directory):
    """
    Write each DataFrame to ``<output_directory>/<original stem>_cleaned.csv``.

    The two lists are paired positionally; pairing stops at the shorter list.
    A failure on one file is printed and does not stop the remaining files.

    Returns:
        List of the paths (as strings) that were written successfully.
    """
    print(f"Saving {len(dataframes_list)} cleaned CSV files...")
    print("-" * 40)

    written_paths = []
    for frame, source_path in zip(dataframes_list, original_files_list):
        try:
            # "<stem>_cleaned.csv" inside the output directory
            cleaned_filename = Path(source_path).stem + "_cleaned.csv"
            target_path = Path(output_directory) / cleaned_filename

            frame.to_csv(target_path, index=False)
            written_paths.append(str(target_path))

            print(f"Saved: {cleaned_filename} ({len(frame)} rows)")
        except Exception as e:
            print(f"Error saving {os.path.basename(source_path)}: {str(e)}")

    return written_paths

# Save individual cleaned files
if all_dataframes:
    # all_dataframes holds only the non-empty results, while processed_files also
    # lists files that yielded no rows, so pairing the two lists by position would
    # save data under the wrong file name. Each frame carries its own origin in
    # the Source_File column (added by process_single_csv), so use that instead.
    processed_path_by_name = {os.path.basename(path): path for path in processed_files}
    dataframe_source_files = [
        processed_path_by_name.get(frame['Source_File'].iloc[0], frame['Source_File'].iloc[0])
        for frame in all_dataframes
    ]

    saved_individual_files = save_individual_files(all_dataframes, dataframe_source_files, OUTPUT_DIRECTORY)
    
    print("-" * 40)
    print(f"Individual file saving completed")
    print(f"Saved {len(saved_individual_files)} files to: {OUTPUT_DIRECTORY}")
    
    # Show first few saved files
    print(f"\nFirst 5 saved files:")
    for i, saved_file in enumerate(saved_individual_files[:5]):
        print(f"  {i+1}. {os.path.basename(saved_file)}")
    
    if len(saved_individual_files) > 5:
        print(f"  ... and {len(saved_individual_files) - 5} more files")
        
else:
    print("No dataframes to save - check previous steps for errors")

print("\nIndividual file saving completed")

In [None]:
def create_combined_dataset(dataframes_list, output_directory, filename="combined_cleaned_syllabi_data.csv"):
    """
    Combine all cleaned DataFrames into a single CSV file.

    The frames are concatenated, ordered by "Source_File" (when that column
    exists) and written to ``output_directory/filename``. A summary of row
    counts per source file and of non-empty cells per text column is printed.

    Args:
        dataframes_list: list of DataFrames to combine.
        output_directory: existing directory for the combined file.
        filename: name of the combined CSV file.

    Returns:
        The output path as a string, or None when ``dataframes_list`` is empty.
    """
    if not dataframes_list:
        print("No dataframes to combine")
        return None

    print(f"Creating combined dataset from {len(dataframes_list)} files...")

    # Combine all dataframes
    combined_df = pd.concat(dataframes_list, ignore_index=True)

    # Sort by source file for better organization. A stable sort is required so
    # that the rows of each file keep their original (week) order.
    if 'Source_File' in combined_df.columns:
        combined_df = combined_df.sort_values('Source_File', kind='mergesort').reset_index(drop=True)
        print("Sorted by source file")

    # Create output path
    output_path = Path(output_directory) / filename

    # Save combined file
    combined_df.to_csv(output_path, index=False)

    total_rows = len(combined_df)
    print(f"Combined dataset saved to: {output_path}")
    print(f"Total rows in combined dataset: {total_rows}")
    print(f"Columns: {list(combined_df.columns)}")

    # Show summary by source file
    if 'Source_File' in combined_df.columns:
        file_counts = combined_df['Source_File'].value_counts()
        print(f"\nRows per source file:")
        print(f"  Total unique files: {len(file_counts)}")
        print(f"  Average rows per file: {file_counts.mean():.1f}")
        print(f"  Min rows per file: {file_counts.min()}")
        print(f"  Max rows per file: {file_counts.max()}")

        print(f"\nTop 10 files by row count:")
        # distinct loop names: do not shadow the `filename` parameter
        for rank, (source_name, count) in enumerate(file_counts.head(10).items(), start=1):
            print(f"  {rank}. {source_name}: {count} rows")

    # Show data quality summary
    print(f"\nData quality summary:")
    for col in ['Learning Outcomes', 'Deliverables', 'Assessments']:
        if col in combined_df.columns:
            # A column missing from some inputs is NaN there after concat;
            # treat NaN like an empty cell.
            non_empty_count = int(combined_df[col].fillna('').astype(str).ne('').sum())
            percentage = (non_empty_count / total_rows) * 100 if total_rows else 0.0
            print(f"  {col}: {non_empty_count}/{total_rows} ({percentage:.1f}%) non-empty")

    return str(output_path)

# Create combined dataset
if not all_dataframes:
    print("No dataframes available for combining")
else:
    # combined_file_path is only defined when there was something to combine
    combined_file_path = create_combined_dataset(
        all_dataframes, COMBINED_OUTPUT_DIR, "cleanbatch_syllabi_data.csv"
    )

    print(f"\nCombined dataset creation completed")
    print(f"File location: {combined_file_path}")

print("\nCombined dataset ready")

In [None]:
def generate_final_summary(combined_filename="cleanbatch_syllabi_data.csv"):
    """
    Print a summary of the whole run: inputs, processing results, outputs,
    failed files and the header mapping.

    Reads the notebook-level results (``processed_files``, ``failed_files``,
    ``all_dataframes``) and configuration constants; it does not modify them.

    Args:
        combined_filename: name of the combined CSV inside COMBINED_OUTPUT_DIR.
            The default is the name this notebook passes to
            ``create_combined_dataset``.
    """
    print("=" * 60)
    print("FINAL PROCESSING SUMMARY")
    print("=" * 60)

    # Input summary
    print(f"\nINPUT:")
    print(f"  Source directory: {INPUT_CSV_DIRECTORY}")
    csv_files = find_csv_files(INPUT_CSV_DIRECTORY)
    print(f"  Total CSV files found: {len(csv_files)}")

    # Processing summary
    print(f"\nPROCESSING RESULTS:")
    print(f"  Successfully processed: {len(processed_files)}")
    print(f"  Failed to process: {len(failed_files)}")
    # guard against division by zero when the input directory has no CSV files
    success_rate = len(processed_files) / len(csv_files) * 100 if csv_files else 0
    print(f"  Success rate: {success_rate:.1f}%")

    if all_dataframes:
        total_rows = sum(len(df) for df in all_dataframes)
        print(f"  Total rows extracted: {total_rows}")
        print(f"  Average rows per file: {total_rows/len(all_dataframes):.1f}")

    # Output summary
    print(f"\nOUTPUT:")
    print(f"  Individual cleaned files: {OUTPUT_DIRECTORY}")
    if all_dataframes:
        print(f"    Number of files: {len(all_dataframes)}")

    print(f"  Combined dataset: {COMBINED_OUTPUT_DIR}")

    # Check if combined file exists and get its info
    combined_file = Path(COMBINED_OUTPUT_DIR) / combined_filename
    if combined_file.exists():
        try:
            combined_df = pd.read_csv(combined_file)
            print(f"    Combined file rows: {len(combined_df)}")
            print(f"    Combined file columns: {list(combined_df.columns)}")
        except Exception as e:
            print(f"    Error reading combined file: {e}")
    else:
        print(f"    Combined file not found: {combined_file}")

    # Failed files details
    if failed_files:
        print(f"\nFAILED FILES:")
        for i, failed_file in enumerate(failed_files, 1):
            print(f"  {i}. {os.path.basename(failed_file)}")

    # Header mapping summary
    print(f"\nHEADER MAPPING USED:")
    for canonical, variations in HEADER_MAPPING.items():
        print(f"  {canonical}:")
        for variation in variations[:3]:  # Show first 3 variations
            print(f"    - {variation}")
        if len(variations) > 3:
            print(f"    - ... and {len(variations)-3} more variations")

    print("=" * 60)
    print("PROCESS COMPLETED SUCCESSFULLY")
    print("=" * 60)

def validate_output(combined_filename="cleanbatch_syllabi_data.csv"):
    """
    Print a validation report for the output files.

    Counts the "*_cleaned.csv" files in OUTPUT_DIRECTORY and, when the combined
    CSV exists in COMBINED_OUTPUT_DIR, reports its shape, whether the three
    text columns are present, and how many cells in each are non-empty.

    Args:
        combined_filename: name of the combined CSV inside COMBINED_OUTPUT_DIR.
            The default is the name this notebook passes to
            ``create_combined_dataset``.
    """
    print("\nVALIDATION:")
    print("-" * 30)

    # Check individual files
    individual_files = glob.glob(os.path.join(OUTPUT_DIRECTORY, "*_cleaned.csv"))
    print(f"Individual cleaned files: {len(individual_files)} found")

    # Check combined file
    combined_file = Path(COMBINED_OUTPUT_DIR) / combined_filename
    if combined_file.exists():
        print(f"Combined file: EXISTS")

        # Quick validation of combined file
        try:
            df_combined = pd.read_csv(combined_file)
            print(f"  Shape: {df_combined.shape}")

            # Check required columns
            required_cols = ['Learning Outcomes', 'Deliverables', 'Assessments']
            missing_cols = [col for col in required_cols if col not in df_combined.columns]

            if missing_cols:
                print(f"  Warning: Missing columns: {missing_cols}")
            else:
                print(f"  All required columns present: {required_cols}")

            # Check data completeness
            print(f"  Data completeness:")
            total_rows = len(df_combined)
            for col in required_cols:
                if col in df_combined.columns:
                    # Empty CSV cells are read back as NaN, so test notna()
                    # rather than comparing against the text 'nan'.
                    non_empty = df_combined[col].notna() & (df_combined[col].astype(str).str.len() > 0)
                    count = int(non_empty.sum())
                    percent = (count / total_rows) * 100 if total_rows else 0.0
                    print(f"    {col}: {count}/{total_rows} ({percent:.1f}%) non-empty")

        except Exception as e:
            print(f"  Error validating combined file: {e}")
    else:
        print(f"Combined file: NOT FOUND")

    print("-" * 30)
    print("Validation completed")

# Generate final summary
generate_final_summary()

# Validate output
validate_output()

# Closing banner with the output locations
banner_rule = "=" * 60
print("\n" + banner_rule)
print("ALL STEPS COMPLETED")
print(banner_rule)
print(f"\nYour cleaned data is ready:")
print(f"1. Individual files: {OUTPUT_DIRECTORY}")
print(f"2. Combined file: {COMBINED_OUTPUT_DIR}/cleanbatch_syllabi_data.csv")