In [None]:
# Step 1: Import required packages and configure logging
# Run this cell first

import pandas as pd
import glob
import os
import re
from pathlib import Path
import logging
from typing import List, Dict, Optional, Tuple

# Configure logging: INFO level, "timestamp - LEVEL - message" line format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

print("All packages imported successfully")

In [None]:
# Step 2: Configuration and Settings
# Define your file paths and header mappings

# CHANGE THESE PATHS TO MATCH YOUR SETUP
INPUT_CSV_DIRECTORY = "/content/csv_outputs"          # Where your CSV files from DOCX extraction are
OUTPUT_DIRECTORY = "/content/cleaned_csv"             # Where you want cleaned files to go
COMBINED_OUTPUT_DIR = "/content/combined_csv"         # Where you want the final combined file

# Canonical column name -> header spellings seen in the extracted tables.
# Line breaks and non-breaking spaces are written as real escape sequences
# (\n, \xa0). With a doubled backslash the string would contain a literal
# backslash followed by "n"/"xa0"; normalize_header() keeps those letters, so
# the key becomes e.g. "learningnoutcomes", which never equals a real header
# and can falsely match short headers such as "No." via substring matching.
HEADER_MAPPING = {
    "Learning Outcomes": [
        "Learning Outcomes", "Learning Outcome", "Learning\nOutcomes",
        "Learning Outcomes (LO) / Learner and Learning Outcomes (LLO)",
        "Learning Outcomes\n(At the end of the session, students are expected to:)",
        "Learning Outcomes\n(At the end of the session, students are expected to :)",
        "Learning\nOutcomes", "Learning \nOutcomes"
    ],
    "Deliverables": [
        "Deliverables Outcomes", "Deliverables/Outcomes", "Deliverables",
        "Deliverables/\nOutcomes", "Deliverables\n/ Outcomes", "Deliverables/ Outcomes",
        "Deliverables/\xa0 Outcomes", "Deliverables/\xa0\nOutcomes",
        "Deliverable / Outcomes", "Deliverables\n/ Outcomes/Rubrics"
    ],
    "Assessments": [
        "Assessments", "Assessment", "Assessment Task", "Assessment\n/ Output"
    ]
}

# Make sure both destination folders exist before anything is written to them.
for _target_dir in (OUTPUT_DIRECTORY, COMBINED_OUTPUT_DIR):
    os.makedirs(_target_dir, exist_ok=True)

# Echo the active configuration so the run log records which paths were used.
print(f"Input directory: {INPUT_CSV_DIRECTORY}")
print(f"Output directory: {OUTPUT_DIRECTORY}")
print(f"Combined output directory: {COMBINED_OUTPUT_DIR}")
print("Configuration completed successfully")

In [None]:
# Step 3: Define helper functions
# These functions will be used in the main processing steps

def normalize_header(text):
    """
    Reduce a header string to a comparison key.

    The text is lowercased and every character other than ASCII ``a``-``z``
    and ``0``-``9`` is dropped (spaces, slashes, punctuation, line breaks,
    accented letters, ...).
    Intended to mirror the DOCX extraction script, which is not part of this
    notebook.
    """
    lowered = text.lower()
    kept = [ch for ch in lowered if ('a' <= ch <= 'z') or ('0' <= ch <= '9')]
    return ''.join(kept)

def find_csv_files(directory):
    """
    Return the paths of all ``*.csv`` files directly inside ``directory``.

    The directory part is glob-escaped so that folder names containing
    ``[``, ``]``, ``*`` or ``?`` are taken literally instead of being treated
    as wildcard syntax. The result is sorted so every run visits the files in
    the same order.

    Args:
        directory: Folder to search (not recursive).

    Returns:
        Sorted list of matching path strings; empty if the folder does not
        exist or holds no CSV files.
    """
    csv_pattern = os.path.join(glob.escape(os.fspath(directory)), "*.csv")
    return sorted(glob.glob(csv_pattern))

def match_headers_to_canonical(df_headers, header_mapping):
    """
    Match DataFrame headers to canonical headers using substring matching.

    Headers and variations are compared through ``normalize_header``. A column
    matches a variation when either normalized string contains the other.
    For each canonical header, variations are tried in order and the first
    matching column wins.

    Safeguards:
      * Headers or variations that normalize to an empty string are ignored;
        an empty string is a substring of everything and would match every
        canonical header.
      * A column already assigned to one canonical header is not assigned to
        another, so the returned indices are always distinct.

    Args:
        df_headers: Column labels of the table (converted with ``str``).
        header_mapping: Dict of canonical header -> list of known spellings.

    Returns:
        Tuple ``(matched_canonical_headers, col_indices, final_headers)`` of
        equal-length lists. ``col_indices[i]`` is the position in
        ``df_headers`` of the column matched to ``final_headers[i]``;
        ``final_headers`` holds the same names as
        ``matched_canonical_headers``.
    """
    matched_canonical_headers = []
    col_indices = []
    final_headers = []

    # str() keeps non-string column labels from crashing the normalizer.
    normalized_df_headers = [normalize_header(str(h)) for h in df_headers]
    claimed_indices = set()

    for canonical_header, possible_variations in header_mapping.items():
        found_variation_index = -1

        for variation in possible_variations:
            normalized_variation = normalize_header(variation)
            if not normalized_variation:
                continue

            for j, normalized_table_header in enumerate(normalized_df_headers):
                if j in claimed_indices or not normalized_table_header:
                    continue
                if (normalized_variation in normalized_table_header
                        or normalized_table_header in normalized_variation):
                    found_variation_index = j
                    break

            if found_variation_index != -1:
                break

        if found_variation_index != -1:
            claimed_indices.add(found_variation_index)
            col_indices.append(found_variation_index)
            final_headers.append(canonical_header)
            matched_canonical_headers.append(canonical_header)

    return matched_canonical_headers, col_indices, final_headers

def clean_text_data(df):
    """
    Clean text columns, removing common DOCX extraction artifacts.

    Works on a copy; the input frame is not modified. Only object/string
    columns are touched. For each of them:
      * missing values become empty strings,
      * line breaks and non-breaking spaces become a single space, both as
        real characters and as the literal two-/four-character sequences
        backslash-n and backslash-xa0,
      * runs of spaces collapse to one space,
      * leading/trailing whitespace is stripped,
      * cells that are exactly 'nan' or 'NaN' become empty strings.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the cleaned text columns.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        column = df_clean[col]
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue

        # Blank out missing cells first so they never turn into the text 'nan'.
        text = column.astype(object).where(column.notna(), '').astype(str)

        # Literal escape sequences left behind by an upstream repr()/export.
        text = text.str.replace('\\n', ' ', regex=False)
        text = text.str.replace('\\xa0', ' ', regex=False)
        # Real line breaks and non-breaking spaces inside the cell.
        text = text.str.replace(r'[\r\n\xa0]', ' ', regex=True)
        # Multiple spaces to single
        text = text.str.replace(' {2,}', ' ', regex=True)

        # Strip last: the replacements above can leave spaces at the edges.
        text = text.str.strip()

        # Kept for frames whose NaN values were stringified before reaching us.
        text = text.replace(['nan', 'NaN'], '')

        df_clean[col] = text

    return df_clean

print("Helper functions defined successfully")

In [None]:
# Step 4: Analyze your CSV files (Optional but recommended)
# This helps you understand what headers are in your files

def analyze_csv_headers(input_directory):
    """
    Collect the header row of every CSV file in ``input_directory``.

    Only the header is read (``nrows=0``). Decoding is attempted as UTF-8,
    then cp1252, then latin-1, so a file that is not valid UTF-8 is still
    reported with its headers rather than as an error.

    Args:
        input_directory: Folder passed to ``find_csv_files``.

    Returns:
        Dict of file name -> list of column names, or -> an ``"Error: ..."``
        string when the file could not be read.
    """
    csv_files = find_csv_files(input_directory)
    header_analysis = {}

    print(f"Found {len(csv_files)} CSV files to analyze")
    print("Analyzing headers...")

    for file_path in csv_files:
        filename = os.path.basename(file_path)
        try:
            last_decode_error = None
            for encoding in ('utf-8', 'cp1252', 'latin-1'):
                try:
                    df = pd.read_csv(file_path, nrows=0, encoding=encoding)  # Read only headers
                except UnicodeDecodeError as decode_error:
                    last_decode_error = decode_error
                    continue
                header_analysis[filename] = list(df.columns)
                break
            else:
                # Defensive: latin-1 maps every byte, so this is not expected.
                raise last_decode_error
        except Exception as e:
            # Best effort: record the problem and keep analyzing other files.
            header_analysis[filename] = f"Error: {str(e)}"

    return header_analysis

# Run the analysis
print("Starting header analysis...")
headers = analyze_csv_headers(INPUT_CSV_DIRECTORY)

# Show sample of headers from first 5 files
print(f"\nTotal files found: {len(headers)}")
print("\nSample headers from first 5 files:")
print("-" * 50)

for filename, file_headers in list(headers.items())[:5]:
    if isinstance(file_headers, list):
        print(f"{filename}:")
        print(f"  Headers: {file_headers}")
    else:
        # Files that could not be read carry an "Error: ..." string instead.
        print(f"{filename}: {file_headers}")
    print()

# Check which files have the target headers.
# Uses match_headers_to_canonical() - the same matcher the processing step
# uses - so these counts cannot drift from what is extracted later.
print("Checking for target headers in all files...")
target_header_count = {canonical_header: 0 for canonical_header in HEADER_MAPPING}

for filename, file_headers in headers.items():
    if not isinstance(file_headers, list):
        continue
    matched_headers, _, _ = match_headers_to_canonical(file_headers, HEADER_MAPPING)
    for canonical_header in matched_headers:
        target_header_count[canonical_header] += 1

print("\nTarget header distribution:")
for header, file_count in target_header_count.items():
    print(f"  {header}: found in {file_count}/{len(headers)} files")

print("\nHeader analysis completed")

In [None]:
# Step 5: Process individual CSV files and extract target columns

def process_single_csv(file_path, header_mapping, add_source_info=True):
    """
    Process a single CSV file and extract the canonical target columns.

    Steps:
      1. Read the file, trying UTF-8, then cp1252, then latin-1. latin-1
         decodes any byte sequence, so it must come last or cp1252 would
         never be tried.
      2. Map the file's headers to canonical names with
         ``match_headers_to_canonical``; files with fewer than two matches
         are skipped.
      3. Keep only the matched columns, renamed to their canonical names,
         and add any missing canonical column as empty.
      4. Convert every kept column to object dtype with missing values as
         ``""`` so that entirely empty or numeric columns can go through the
         string-based cleaning and filtering.
      5. Clean the text with ``clean_text_data`` and drop rows where every
         canonical column is empty.
      6. Optionally add ``Source_File`` and ``Course_Code`` (leading run of
         capital letters and digits in the file name).

    Args:
        file_path: Path of the CSV file.
        header_mapping: Dict of canonical header -> list of known spellings;
            its keys define the output columns and their order.
        add_source_info: Whether to append the two provenance columns.

    Returns:
        The cleaned DataFrame, or ``None`` when the file is empty, has too
        few relevant headers, or any error occurs (the error is printed).
    """
    try:
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")

        # Read CSV with encoding handling
        last_decode_error = None
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                df = pd.read_csv(file_path, encoding=encoding)
            except UnicodeDecodeError as decode_error:
                last_decode_error = decode_error
                continue
            if encoding != 'utf-8':
                print(f"  Used {encoding} encoding")
            break
        else:
            # Defensive: latin-1 maps every byte, so this is not expected.
            raise last_decode_error

        if df.empty:
            print(f"  Warning: Empty file")
            return None

        print(f"  Original shape: {df.shape}")
        print(f"  Original headers: {list(df.columns)}")

        # Match headers to canonical ones
        df_headers = list(df.columns)
        matched_canonical_headers, col_indices, final_headers = match_headers_to_canonical(df_headers, header_mapping)

        # Check if we have at least 2 matched headers
        if len(matched_canonical_headers) < 2:
            print(f"  Warning: Only found {len(matched_canonical_headers)} relevant headers: {matched_canonical_headers}")
            return None

        print(f"  Matched canonical headers: {matched_canonical_headers}")

        # Extract the relevant columns and rename them to canonical names
        selected_columns = [df.columns[i] for i in col_indices]
        column_rename_map = dict(zip(selected_columns, final_headers))
        extracted_df = df[selected_columns].rename(columns=column_rename_map)

        # Ensure every canonical column exists, in mapping order
        all_canonical_headers = list(header_mapping)
        for canonical_header in all_canonical_headers:
            if canonical_header not in extracted_df.columns:
                extracted_df[canonical_header] = ""
        extracted_df = extracted_df[all_canonical_headers]

        # An all-empty column is read as float64 and a numeric one as
        # int/float; neither supports string operations. Normalize to object
        # dtype with "" for missing values before cleaning.
        extracted_df = extracted_df.astype(object).where(extracted_df.notna(), "")

        # Clean the data
        cleaned_df = clean_text_data(extracted_df)

        # Remove rows where all target columns are empty
        has_content = (cleaned_df[all_canonical_headers] != "").any(axis=1)
        # .copy() so the column assignments below act on an independent frame.
        cleaned_df = cleaned_df.loc[has_content].copy()

        # Add source information if requested
        if add_source_info:
            cleaned_df['Source_File'] = filename

            # Extract course code from filename
            course_match = re.match(r'^([A-Z]+[0-9]*)', filename)
            cleaned_df['Course_Code'] = course_match.group(1) if course_match else ''

        # Reset index
        cleaned_df = cleaned_df.reset_index(drop=True)

        print(f"  Final shape: {cleaned_df.shape}")
        print(f"  Successfully processed")

        return cleaned_df

    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"  Error processing {os.path.basename(file_path)}: {str(e)}")
        return None

# Run every discovered CSV through process_single_csv and bucket the outcome.
csv_files = find_csv_files(INPUT_CSV_DIRECTORY)
print(f"Starting to process {len(csv_files)} CSV files...")
print("=" * 60)

processed_files, failed_files, all_dataframes = [], [], []

for csv_path in csv_files:
    result_df = process_single_csv(csv_path, HEADER_MAPPING, add_source_info=True)
    has_rows = result_df is not None and not result_df.empty

    if not has_rows:
        # Covers both skipped files (None) and files with no usable rows.
        failed_files.append(csv_path)
        print(f"  Failed to process")
    else:
        processed_files.append(csv_path)
        all_dataframes.append(result_df)
        print(f"  Success: {len(result_df)} rows extracted")

    print()

print("=" * 60)
print(f"Processing Summary:")
print(f"  Total files: {len(csv_files)}")
print(f"  Successfully processed: {len(processed_files)}")
print(f"  Failed: {len(failed_files)}")
print(f"  Total rows extracted: {sum(map(len, all_dataframes))}")

if failed_files:
    print(f"\nFailed files:")
    for unprocessed_path in failed_files:
        print(f"  - {os.path.basename(unprocessed_path)}")

print("\nIndividual processing completed")

In [None]:
# Step 6: Save individual cleaned CSV files

def save_individual_files(dataframes_list, original_files_list, output_directory):
    """
    Write each cleaned DataFrame to ``<original stem>_cleaned.csv``.

    Frames and source paths are paired positionally. A failure on one file is
    printed and does not stop the remaining files from being written.

    Returns:
        List of the output paths (as strings) that were written.
    """
    written_paths = []

    print(f"Saving {len(dataframes_list)} cleaned CSV files...")
    print("-" * 40)

    for frame, source_path in zip(dataframes_list, original_files_list):
        try:
            # Derive the target name from the source file's stem.
            out_name = Path(source_path).stem + "_cleaned.csv"
            destination = Path(output_directory) / out_name

            frame.to_csv(destination, index=False)
            written_paths.append(str(destination))

            print(f"Saved: {out_name} ({len(frame)} rows)")

        except Exception as e:
            print(f"Error saving {os.path.basename(source_path)}: {str(e)}")

    return written_paths

# Write one cleaned CSV per successfully processed input file.
if not all_dataframes:
    print("No dataframes to save - check previous steps for errors")
else:
    saved_individual_files = save_individual_files(all_dataframes, processed_files, OUTPUT_DIRECTORY)

    print("-" * 40)
    print(f"Individual file saving completed")
    print(f"Saved {len(saved_individual_files)} files to: {OUTPUT_DIRECTORY}")

    # Preview at most five of the written files.
    print(f"\nFirst 5 saved files:")
    for position, saved_path in enumerate(saved_individual_files[:5], start=1):
        print(f"  {position}. {os.path.basename(saved_path)}")

    remaining = len(saved_individual_files) - 5
    if remaining > 0:
        print(f"  ... and {remaining} more files")

print("\nIndividual file saving completed")

In [None]:
# Step 7: Create combined dataset (like your original second cell)

def create_combined_dataset(dataframes_list, output_directory, filename="combined_cleaned_syllabi_data.csv"):
    """
    Combine all cleaned DataFrames into a single CSV file.

    The frames are concatenated and, when a ``Source_File`` column exists,
    sorted by it with a stable sort so rows from the same file keep their
    original relative order. The output directory is created if missing.
    A per-file row summary and a per-column non-empty summary are printed.

    Args:
        dataframes_list: Cleaned DataFrames to combine.
        output_directory: Folder for the combined file.
        filename: Name of the combined CSV file.

    Returns:
        Path of the written file as a string, or ``None`` when
        ``dataframes_list`` is empty.
    """
    if not dataframes_list:
        print("No dataframes to combine")
        return None

    print(f"Creating combined dataset from {len(dataframes_list)} files...")

    # Combine all dataframes
    combined_df = pd.concat(dataframes_list, ignore_index=True)

    # Sort by source file for better organization.
    # mergesort is stable; the default sort may reorder rows within a file.
    if 'Source_File' in combined_df.columns:
        combined_df = combined_df.sort_values('Source_File', kind='mergesort').reset_index(drop=True)
        print("Sorted by source file")

    # Create output path
    output_dir = Path(output_directory)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / filename

    # Save combined file
    combined_df.to_csv(output_path, index=False)

    total_rows = len(combined_df)
    print(f"Combined dataset saved to: {output_path}")
    print(f"Total rows in combined dataset: {total_rows}")
    print(f"Columns: {list(combined_df.columns)}")

    # Show summary by source file
    if 'Source_File' in combined_df.columns:
        file_counts = combined_df['Source_File'].value_counts()
        print(f"\nRows per source file:")
        print(f"  Total unique files: {len(file_counts)}")
        print(f"  Average rows per file: {file_counts.mean():.1f}")
        print(f"  Min rows per file: {file_counts.min()}")
        print(f"  Max rows per file: {file_counts.max()}")

        print(f"\nTop 10 files by row count:")
        # Distinct loop names so the ``filename`` parameter is not shadowed.
        for rank, (source_name, row_count) in enumerate(file_counts.head(10).items(), start=1):
            print(f"  {rank}. {source_name}: {row_count} rows")

    # Show data quality summary
    print(f"\nData quality summary:")
    for col in ['Learning Outcomes', 'Deliverables', 'Assessments']:
        if col in combined_df.columns:
            values = combined_df[col]
            # Missing cells count as empty; non-string cells are stringified.
            as_text = values.astype(object).where(values.notna(), '').astype(str)
            non_empty_count = int((as_text.str.len() > 0).sum())
            percentage = (non_empty_count / total_rows) * 100 if total_rows else 0.0
            print(f"  {col}: {non_empty_count}/{total_rows} ({percentage:.1f}%) non-empty")

    return str(output_path)

# Merge every cleaned frame into the single combined CSV.
if not all_dataframes:
    print("No dataframes available for combining")
else:
    combined_file_path = create_combined_dataset(
        dataframes_list=all_dataframes,
        output_directory=COMBINED_OUTPUT_DIR,
        filename="combined_cleaned_syllabi_data.csv",
    )

    print(f"\nCombined dataset creation completed")
    print(f"File location: {combined_file_path}")

print("\nCombined dataset ready")

In [None]:
# Step 8: Final Summary and Validation

def generate_final_summary():
    """
    Print a summary of the whole run.

    Reads the notebook-level names ``INPUT_CSV_DIRECTORY``,
    ``OUTPUT_DIRECTORY``, ``COMBINED_OUTPUT_DIR``, ``HEADER_MAPPING``,
    ``processed_files``, ``failed_files`` and ``all_dataframes``, so the
    processing cells must have been run first. Returns nothing.
    """

    print("=" * 60)
    print("FINAL PROCESSING SUMMARY")
    print("=" * 60)

    # Input summary
    print(f"\nINPUT:")
    print(f"  Source directory: {INPUT_CSV_DIRECTORY}")
    csv_files = find_csv_files(INPUT_CSV_DIRECTORY)
    print(f"  Total CSV files found: {len(csv_files)}")

    # Processing summary
    print(f"\nPROCESSING RESULTS:")
    print(f"  Successfully processed: {len(processed_files)}")
    print(f"  Failed to process: {len(failed_files)}")
    if csv_files:
        print(f"  Success rate: {(len(processed_files)/len(csv_files)*100):.1f}%")
    else:
        # Avoid dividing by zero when the input directory holds no CSV files.
        print("  Success rate: N/A (no CSV files found)")

    if all_dataframes:
        total_rows = sum(len(df) for df in all_dataframes)
        print(f"  Total rows extracted: {total_rows}")
        print(f"  Average rows per file: {total_rows/len(all_dataframes):.1f}")

    # Output summary
    print(f"\nOUTPUT:")
    print(f"  Individual cleaned files: {OUTPUT_DIRECTORY}")
    if all_dataframes:
        print(f"    Number of files: {len(all_dataframes)}")

    print(f"  Combined dataset: {COMBINED_OUTPUT_DIR}")

    # Check if combined file exists and get its info
    combined_file = Path(COMBINED_OUTPUT_DIR) / "combined_cleaned_syllabi_data.csv"
    if combined_file.exists():
        try:
            combined_df = pd.read_csv(combined_file)
            print(f"    Combined file rows: {len(combined_df)}")
            print(f"    Combined file columns: {list(combined_df.columns)}")
        except Exception as e:
            print(f"    Error reading combined file: {e}")

    # Failed files details
    if failed_files:
        print(f"\nFAILED FILES:")
        for i, failed_file in enumerate(failed_files, 1):
            print(f"  {i}. {os.path.basename(failed_file)}")

    # Header mapping summary
    print(f"\nHEADER MAPPING USED:")
    for canonical, variations in HEADER_MAPPING.items():
        print(f"  {canonical}:")
        for variation in variations[:3]:  # Show first 3 variations
            print(f"    - {variation}")
        if len(variations) > 3:
            print(f"    - ... and {len(variations)-3} more variations")

    print("=" * 60)
    print("PROCESS COMPLETED SUCCESSFULLY")
    print("=" * 60)

def validate_output():
    """
    Validate the output files and print the findings.

    Counts the ``*_cleaned.csv`` files in ``OUTPUT_DIRECTORY`` and, if the
    combined file exists in ``COMBINED_OUTPUT_DIR``, reports its shape,
    whether the required columns are present, and how many cells per
    required column are non-empty.

    The combined file is read with ``dtype=str`` and
    ``keep_default_na=False`` so empty cells come back as ``""`` rather than
    NaN. Otherwise every empty cell would stringify to ``'nan'`` and be
    counted as filled. Returns nothing.
    """

    print("\nVALIDATION:")
    print("-" * 30)

    # Check individual files
    individual_files = glob.glob(os.path.join(OUTPUT_DIRECTORY, "*_cleaned.csv"))
    print(f"Individual cleaned files: {len(individual_files)} found")

    # Check combined file
    combined_file = Path(COMBINED_OUTPUT_DIR) / "combined_cleaned_syllabi_data.csv"
    if combined_file.exists():
        print(f"Combined file: EXISTS")

        # Quick validation of combined file
        try:
            df_combined = pd.read_csv(combined_file, dtype=str, keep_default_na=False)
            print(f"  Shape: {df_combined.shape}")

            # Check required columns
            required_cols = ['Learning Outcomes', 'Deliverables', 'Assessments']
            missing_cols = [col for col in required_cols if col not in df_combined.columns]

            if missing_cols:
                print(f"  Warning: Missing columns: {missing_cols}")
            else:
                print(f"  All required columns present: {required_cols}")

            # Check data completeness
            total_rows = len(df_combined)
            print(f"  Data completeness:")
            for col in required_cols:
                if col in df_combined.columns:
                    text = df_combined[col].str.strip()
                    # A literal 'nan'/'NaN' cell is a stringified missing value.
                    non_empty = (text.str.len() > 0) & ~text.isin(['nan', 'NaN'])
                    count = int(non_empty.sum())
                    percent = (count / total_rows) * 100 if total_rows else 0.0
                    print(f"    {col}: {count}/{total_rows} ({percent:.1f}%) non-empty")

        except Exception as e:
            print(f"  Error validating combined file: {e}")
    else:
        print(f"Combined file: NOT FOUND")

    print("-" * 30)
    print("Validation completed")

# Print the run summary, then check the files that were written.
generate_final_summary()
validate_output()

_banner_rule = "=" * 60
print("\n" + _banner_rule)
print("ALL STEPS COMPLETED")
print(_banner_rule)

# Point the reader at the two output locations.
print(f"\nYour cleaned data is ready:")
print(f"1. Individual files: {OUTPUT_DIRECTORY}")
print(f"2. Combined file: {COMBINED_OUTPUT_DIR}/combined_cleaned_syllabi_data.csv")