In [None]:
%pip install fuzzywuzzy python-levenshtein pandas numpy

In [10]:
# Import Required Libraries
import pandas as pd
import numpy as np
import os
import glob
import re
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Configuration - Set your folder paths here
input_folder = 'C:\\REFACTOR\\docx_output'  # Where your cleaned CSV files are located
output_folder = 'C:\\REFACTOR\\clean_output'  # Where filtered results will be saved

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

print(f"Input folder: {input_folder}")
print(f"Output folder: {output_folder}")

# Load and explore CSV files
csv_pattern = os.path.join(input_folder, 'cleaned_extracted_data_*.csv')
# glob.glob makes no ordering guarantee (it depends on the file system).
# Sorting keeps the processing order - and therefore which file is used as
# the sample and which duplicate is kept "first" later on - reproducible
# between runs and machines.
csv_files = sorted(glob.glob(csv_pattern))

print(f"\nFound {len(csv_files)} CSV files to process:")
for i, file in enumerate(csv_files[:5], 1):  # Show first 5 files
    print(f"  {i}. {os.path.basename(file)}")
if len(csv_files) > 5:
    print(f"  ... and {len(csv_files) - 5} more files")

if not csv_files:
    print("No CSV files found! Please check the input folder path.")
else:
    print(f"\nReady to process {len(csv_files)} files")

In [None]:
# Define Target Columns for Extraction
# Each entry maps an internal key to the column name written to the output
# files ('output_name') and the header spellings used when searching the
# input files for a matching column ('keywords').
target_columns_config = dict(
    learning_activities=dict(
        output_name='Learning_Activities',
        keywords=['Learning_Activities', 'learning activities', 'activities', 'learning activity', 'activity'],
    ),
    learning_outcomes=dict(
        output_name='Learning_Outcomes_LO_Learner_and_Learning_Outcomes_LLO',
        keywords=['Learning_Outcomes_LO_Learner_and_Learning_Outcomes_LLO', 'learning outcomes', 'outcomes', 'learning outcome', 'LO', 'LLO'],
    ),
    assessment=dict(
        output_name='Assessment',
        keywords=['Assessment', 'assessment', 'assessments', 'evaluation', 'grade', 'grading'],
    ),
)

similarity_threshold = 60  # Minimum similarity score for fuzzy matching (0-100)

# Echo the configuration so the run log records what was searched for
print("Target Columns Configuration:")
for config in target_columns_config.values():
    keyword_list = ', '.join(config['keywords'])
    print(f"   {config['output_name']}:")
    print(f"     Keywords: {keyword_list}")
print(f"\nSimilarity threshold: {similarity_threshold}% (minimum match confidence)")

# Explore sample CSV structure (first discovered file only)
if csv_files:
    sample_path = csv_files[0]
    print(f"\nSample CSV Structure (from {os.path.basename(sample_path)}):")
    try:
        sample_df = pd.read_csv(sample_path)
        n_rows, n_cols = sample_df.shape
        print(f"Rows: {n_rows}")
        print(f"Columns: {n_cols}")
        print("Column names:")
        for position, column_name in enumerate(sample_df.columns, start=1):
            print(f"     {position:2d}. {column_name}")
    except Exception as e:
        # Best-effort preview: report the problem and carry on
        print(f"   Error reading sample file: {e}")

In [None]:
# Create Data Cleaning Functions
def calculate_similarity(text1, text2):
    """Score how alike two strings are on a 0-100 scale.

    Both inputs are converted to ``str``, lower-cased and stripped of every
    character that is not an ASCII letter or digit before comparison.

    Args:
        text1: First value to compare.
        text2: Second value to compare.

    Returns:
        0 when either normalised string is empty; otherwise the highest of
        four scores: ``difflib.SequenceMatcher`` ratio (scaled to 0-100),
        ``fuzz.ratio``, ``fuzz.partial_ratio`` and ``fuzz.token_sort_ratio``.
    """
    def _squash(text):
        # Lower-case, then keep only [a-zA-Z0-9]
        return re.sub(r'[^a-zA-Z0-9]', '', str(text).lower())

    left, right = _squash(text1), _squash(text2)

    # Nothing comparable is left after normalisation
    if not (left and right):
        return 0

    scores = (
        SequenceMatcher(None, left, right).ratio() * 100,
        fuzz.ratio(left, right),
        fuzz.partial_ratio(left, right),
        fuzz.token_sort_ratio(left, right),
    )

    # The most optimistic metric wins
    return max(scores)

def find_best_column_match(available_columns, target_config):
    """Pick the column whose header best matches a target's keywords.

    Each candidate header is stripped of surrounding whitespace and scored
    against every keyword with ``calculate_similarity``; the column's score
    is the best keyword score. Only scores at or above the module-level
    ``similarity_threshold`` are eligible, and ties keep the earliest column.

    Args:
        available_columns: Iterable of column labels (e.g. ``df.columns``).
        target_config: Dict with a ``'keywords'`` list to match against.

    Returns:
        ``(best_match, best_score)``. ``best_match`` is the column label
        exactly as it appears in ``available_columns`` (so it can be used
        directly to index the DataFrame), or ``None`` when nothing reaches
        the threshold, in which case ``best_score`` is 0.
    """
    keywords = target_config['keywords']
    best_match = None
    best_score = 0

    for col in available_columns:
        col_clean = str(col).strip()
        # Skip blank headers and generic placeholder names
        if not col_clean or col_clean.lower() in ['unnamed', 'column', 'nan']:
            continue

        # Best score of this header across all target keywords
        max_similarity = max(
            (calculate_similarity(col_clean, keyword) for keyword in keywords),
            default=0,
        )

        # Update best match if this column has higher similarity
        if max_similarity > best_score and max_similarity >= similarity_threshold:
            best_score = max_similarity
            # Return the original label, not the stripped copy: a header such
            # as ' Assessment ' must still be a valid key into the DataFrame.
            best_match = col

    return best_match, best_score

def clean_cell_data(value):
    """Normalise one cell value to a clean string.

    Args:
        value: Any scalar cell value (string, number, ``None``, NaN, ...).

    Returns:
        ``''`` for missing values (anything ``pd.isna`` reports), for empty
        or whitespace-only text, and for the textual placeholders ``'nan'``
        and ``'none'`` (case-insensitive, surrounding whitespace ignored).
        Otherwise the value as a string with leading/trailing whitespace
        removed and every internal whitespace run collapsed to one space.
    """
    if pd.isna(value):
        return ''

    # Convert to string, trim, and collapse internal whitespace runs
    cleaned = re.sub(r'\s+', ' ', str(value).strip())

    # Check placeholders after normalisation so that padded variants such as
    # ' NaN ' are treated as empty instead of leaking through as data.
    if cleaned.lower() in ('', 'nan', 'none'):
        return ''

    return cleaned

# Confirmation message shown once the helper definitions in this cell have run
print("Functions ready")

In [None]:
# Extract and Filter Individual CSV Files
def process_single_csv(csv_path):
    """Extract and clean the configured target columns from one CSV file.

    For every entry in ``target_columns_config`` the best matching column is
    located with ``find_best_column_match``; matched columns are cleaned
    cell by cell with ``clean_cell_data``.

    Args:
        csv_path: Path of the CSV file, read with ``pd.read_csv``.

    Returns:
        ``(extracted_data, matched_columns, detection_scores)`` where

        * ``extracted_data`` is a list with one dict per input row holding
          ``'File'`` (base file name), ``'Row'`` (index label + 1) and one
          key per configured ``output_name`` (``''`` when no column matched
          or the cell is empty);
        * ``matched_columns`` maps target key -> matched column label;
        * ``detection_scores`` maps target key -> match score.

        If anything raises, the error is printed and ``([], {}, {})`` is
        returned so a batch run can continue with the next file.
    """
    filename = os.path.basename(csv_path)
    print(f"\nProcessing: {filename}")

    try:
        # Load CSV
        df = pd.read_csv(csv_path)
        print(f"   Loaded: {len(df)} rows, {len(df.columns)} columns")

        # Find the best matching column for each configured target
        matched_columns = {}
        detection_scores = {}

        for target_type, config in target_columns_config.items():
            best_match, score = find_best_column_match(df.columns, config)
            if best_match:
                matched_columns[target_type] = best_match
                detection_scores[target_type] = score
                print(f"   {config['output_name']}: '{best_match}' (confidence: {score:.1f}%)")
            else:
                print(f"   {config['output_name']}: No suitable match found")

        # Clean column by column rather than via iterrows(): iterrows() turns
        # every row into a single-dtype Series, so in an all-numeric frame an
        # integer cell would be upcast and come out as '5.0' instead of '5'.
        row_count = len(df)
        cleaned_columns = {}
        for target_type, config in target_columns_config.items():
            if target_type in matched_columns:
                source = df[matched_columns[target_type]]
                cleaned_columns[config['output_name']] = source.map(clean_cell_data).tolist()
            else:
                # Empty if column not found
                cleaned_columns[config['output_name']] = [''] * row_count

        # Assemble one record per input row, keeping the original key order
        extracted_data = []
        for position, index in enumerate(df.index):
            row_data = {
                'File': filename,
                'Row': index + 1
            }
            for output_name, values in cleaned_columns.items():
                row_data[output_name] = values[position]
            extracted_data.append(row_data)

        # Report against the configured number of targets, not a literal 3
        target_count = len(target_columns_config)
        print(f"   Extracted {len(extracted_data)} rows with {len(matched_columns)}/{target_count} target columns")
        return extracted_data, matched_columns, detection_scores

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the batch
        print(f"   Error processing: {e}")
        return [], {}, {}

# Accumulators filled while the individual CSV files are processed:
# every extracted row, plus running counters and per-file detection scores.
all_filtered_data = list()
processing_stats = dict(
    files_processed=0,
    files_with_all_columns=0,
    files_with_partial_columns=0,
    total_rows_extracted=0,
    column_detection_details=dict(),
)

print("Starting individual CSV processing...")

In [None]:
# Process All CSV Files and Save Individual Filtered Files

# Start from empty accumulators every time this cell runs. Without the reset,
# re-running the cell would append the same rows and counts a second time and
# silently duplicate everything written to the combined outputs.
all_filtered_data = []
processing_stats = {
    'files_processed': 0,
    'files_with_all_columns': 0,
    'files_with_partial_columns': 0,
    'total_rows_extracted': 0,
    'column_detection_details': {}
}

source_prefix = 'cleaned_extracted_data_'
# Number of targets a file must match to count as "all columns found"
expected_column_count = len(target_columns_config)

for csv_file in csv_files:
    # Process the CSV file
    extracted_data, matched_columns, detection_scores = process_single_csv(csv_file)

    if extracted_data:
        # Store data for combined processing
        all_filtered_data.extend(extracted_data)

        # Save individual filtered CSV
        filename = os.path.basename(csv_file)
        base_name = os.path.splitext(filename)[0]

        # Remove the leading 'cleaned_extracted_data_' prefix if present.
        # Slicing removes only the prefix; str.replace would also delete any
        # later occurrence of the same text inside the name.
        if base_name.startswith(source_prefix):
            base_name = base_name[len(source_prefix):]

        # Create filtered filename
        filtered_filename = f"filtered_extracted_data_{base_name}.csv"
        output_path = os.path.join(output_folder, filtered_filename)

        try:
            # Save filtered data
            df_filtered = pd.DataFrame(extracted_data)
            df_filtered.to_csv(output_path, index=False)
            print(f"   Saved: {filtered_filename}")
        except Exception as e:
            # Best-effort: report the failed write and continue with the batch
            print(f"   Error saving: {e}")

        # Update statistics
        columns_found = len(matched_columns)
        if columns_found == expected_column_count:
            processing_stats['files_with_all_columns'] += 1
        elif columns_found > 0:
            processing_stats['files_with_partial_columns'] += 1

        processing_stats['total_rows_extracted'] += len(extracted_data)
        processing_stats['column_detection_details'][filename] = detection_scores

    # Counts every attempted file, including ones that yielded no rows
    processing_stats['files_processed'] += 1

print("Individual CSV processing completed!")
print(f"Files processed: {processing_stats['files_processed']}")
print(f"Total rows extracted: {processing_stats['total_rows_extracted']}")

In [None]:
# Combine All Filtered Data
# Builds df_combined from every extracted row and writes it to the output
# folder; df_combined is an empty DataFrame when there is nothing to combine
# or when building/saving fails.
print("\nCreating combined filtered dataset...")

if not all_filtered_data:
    print("No data to combine!")
    df_combined = pd.DataFrame()
else:
    try:
        # One DataFrame holding the rows of all processed files
        df_combined = pd.DataFrame(all_filtered_data)

        # Write the combined CSV next to the per-file outputs
        combined_path = os.path.join(output_folder, 'combined_filtered_extracted_data.csv')
        df_combined.to_csv(combined_path, index=False)

        total_rows = df_combined.shape[0]
        column_names = df_combined.columns.tolist()
        print("Saved: combined_filtered_extracted_data.csv")
        print(f"Total rows: {total_rows}")
        print(f"Columns: {column_names}")

    except Exception as e:
        # Fall back to an empty frame so later cells can test .empty safely
        print(f"Error creating combined file: {e}")
        df_combined = pd.DataFrame()

In [None]:
# Create Column-Specific Combined Files
import time
execution_id = int(time.time() * 1000) % 10000  # Simple execution tracker
print(f"Creating column-specific combined files... (exec: {execution_id})")


def _save_unique_column(df, column, csv_name, folder):
    """Write the distinct non-empty values of one column to its own CSV.

    Args:
        df: Combined DataFrame containing 'File', 'Row' and ``column``.
        column: Name of the column to export.
        csv_name: File name of the CSV to create inside ``folder``.
        folder: Directory the CSV is written to.

    Returns:
        The exported DataFrame ('File', 'Row', ``column``) with duplicates
        of ``column`` removed (first occurrence kept), or ``None`` when the
        column has no non-empty value, in which case no file is written.
    """
    # Missing values become '', everything else is stringified and trimmed
    text = df[column].map(lambda cell: str(cell).strip() if pd.notna(cell) else '')
    has_text = text != ''
    if not has_text.any():
        return None

    entries = pd.DataFrame({
        'File': df.loc[has_text, 'File'],
        'Row': df.loc[has_text, 'Row'],
        column: text[has_text],
    }).reset_index(drop=True)

    # Remove duplicates based on content (keep first occurrence)
    entries = entries.drop_duplicates(subset=[column], keep='first')
    entries.to_csv(os.path.join(folder, csv_name), index=False)
    print(f"Saved: {csv_name} ({len(entries)} entries)")
    return entries


if not df_combined.empty:
    # 1. All Learning Activities
    df_activities = _save_unique_column(
        df_combined, 'Learning_Activities', 'all_learning_activities.csv', output_folder
    )

    # 2. All Learning Outcomes
    df_outcomes = _save_unique_column(
        df_combined,
        'Learning_Outcomes_LO_Learner_and_Learning_Outcomes_LLO',
        'all_learning_outcomes.csv',
        output_folder,
    )

    # 3. All Assessments
    df_assessments = _save_unique_column(
        df_combined, 'Assessment', 'all_assessment.csv', output_folder
    )

else:
    print("No combined data available for column-specific files")