# In [None]:
import os
import re
import uuid
import pandas as pd
import shutil
from datetime import datetime

class FileAnonymizer:
    """Copy .ndax/.dat files under numeric names and mask operator names.

    ``.ndax`` files are copied unchanged under a new name; ``.dat`` files are
    additionally rewritten so that every ``Operator:<TAB><name>`` field carries
    a random placeholder instead of the original name. The original-to-new
    filename mapping and the operator mapping are kept in memory and written
    out by :meth:`save_mapping_files`.
    """

    # First number handed out for each file type. Kept as class constants so
    # the summary can derive "files processed" without repeating the literals.
    NDAX_START = 1001
    DAT_START = 2001

    # Group 1 is the field label (kept verbatim), group 2 the operator name.
    # Only ASCII letters are treated as part of the name.
    _OPERATOR_PATTERN = re.compile(r"(Operator:\t)([a-zA-Z]+)")

    def __init__(self, input_folder, output_folder):
        """
        Initialize the FileAnonymizer with paths and counters.

        Args:
            input_folder (str): Path to folder containing original files
            output_folder (str): Path where anonymized files will be saved
        """
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.ndax_counter = self.NDAX_START  # Next number for .ndax files
        self.dat_counter = self.DAT_START    # Next number for .dat files
        self.mapping_data = []               # One dict per processed file
        self.operator_mapping = {}           # original name -> anonymized id

        # Create output directory if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

    def anonymize_operator_in_content(self, content):
        """
        Anonymize operator names in file content.

        Every ``Operator:<TAB><letters>`` occurrence is rewritten in a single
        pass, replacing exactly the matched name. Names seen for the first
        time get a new random id that is remembered in
        ``self.operator_mapping`` so later occurrences (in this or any other
        file) reuse it.

        Args:
            content (str): Original file content

        Returns:
            str: Content with anonymized operator names
        """
        def _substitute(match):
            operator = match.group(2)
            anon_id = self.operator_mapping.get(operator)
            if anon_id is None:
                anon_id = f"Operator_{uuid.uuid4().hex[:8]}"
                self.operator_mapping[operator] = anon_id
            return f"{match.group(1)}{anon_id}"

        # A single regex pass replaces whole matched names only. Replacing
        # each known operator with str.replace would let a short name ("Al")
        # clobber the start of a longer one ("Alice") and leak its tail.
        return self._OPERATOR_PATTERN.sub(_substitute, content)

    def process_file(self, filename):
        """
        Process a single file for anonymization.

        Files whose extension is neither ``.ndax`` nor ``.dat`` are skipped
        silently. For handled files a record is appended to
        ``self.mapping_data`` and the matching counter is advanced, both only
        after the output file has been written.

        Args:
            filename (str): Name of the file to process
        """
        file_path = os.path.join(self.input_folder, filename)
        file_extension = os.path.splitext(filename)[1].lower()

        if file_extension == '.ndax':
            new_filename = f"{self.ndax_counter}{file_extension}"
            # Simple copy for .ndax files (copy2 also tries to keep metadata)
            shutil.copy2(file_path, os.path.join(self.output_folder, new_filename))
            self.ndax_counter += 1

        elif file_extension == '.dat':
            new_filename = f"{self.dat_counter}{file_extension}"

            # Work on raw bytes so nothing but the operator field changes:
            # latin-1 maps every byte to one code point and back, so the file
            # never fails to decode and line endings are left untouched.
            with open(file_path, 'rb') as f:
                content = f.read().decode('latin-1')

            anonymized_content = self.anonymize_operator_in_content(content)

            with open(os.path.join(self.output_folder, new_filename), 'wb') as f:
                f.write(anonymized_content.encode('latin-1'))
            self.dat_counter += 1

        else:
            return  # Skip other file types

        # Store mapping information
        self.mapping_data.append({
            'Original_Filename': filename,
            'Anonymized_Filename': new_filename,
            'Date_Anonymized': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })

    def save_mapping_files(self):
        """Save both the file mapping and operator mapping to files.

        Writes ``anonymization_key.xlsx`` (always) and ``operator_key.txt``
        (only when at least one operator was anonymized) into the output
        folder.
        """
        # NOTE(review): both key files land in the same folder as the
        # anonymized data; move them before sharing that folder.
        key_file_path = os.path.join(self.output_folder, 'anonymization_key.xlsx')
        mapping_df = pd.DataFrame(self.mapping_data)

        # Write the file mapping to Excel using the openpyxl engine
        with pd.ExcelWriter(key_file_path, engine='openpyxl') as writer:
            mapping_df.to_excel(writer, index=False, sheet_name='File Mapping')

        # Save operator mapping if any exists
        if self.operator_mapping:
            operator_key_path = os.path.join(self.output_folder, 'operator_key.txt')
            with open(operator_key_path, 'w', encoding='utf-8') as f:
                f.writelines(
                    f"{anonymized}: {original}\n"
                    for original, anonymized in self.operator_mapping.items()
                )

    def process_all_files(self):
        """Process all files in the input directory.

        Entries are handled in sorted name order so numbering is reproducible,
        then the mapping files are saved and a summary is printed.
        """
        for filename in sorted(os.listdir(self.input_folder)):
            if not filename.lower().endswith(('.ndax', '.dat')):
                continue
            if not os.path.isfile(os.path.join(self.input_folder, filename)):
                continue  # a directory that merely has a matching name
            self.process_file(filename)

        # Save mapping files
        self.save_mapping_files()

        # Print summary
        print("\nAnonymization Summary:")
        print(f"Total files processed: {len(self.mapping_data)}")
        print(f"NDAX files processed: {self.ndax_counter - self.NDAX_START}")
        print(f"DAT files processed: {self.dat_counter - self.DAT_START}")
        print(f"Operators anonymized: {len(self.operator_mapping)}")
        print("\nMapping files saved to:")
        print(f"- File mapping: {os.path.join(self.output_folder, 'anonymization_key.xlsx')}")
        if self.operator_mapping:
            print(f"- Operator mapping: {os.path.join(self.output_folder, 'operator_key.txt')}")

def main():
    """Run the anonymizer once against the example folders below.

    Both paths are placeholders and must be edited before running.
    """
    source_dir = r"C:\path\to\input\folder"  # Update this path
    target_dir = r"C:\path\to\output\folder"  # Update this path

    # Build the anonymizer and process everything in one go
    FileAnonymizer(source_dir, target_dir).process_all_files()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()