# 🔐 Encrypto-Anonymizer (Local Version)

**A locally-running tool to anonymize and encrypt CSV data securely.**

## 🔒 About the Encrypto-Anonymizer
- All files are processed locally on your machine
- No data is uploaded to any cloud service
- Files are stored only where you explicitly save them
- Perfect for sensitive student data or any confidential information

## 📦 Setup: Install required packages

In [None]:
# Install required packages using pip
# If using UV, you can add these to your pyproject.toml instead
import sys
!{sys.executable} -m pip install pandas cryptography

## 📁 Step 1: Load a CSV file from local filesystem

In [1]:
import pandas as pd
import os
from pathlib import Path

# Path of the CSV file to load -- edit the string below.
# expanduser() resolves a leading ~ to the home directory.
csv_path = Path("FILE.CSV").expanduser()

if csv_path.exists():
    # Read the whole file and show a short preview plus the column names
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows from {csv_path.name}")
    print("\nFirst 5 rows:")
    display(df.head())
    print("\nColumn names:", list(df.columns))
else:
    # Only report the problem; df is left undefined in this case
    print(f"Error: File not found at {csv_path}")

SyntaxError: unterminated string literal (detected at line 6) (2080404564.py, line 6)

## 🎯 Step 2: Select columns to anonymize

In [None]:
# Default selection. Edit this list, or answer 'y' below to pick the
# columns interactively.
columns_to_anonymize = ['Name', 'Email', 'ID']  # Change as needed

# Interactive column selection
print("Available columns:", list(df.columns))
print("\nCurrent columns to anonymize:", columns_to_anonymize)
print("\nWould you like to modify the columns to anonymize? (y/n)")
if input().strip().lower() == 'y':
    columns_to_anonymize = []
    print("Enter column names one by one (press Enter without text to finish):")
    while True:
        col = input().strip()
        if not col:
            break
        if col not in df.columns:
            print(f"Warning: '{col}' not found in dataframe")
        elif col in columns_to_anonymize:
            # Selecting a column twice would only repeat the same work later
            print(f"Note: '{col}' is already selected")
        else:
            columns_to_anonymize.append(col)

# The default list above is not guaranteed to match the loaded file, so drop
# any name that is not an actual column instead of failing with a KeyError
# in the anonymization step.
missing_columns = [c for c in columns_to_anonymize if c not in df.columns]
if missing_columns:
    print(f"Warning: ignoring columns not found in dataframe: {missing_columns}")
    columns_to_anonymize = [c for c in columns_to_anonymize if c in df.columns]

print("Selected columns to anonymize:", columns_to_anonymize)

## 🔀 Step 3: Anonymize and create encrypted mapping

**Options:**
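- `use_composite_key = True`: Creates a single anonymous ID for all selected columns in a row (linked anonymization). This only takes effect when more than one column is selected; otherwise the per-value behavior below is used.
- `use_composite_key = False`: Creates a separate anonymous ID for each distinct value (traditional behavior); identical values share the same ID.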

The code now:
- Checks if values already exist in the mapping to avoid duplicates
- Can create composite keys for multiple fields
- Maintains a reverse mapping for efficiency

In [None]:
import json
import uuid

from cryptography.fernet import Fernet

# Generate a fresh symmetric key. It is written to key.key below and is
# needed later to decrypt the values stored in the mapping.
fernet_key = Fernet.generate_key()
fernet = Fernet(fernet_key)

# anonymous ID -> encrypted original value
# (in composite mode: anonymous ID -> {column name: encrypted original value})
mapping = {}
# original value (composite mode: tuple of the row's values) -> anonymous ID,
# so that repeated values reuse the ID they were given the first time
reverse_mapping = {}

anon_df = df.copy()

# Check if we want to create a composite key for all selected columns
use_composite_key = True  # Set to False if you want separate mappings per value

if use_composite_key and len(columns_to_anonymize) > 1:
    # One anonymous ID per distinct combination of the selected columns
    print("Using composite key for multiple columns...")
    row_ids = []
    # Walk the selected columns in lockstep. Reading values column-wise keeps
    # each column's own dtype (no int -> float upcast from building a row
    # Series) and does not depend on the dataframe's index labels.
    for row_values in zip(*(df[col] for col in columns_to_anonymize)):
        composite_parts = [str(val) for val in row_values]

        # A tuple is used as the lookup key rather than a joined string, so
        # values that contain a separator character cannot make two different
        # rows look identical.
        composite_key = tuple(composite_parts)

        anon_id = reverse_mapping.get(composite_key)
        if anon_id is None:
            anon_id = str(uuid.uuid4())
            # Encrypt each value separately but store them under the same ID
            mapping[anon_id] = {
                col: fernet.encrypt(part.encode()).decode()
                for col, part in zip(columns_to_anonymize, composite_parts)
            }
            reverse_mapping[composite_key] = anon_id

        row_ids.append(anon_id)

    # Replace whole columns at once: every selected column gets the row's ID
    for col in columns_to_anonymize:
        anon_df[col] = row_ids
else:
    # Original behavior - separate mapping per value
    print("Using separate mappings per value...")
    for col in columns_to_anonymize:
        new_ids = []
        for val in df[col]:
            str_val = str(val)

            # Reuse the ID if this value has been seen before (in any column)
            anon_id = reverse_mapping.get(str_val)
            if anon_id is None:
                anon_id = str(uuid.uuid4())
                mapping[anon_id] = fernet.encrypt(str_val.encode()).decode()
                reverse_mapping[str_val] = anon_id

            new_ids.append(anon_id)
        anon_df[col] = new_ids

# Save files to a specified directory (created, with parents, if missing)
output_dir = Path(input("Enter output directory (default: current directory): ").strip() or ".")
output_dir = output_dir.expanduser()
output_dir.mkdir(parents=True, exist_ok=True)

# Save anonymized data
anon_path = output_dir / "anonymized.csv"
anon_df.to_csv(anon_path, index=False)

# Save mapping (anonymous ID -> encrypted value(s))
mapping_path = output_dir / "mapping.json"
with open(mapping_path, "w") as f:
    json.dump(mapping, f, indent=2)

# Save encryption key
key_path = output_dir / "key.key"
with open(key_path, "wb") as f:
    f.write(fernet_key)
# Best effort: make the key readable by the current user only. Some platforms
# and filesystems do not support this, which must not abort the run.
try:
    key_path.chmod(0o600)
except OSError as err:
    print(f"Warning: could not restrict permissions on {key_path}: {err}")

print("✅ Done anonymizing. Files saved to:")
print(f"  - Anonymized data: {anon_path}")
print(f"  - Mapping: {mapping_path}")
print(f"  - Encryption key: {key_path}")
print("\n⚠️  Keep the mapping.json and key.key files secure!")

## 🔓 Step 4: Load anonymized file + mapping + key to de-anonymize

In [None]:
# Imports are repeated in this cell so de-anonymization also works in a fresh
# session, without re-running the anonymization cell (which would generate a
# new key and new output files).
import json
from pathlib import Path

import pandas as pd
from cryptography.fernet import Fernet

# Load files for de-anonymization
print("Enter paths to the required files:")
anon_path = Path(input("Anonymized CSV file: ").strip()).expanduser()
mapping_path = Path(input("Mapping JSON file: ").strip()).expanduser()
key_path = Path(input("Encryption key file: ").strip()).expanduser()

# Verify that each path points to an existing regular file. is_file() also
# rejects directories, which matters because empty input becomes Path(".").
missing_files = [
    f"{name}: {path}"
    for path, name in [(anon_path, "Anonymized CSV"), (mapping_path, "Mapping"), (key_path, "Key")]
    if not path.is_file()
]

if missing_files:
    print("Error: The following files were not found:")
    for entry in missing_files:
        print(f"  - {entry}")
else:
    # Load the anonymized data, the ID -> encrypted value mapping, and the key
    anon_df = pd.read_csv(anon_path)

    with open(mapping_path) as f:
        mapping = json.load(f)

    with open(key_path, "rb") as f:
        fernet = Fernet(f.read())

    print("✅ Files loaded successfully")

## 🔁 Step 5: De-anonymize and restore original values

In [None]:
restored_df = anon_df.copy()

# Anonymized columns are recognised by their content: a column counts as
# anonymized when its first non-null value is a UUID-formatted string.
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$')


def _looks_anonymized(series):
    """Return True if the first non-null value of *series* is a UUID string."""
    non_null = series.dropna()
    if non_null.empty:
        return False
    candidate = non_null.iloc[0]
    return bool(candidate and isinstance(candidate, str) and uuid_pattern.match(candidate))


def _restore_value(anon_id, col):
    """Return the original value for *anon_id* in column *col*.

    Nulls become None; IDs without a usable mapping entry are returned as-is.
    """
    if pd.isna(anon_id):
        return None
    if anon_id not in mapping:
        print(f"Warning: No mapping found for ID {anon_id}")
        return anon_id

    entry = mapping[anon_id]
    if not isinstance(entry, dict):
        # Simple mapping: the entry is the encrypted value itself
        return fernet.decrypt(entry.encode()).decode()
    if col not in entry:
        # Composite mapping without this column: keep the ID unchanged
        return anon_id
    # Composite mapping: pick the encrypted value stored for this column
    return fernet.decrypt(entry[col].encode()).decode()


anonymized_columns = [name for name in anon_df.columns if _looks_anonymized(anon_df[name])]

print(f"Detected anonymized columns: {anonymized_columns}")

# Replace every detected column with its decrypted original values
for col in anonymized_columns:
    restored_df[col] = [_restore_value(anon_id, col) for anon_id in anon_df[col]]

# Write the restored data where the user asks (restored.csv by default)
chosen_path = input("Enter path for restored CSV (default: restored.csv): ").strip()
output_path = Path(chosen_path or "restored.csv").expanduser()
restored_df.to_csv(output_path, index=False)
print(f"✅ Restoration complete. File saved to: {output_path}")
print("\nFirst 5 rows of restored data:")
display(restored_df.head())