# Privacy Configuration in Genesis

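This notebook shows how to configure privacy settings for synthetic data generation: analyzing privacy risk, using the built-in privacy-level presets, enabling differential privacy, checking k-anonymity and l-diversity, and comparing privacy against utility.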

In [None]:
import numpy as np
import pandas as pd

from genesis import SyntheticGenerator, PrivacyConfig, QualityEvaluator
from genesis.privacy.anonymity import check_k_anonymity, check_l_diversity
from genesis.analyzers.privacy import PrivacyAnalyzer

## Sample Sensitive Data

In [None]:
# Seed NumPy's global RNG so the sample below is reproducible.
np.random.seed(42)
n = 1000

# Healthcare-like records with sensitive attributes. Columns are drawn one at
# a time, in the same order they appear in the final frame, so the random
# stream is consumed top to bottom exactly once per column.
sample_columns = {}
sample_columns['age'] = np.random.randint(18, 90, size=n)
sample_columns['zipcode'] = np.random.choice(
    ['10001', '10002', '10003', '10004', '10005'], size=n
)
sample_columns['gender'] = np.random.choice(['M', 'F'], size=n)
sample_columns['blood_pressure'] = np.random.normal(loc=120, scale=15, size=n)
sample_columns['cholesterol'] = np.random.normal(loc=200, scale=40, size=n)
# Sensitive attribute: drawn with explicit, unequal class probabilities.
sample_columns['diagnosis'] = np.random.choice(
    ['Healthy', 'Hypertension', 'Diabetes', 'Heart Disease'],
    size=n,
    p=[0.6, 0.2, 0.1, 0.1],
)

data = pd.DataFrame(sample_columns)

data.head()

## Privacy Risk Analysis

In [None]:
# Analyze privacy risks
# Build an analyzer with no arguments and run it over the raw sample frame
# (`data`); the returned report is kept in `risk`.
analyzer = PrivacyAnalyzer()
risk = analyzer.analyze(data)

# Print four fields of the report. The overall score is formatted to two
# decimals; the remaining fields are printed with their default formatting.
print(f"Overall Risk Score: {risk.overall_risk_score:.2f}")
print(f"K-Anonymity Estimate: {risk.k_anonymity_estimate}")
print(f"Quasi-identifiers: {risk.quasi_identifiers}")
print(f"Sensitive Attributes: {risk.sensitive_attributes}")

## Privacy Level Presets

In [None]:
# Build one config per named preset. Each call passes only `privacy_level`;
# no other option is set explicitly.

# Low privacy (maximum utility)
low_privacy = PrivacyConfig(privacy_level='low')

# Medium privacy (balanced)
medium_privacy = PrivacyConfig(privacy_level='medium')

# High privacy (maximum protection)
high_privacy = PrivacyConfig(privacy_level='high')

# Inspect the settings carried by the 'high' preset. Only this preset is
# printed; the low and medium configs are created above but not displayed.
print("High Privacy Config:")
print(f"  Differential Privacy: {high_privacy.enable_differential_privacy}")
print(f"  Epsilon: {high_privacy.epsilon}")
print(f"  K-Anonymity: {high_privacy.k_anonymity}")
print(f"  Suppress Rare: {high_privacy.suppress_rare_categories}")

## Differential Privacy

In [None]:
# Configure differential privacy
dp_config = PrivacyConfig(
    enable_differential_privacy=True,
    epsilon=1.0,  # Privacy budget (lower = more private)
    delta=1e-5,  # NOTE(review): presumably the delta of (epsilon, delta)-DP; confirm against PrivacyConfig
)

# Create a generator with method='gaussian_copula' and the DP config attached,
# then fit it on the sample frame. The three string-valued columns are
# declared as discrete; the numeric columns are not listed.
dp_generator = SyntheticGenerator(method='gaussian_copula', privacy=dp_config)
dp_generator.fit(data, discrete_columns=['zipcode', 'gender', 'diagnosis'])
# Request 1000 synthetic rows. The count is a literal here; it equals the
# source frame's size (n = 1000) but is not derived from it.
dp_synthetic = dp_generator.generate(n_samples=1000)

# Preview the first rows of the synthetic frame.
dp_synthetic.head()

## K-Anonymity

In [None]:
# Check k-anonymity on synthetic data
# Columns treated as quasi-identifiers. This list is also read by the
# l-diversity check later in the notebook.
quasi_ids = ['age', 'zipcode', 'gender']

# Run the check on the DP synthetic frame with k=5. The returned mapping is
# read through the keys 'satisfies_k', 'achieved_k' and 'n_violating_groups'.
result = check_k_anonymity(dp_synthetic, quasi_ids, k=5)
print(f"Satisfies 5-anonymity: {result['satisfies_k']}")
print(f"Achieved k: {result['achieved_k']}")
print(f"Violating groups: {result['n_violating_groups']}")

## L-Diversity

In [None]:
# Check l-diversity for sensitive attribute
# Depends on `dp_synthetic` and `quasi_ids` defined in earlier cells, so those
# cells must have been run first. 'diagnosis' is the sensitive column and l=3
# is the requested threshold.
# NOTE: this rebinds `result`, replacing the k-anonymity result held under
# the same name.
result = check_l_diversity(dp_synthetic, quasi_ids, 'diagnosis', l=3)
print(f"Satisfies 3-diversity: {result['satisfies_l']}")
print(f"Achieved l: {result['achieved_l']}")

## Comparing Privacy vs Utility

In [None]:
# Accumulates one dict of scores per epsilon value tried below.
results = []

# Sweep epsilon from the largest to the smallest value. Each iteration builds
# a fresh config, generator and evaluator; nothing is reused between runs.
for epsilon in [10.0, 1.0, 0.1]:
    # Only epsilon varies. `delta` is not passed here, so it takes whatever
    # value PrivacyConfig uses when the argument is omitted.
    config = PrivacyConfig(
        enable_differential_privacy=True,
        epsilon=epsilon,
    )
    
    # Fit on the original sample frame and draw 1000 synthetic rows
    # (a literal count, matching the 1000-row source frame).
    gen = SyntheticGenerator(method='gaussian_copula', privacy=config)
    gen.fit(data, discrete_columns=['zipcode', 'gender', 'diagnosis'])
    syn = gen.generate(n_samples=1000)
    
    # Evaluate the synthetic frame against the real one.
    evaluator = QualityEvaluator(data, syn)
    report = evaluator.evaluate()
    
    # Keep the three headline scores alongside the epsilon that produced them.
    results.append({
        'Epsilon': epsilon,
        'Fidelity': report.fidelity_score,
        'ML Utility': report.utility_score,
        'Privacy': report.privacy_score,
    })

# Tabulate the sweep: one row per epsilon, in the order tried.
pd.DataFrame(results)

## Custom Privacy Configuration

In [None]:
# Fine-grained control
# Every option is passed explicitly here instead of going through a
# `privacy_level` preset.
custom_config = PrivacyConfig(
    enable_differential_privacy=True,
    epsilon=0.5,
    delta=1e-6,
    k_anonymity=10,
    l_diversity=3,
    suppress_rare_categories=True,
    rare_category_threshold=0.02,  # NOTE(review): presumably a relative frequency (2%); confirm against PrivacyConfig
    sensitive_columns=['diagnosis'],
    quasi_identifiers=['age', 'zipcode', 'gender'],
)

# Print the configuration as returned by to_dict().
print(custom_config.to_dict())