In [2]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from collections import Counter
from typing import Dict, List, Tuple

In [3]:
#sample datasets containing hypothetical medical records; features include age, last three digits of pincode and BP; Target feature is medical condition
dataset1 = pd.DataFrame({
    'age': [25, 25, 25, 32, 32, 45, 45, 45, 58, 58],
    'pincode': ['123', '123', '123', '234', '234', '345', '345', '345', '456', '456'],
    'blood_pressure': ['high', 'normal', 'high', 'low', 'normal', 'high', 'high', 'normal', 'low', 'high'],
    'condition': ['diabetes', 'flu', 'hypertension', 'asthma', 'diabetes', 
                 'flu', 'diabetes', 'asthma', 'hypertension', 'flu']
})

dataset2 = pd.DataFrame({
    'age': [25, 25, 25, 32, 32, 32, 45, 45, 58, 58],
    'pincode': ['123', '123', '123', '234', '234', '234', '345', '345', '456', '456'],
    'blood_pressure': ['high', 'high', 'normal', 'low', 'low', 'normal', 'high', 'normal', 'low', 'high'],
    'condition': ['diabetes', 'diabetes', 'flu', 'asthma', 'asthma', 'flu', 
                 'hypertension', 'diabetes', 'flu', 'hypertension']
})

In [4]:
#Privacy Metrics class with functions to calculate individual scores and a compound score to compare datasets
class PrivacyMetrics:
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon #differential privacy parameter, set to 0.1 by default
    
    def k_anonymity(self, data: pd.DataFrame, quasi_identifiers: List[str]) -> int:        
        #Group by quasi-identifiers and count group sizes; minimum group size gives k-anonymity score
        grouped = data.groupby(quasi_identifiers).size()
        k = grouped.min()
        return k

    def l_diversity(self, data: pd.DataFrame, quasi_identifiers: List[str], sensitive_attribute: str) -> float:
        #number of distinct values per group gives l-diversity score; Constraint: l <= k
        min_diversity = float('inf')
        groups = data.groupby(quasi_identifiers)        
        for _, group in groups:
            distinct_values = len(group[sensitive_attribute].unique()) 
            min_diversity = min(min_diversity, distinct_values)        
        return min_diversity

    def t_closeness(self, data: pd.DataFrame, quasi_identifiers: List[str], sensitive_attribute: str) -> float:        
        global_dist = data[sensitive_attribute].value_counts(normalize=True) #global distribution of attribute
        max_distance = 0        
        groups = data.groupby(quasi_identifiers)
        for _, group in groups:
            group_dist = group[sensitive_attribute].value_counts(normalize=True)            
            all_values = set(global_dist.index) | set(group_dist.index)
            global_aligned = pd.Series([global_dist.get(v, 0) for v in all_values])
            group_aligned = pd.Series([group_dist.get(v, 0) for v in all_values])            
            #approximate Earth Mover's Distance to calculate difference in distributions
            distance = np.abs(np.cumsum(global_aligned - group_aligned)).max()
            max_distance = max(max_distance, distance)         
        return max_distance

    def differential_noise(self, data: pd.DataFrame, column: str, stddev: float) -> pd.Series:
        true_counts = data[column].value_counts()
        scale = stddev / self.epsilon
        noise = np.random.normal(0, stddev, size=len(true_counts)) #normally distributed noise
        noisy_counts = true_counts + noise
        noisy_counts = noisy_counts.clip(lower=0) #values cannot be negative in our dataset        
        return noisy_counts

    def privacy_metrics(self, data: pd.DataFrame, quasi_identifiers: List[str], sensitive_attributes: List[str]) -> Dict:
        metrics = {}
        metrics['k_anonymity'] = self.k_anonymity(data, quasi_identifiers)
        l_div_scores = []
        for attr in sensitive_attributes:
            l_div_scores.append(self.l_diversity(data, quasi_identifiers, attr))
        metrics['l_diversity'] = min(l_div_scores)
        t_close_scores = []
        for attr in sensitive_attributes:
            t_close_scores.append(self.t_closeness(data, quasi_identifiers, attr))
        metrics['t_closeness'] = max(t_close_scores)
        dp_counts = {}
        for attr in sensitive_attributes:
            dp_counts[attr] = self.differential_noise(data, attr, stddev=1.0)
        metrics['dp_counts'] = dp_counts        
        return metrics

    #function to calculate overall privacy score using randomly chosen weights for each score
    def privacy_score(self, metrics: Dict) -> float:
        k_norm = min(metrics['k_anonymity'] / 3, 1.0)  
        l_norm = min(metrics['l_diversity'] / 2, 1.0)  
        t_norm = 1 - metrics['t_closeness'] 
        weights = {'k_anonymity': 0.4, 'l_diversity': 0.3, 't_closeness': 0.3}
        privacy_score = (weights['k_anonymity'] * k_norm +weights['l_diversity'] * l_norm +weights['t_closeness'] * t_norm)        
        return round(privacy_score, 3)


In [5]:
#function to compare privacy of two datasets
def compare_datasets(dataset1: pd.DataFrame, dataset2: pd.DataFrame, quasi_identifiers: List[str], sensitive_attributes: List[str], epsilon: float = 0.1) -> Dict:
    analyzer = PrivacyMetrics(epsilon=epsilon)    
    metrics1 = analyzer.privacy_metrics(dataset1, quasi_identifiers, sensitive_attributes)
    metrics2 = analyzer.privacy_metrics(dataset2, quasi_identifiers, sensitive_attributes)
    score1 = analyzer.privacy_score(metrics1)
    score2 = analyzer.privacy_score(metrics2)    
    return {
        'dataset1': {'metrics': metrics1, 'privacy_score': score1},
        'dataset2': {'metrics': metrics2, 'privacy_score': score2}
    }

In [6]:
#Comparision
quasi_identifiers = ['age', 'pincode']
sensitive_attributes = ['blood_pressure', 'condition']
results = compare_datasets(dataset1, dataset2, quasi_identifiers, sensitive_attributes, epsilon=0.1)

In [7]:
#Results
print("\nDataset 1 Analysis:")
print("------------------")
print(f"k-anonymity: {results['dataset1']['metrics']['k_anonymity']}")
print(f"l-diversity: {results['dataset1']['metrics']['l_diversity']}")
print(f"t-closeness: {results['dataset1']['metrics']['t_closeness']:.3f}")
print(f"Final Privacy Score: {results['dataset1']['privacy_score']}")
print("\nDifferentially Private Counts:")
for attr in sensitive_attributes:
    print(f"\n{attr}:")
    print(results['dataset1']['metrics']['dp_counts'][attr])

print("\nDataset 2 Analysis:")
print("------------------")
print(f"k-anonymity: {results['dataset2']['metrics']['k_anonymity']}")
print(f"l-diversity: {results['dataset2']['metrics']['l_diversity']}")
print(f"t-closeness: {results['dataset2']['metrics']['t_closeness']:.3f}")
print(f"Final Privacy Score: {results['dataset2']['privacy_score']}")
print("\nDifferentially Private Counts:")
for attr in sensitive_attributes:
    print(f"\n{attr}:")
    print(results['dataset2']['metrics']['dp_counts'][attr])


Dataset 1 Analysis:
------------------
k-anonymity: 2
l-diversity: 2
t-closeness: 0.500
Final Privacy Score: 0.717

Differentially Private Counts:

blood_pressure:
blood_pressure
high      4.964887
normal    3.612012
low       3.536350
Name: count, dtype: float64

condition:
condition
diabetes        3.440919
flu             2.525616
hypertension    2.505825
asthma          3.978846
Name: count, dtype: float64

Dataset 2 Analysis:
------------------
k-anonymity: 2
l-diversity: 2
t-closeness: 0.500
Final Privacy Score: 0.717

Differentially Private Counts:

blood_pressure:
blood_pressure
high      4.845844
normal    3.766208
low       2.381761
Name: count, dtype: float64

condition:
condition
diabetes        3.953843
flu             1.061946
asthma          1.391328
hypertension    3.470046
Name: count, dtype: float64
