In [1]:
import numpy as np
import pandas as pd

# Enhanced Stratified Sampling (ESS) Implementation

In [2]:
def join_strats(data_strat):
    """Concatenate the stratified chunks back into a single DataFrame.

    Used when a property of the whole dataset (for example its total number
    of records) is needed after the data has been split into chunks.

    Parameters
    ----------
    data_strat : iterable of pandas.DataFrame
        The chunks to join, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        Every row of every chunk, in chunk order, each row keeping the index
        label it had in its chunk. An empty DataFrame if there are no chunks.
    """
    strats = list(data_strat)

    # pd.concat raises ValueError on an empty list, so the "no chunks -> empty
    # DataFrame" result is produced explicitly.
    if not strats:
        return pd.DataFrame()

    # One concat call copies each chunk once. Concatenating inside a loop
    # re-copies the accumulated frame on every pass (quadratic in the number of
    # rows) and starts from an empty frame, which pandas warns about.
    return pd.concat(strats)

In [3]:
def sampling(k, l, data_strat):
    """Build equivalence classes (ECs) by drawing records round-robin from the chunks.

    Records are visited position by position: position 0 of chunk 0, chunk 1,
    ..., chunk l-1, then position 1 of every chunk, and so on, skipping any
    chunk that has no record at the current position. The first k records
    visited form the first EC, the next k the second, etc., which gives
    n // k ECs for n total records. The n % k records that remain are then
    appended one per EC, starting at the last EC and moving backwards
    (wrapping around if there are more leftovers than ECs).

    Parameters
    ----------
    k : int
        Number of records placed in each EC before leftovers are distributed.
    l : int
        Number of chunks in 'data_strat' to cycle through.
    data_strat : pandas.Series or list of pandas.DataFrame
        Chunks, indexable as data_strat[0] .. data_strat[l-1]. Rows are read
        by position, in whatever order each chunk is already in.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per EC, with the columns of the joined chunks. Each row
        keeps the index label it had in its chunk. The list is empty when
        there are fewer than k records in total.
    """
    all_data = join_strats(data_strat)

    # n // k equivalence classes are created
    n_classes = len(all_data) // k

    chunks = [data_strat[c_index] for c_index in range(l)]
    chunk_lengths = [len(chunk) for chunk in chunks]

    # Round-robin visiting order: the outer loop is the position inside a chunk,
    # the inner loop is the chunk. A chunk shorter than the current position is
    # simply skipped, so no out-of-bounds access can happen.
    ordered_records = [
        chunks[c_index].iloc[index]
        for index in range(max(chunk_lengths, default=0))
        for c_index in range(l)
        if index < chunk_lengths[c_index]
    ]

    # Consecutive runs of k records fill the ECs in order.
    placed = n_classes * k
    equivalence_classes = [ordered_records[start:start + k] for start in range(0, placed, k)]

    # Whatever was not placed above (n % k records) is handed out one record per
    # EC, from the most recently filled EC backwards. Taking the leftovers from
    # the same visiting order guarantees every remaining record is placed exactly
    # once, whichever chunk it comes from.
    if n_classes:
        for offset, record in enumerate(ordered_records[placed:]):
            equivalence_classes[n_classes - 1 - (offset % n_classes)].append(record)

    # Once all equivalence classes are populated, each EC is converted into a DataFrame
    return [pd.DataFrame(ec, columns=all_data.columns) for ec in equivalence_classes]

In [4]:
def privacy_loss(equivalence_classes, data, sa_column, unique_SAs):
    """Compute and print the average privacy loss of a set of ECs.

    For every EC and every SA value, the proportion of that value inside the
    EC is compared with its proportion in the whole dataset; the absolute
    differences are summed over all ECs and SA values and divided by the
    number of ECs (Proportional l-diverse Privacy Loss).

    Parameters
    ----------
    equivalence_classes : sequence of pandas.DataFrame
        The ECs to evaluate; each must contain 'sa_column'.
    data : pandas.DataFrame
        The original dataset the ECs are compared against.
    sa_column : str
        Name of the sensitive attribute column.
    unique_SAs : iterable
        The SA values to compare.

    Returns
    -------
    float
        The average loss per EC, rounded to 2 decimals (also printed).

    Raises
    ------
    ZeroDivisionError
        If 'equivalence_classes' is empty, or if 'data' or any EC has no rows
        while there is at least one SA value to compare.
    """
    sa_values = list(unique_SAs)
    # total values in the whole dataset
    data_len = len(data)

    # Proportion of each SA value within the original data. It does not depend
    # on the EC, so it is computed once here instead of once per EC.
    q_dists = [len(data[data[sa_column] == sa]) / data_len for sa in sa_values]

    # sum of all privacy losses per EC
    priv_loss_sum = 0
    for ec in equivalence_classes:
        ec_len = len(ec)
        for sa, q_dist in zip(sa_values, q_dists):
            # Proportion of the SA value within the EC
            p_dist = len(ec[ec[sa_column] == sa]) / ec_len
            priv_loss_sum = priv_loss_sum + abs(p_dist - q_dist)

    avg_priv_loss = round(priv_loss_sum / len(equivalence_classes), 2)
    print(f"Privacy Loss: {avg_priv_loss}")
    return avg_priv_loss

def information_loss(equivalence_classes, data, qi_column):
    """Compute and print the average information loss of a set of ECs.

    Uses the Generalized Loss Metric: for each EC, the range of the quasi
    identifier (max - min) inside the EC is divided by its range across the
    whole dataset. The per-EC ratios are summed and divided by the number of
    ECs.

    Parameters
    ----------
    equivalence_classes : sequence of pandas.DataFrame
        The ECs to evaluate; each must contain 'qi_column'.
    data : pandas.DataFrame
        The original dataset the ECs are compared against.
    qi_column : str
        Name of the quasi identifier column.

    Returns
    -------
    The average loss per EC, rounded to 2 decimals (also printed).
    """
    original_qi = data[qi_column]
    # lower and upper limits of the QI column in the original data
    qi_floor = original_qi.min()
    qi_ceiling = original_qi.max()

    # Per-EC ratio of the EC's QI range to the dataset-wide QI range, added up
    # in EC order.
    ratio_total = sum(
        (ec[qi_column].max() - ec[qi_column].min()) / (qi_ceiling - qi_floor)
        for ec in equivalence_classes
    )

    mean_info_loss = round(ratio_total / len(equivalence_classes), 2)
    print(f"Information Loss: {mean_info_loss}\n")
    return mean_info_loss

In [5]:
def ESS(data, qi_column, sa_column):
    """Head of the Enhanced Stratified Sampling algorithm.

    Splits 'data' into one chunk per unique SA value (each chunk sorted by the
    quasi identifier), then runs 'sampling' for every k from 2 to l, where l
    is the number of unique SA values. Privacy and information loss are
    computed (and printed) for each k; the k with the lowest combined loss
    (0.5 * privacy loss + 0.5 * information loss) wins, the smallest such k in
    case of a tie.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to anonymize.
    qi_column : str
        Name of the quasi identifier column used for sorting and for the
        information loss.
    sa_column : str
        Name of the sensitive attribute column used for stratification.

    Returns
    -------
    list of pandas.DataFrame
        The ECs produced for the chosen k, with 'sa_column' removed; this is
        the set that should be used for publishing.

    Raises
    ------
    ValueError
        If 'sa_column' has fewer than 2 unique values, because no k in the
        range 2..l exists to evaluate.
    """
    # splitting up data into unique SA values
    unique_SAs = data[sa_column].unique()
    # determining total l-diversity
    l = len(unique_SAs)

    if l < 2:
        raise ValueError(
            f"ESS needs at least 2 unique values in '{sa_column}' to evaluate k from 2 to l; found {l}."
        )

    # One chunk per unique SA value, each sorted by the QI with a fresh 0..n-1 index
    data_strat = pd.Series(
        [data[data[sa_column] == sa].sort_values(qi_column, ignore_index=True) for sa in unique_SAs]
    )

    # Best candidate seen so far. The ECs themselves are kept so that sampling,
    # which is deterministic, does not have to be run a second time for the winner.
    k_prime = None
    best_combined_loss = None
    best_equivalence_classes = None

    for k in range(2, (l+1)):
        # Creates equivalence classes of minimum size, k.
        equivalence_classes = sampling(k, l, data_strat)

        print(f"Loss metrics for {len(equivalence_classes[0])}-anonymous equivalence classes:")

        priv_loss = privacy_loss(equivalence_classes, data, sa_column, unique_SAs)
        info_loss = information_loss(equivalence_classes, data, qi_column)

        # Combined loss for this k; the strict '<' keeps the first (smallest) k on ties
        combined_loss = (0.5 * priv_loss)+(0.5 * info_loss)
        if best_combined_loss is None or combined_loss < best_combined_loss:
            k_prime = k
            best_combined_loss = combined_loss
            best_equivalence_classes = equivalence_classes

    print(f"The best k-value to generate ECs on is k={k_prime}.")

    # 'e_prime' is the winning set of ECs without the SA column
    e_prime = [ec.drop(columns = sa_column) for ec in best_equivalence_classes]

    return e_prime

# Large Dataset

## Data Cleaning

In [6]:
# Load the large dataset; the relative path is resolved against the notebook's working directory
data = pd.read_csv("large_dataset.csv")

In [7]:
# Displays a summary of the dataset: number of entries, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [8]:
# Keep one record per 'Name', then remove the columns that are either irrelevant features or
# explicit identifiers (EIs). 'Name' is needed for the de-duplication, so it is dropped last.
# The result is assigned back instead of mutating in place, and 'columns=' is used on its own
# (it already implies axis=1).
to_drop = ['Name', 'Doctor', 'Hospital', 'Insurance Provider', 'Room Number']
data = data.drop_duplicates('Name').drop(columns=to_drop)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sensitive attribute (SA) column
data['Billing Amount'].describe()

count    49992.000000
mean     25555.725277
std      14215.988133
min      -2008.492140
25%      13239.403094
50%      25541.302839
75%      37853.996819
max      52764.276736
Name: Billing Amount, dtype: float64

In [10]:
# Number of unique values in the sensitive attribute (SA) column. ESS builds one stratum per unique
# SA value, so the column needs some temporary generalization before it can be used.
data['Billing Amount'].nunique()

49992

In [11]:
# Rounds the values in 'Billing Amount' to two decimal places (vectorized, no per-element Python call)
data['Billing Amount'] = data['Billing Amount'].round(2)

In [12]:
# In an effort to retain the original data, data_clean stores the preprocessed data as an
# independent copy. pd.DataFrame(data) would not copy the underlying data, so later edits to
# data_clean could show up in 'data' as well.
data_clean = data.copy()

In [13]:
# Boolean masks that temporarily generalize 'Billing Amount' for the ESS function. The five ranges are
# (-inf, 10000], (10000, 20000], (20000, 30000], (30000, 40000] and (40000, inf).
billing_mask_1 = data_clean['Billing Amount'].le(10000)
billing_mask_2 = data_clean['Billing Amount'].between(10000, 20000, inclusive='right')
billing_mask_3 = data_clean['Billing Amount'].between(20000, 30000, inclusive='right')
billing_mask_4 = data_clean['Billing Amount'].between(30000, 40000, inclusive='right')
billing_mask_5 = data_clean['Billing Amount'].gt(40000)

billing_masks = [
    billing_mask_1,
    billing_mask_2,
    billing_mask_3,
    billing_mask_4,
    billing_mask_5,
]

In [14]:
# Write one generalized value per mask into the new column 'New Billing Amount':
# 5000 for the first mask, then 10000 more for each following mask.
val = 5000

for billing_mask in billing_masks:
    data_clean.loc[billing_mask, 'New Billing Amount'] = val
    val += 10000

In [15]:
# Number of records per generalized billing group, largest group first
data_clean['New Billing Amount'].value_counts(sort=True)

New Billing Amount
45000.0    10320
25000.0    10228
35000.0    10143
15000.0    10088
5000.0      9213
Name: count, dtype: int64

In [16]:
# Data after preprocessing: the original 'Billing Amount' next to its generalized 'New Billing Amount'
data_clean

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Date of Admission,Billing Amount,Admission Type,Discharge Date,Medication,Test Results,New Billing Amount
0,30,Male,B-,Cancer,2024-01-31,18856.28,Urgent,2024-02-02,Paracetamol,Normal,15000.0
1,62,Male,A+,Obesity,2019-08-20,33643.33,Emergency,2019-08-26,Ibuprofen,Inconclusive,35000.0
2,76,Female,A-,Obesity,2022-09-22,27955.10,Emergency,2022-10-07,Aspirin,Normal,25000.0
3,28,Female,O+,Diabetes,2020-11-18,37909.78,Elective,2020-12-18,Ibuprofen,Abnormal,35000.0
4,43,Female,AB+,Cancer,2022-09-19,14238.32,Urgent,2022-10-09,Penicillin,Abnormal,15000.0
...,...,...,...,...,...,...,...,...,...,...,...
49995,64,Male,O+,Hypertension,2022-06-28,24747.35,Emergency,2022-07-09,Paracetamol,Inconclusive,25000.0
49996,69,Male,B+,Cancer,2020-04-04,40657.58,Elective,2020-04-17,Aspirin,Abnormal,45000.0
49997,73,Male,O-,Cancer,2023-09-08,8441.15,Elective,2023-09-22,Aspirin,Abnormal,5000.0
49998,81,Male,B-,Cancer,2020-10-13,34934.28,Elective,2020-10-14,Penicillin,Abnormal,35000.0


## ESS on Large Dataset

In [17]:
# Function call for ESS on the large dataset, with 'Age' as the quasi identifier (QI) column and
# the generalized 'New Billing Amount' as the sensitive attribute (SA) column.
# e_prime is the set of ECs that should be used in publishing; the SA column is removed from it.
e_prime = ESS(data_clean, 'Age', 'New Billing Amount')
print("Sample equivalence class in E':")
e_prime[0]

Loss metrics for 2-anonymous equivalence classes:
Privacy Loss: 1.2
Information Loss: 0.02

Loss metrics for 3-anonymous equivalence classes:
Privacy Loss: 0.8
Information Loss: 0.03

Loss metrics for 4-anonymous equivalence classes:
Privacy Loss: 0.4
Information Loss: 0.04

Loss metrics for 5-anonymous equivalence classes:
Privacy Loss: 0.07
Information Loss: 0.05

The best k-value to generate ECs on is k=5
Sample equivalence class in E':


Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Date of Admission,Billing Amount,Admission Type,Discharge Date,Medication,Test Results
0,18,Male,AB+,Diabetes,2020-03-04,16563.17,Urgent,2020-04-01,Penicillin,Abnormal
0,18,Male,O-,Cancer,2019-11-18,37070.43,Urgent,2019-11-20,Penicillin,Inconclusive
0,18,Female,A-,Obesity,2019-05-14,27631.52,Emergency,2019-05-22,Ibuprofen,Abnormal
0,18,Female,A-,Obesity,2023-12-31,45527.27,Emergency,2024-01-09,Lipitor,Abnormal
0,18,Male,AB+,Obesity,2020-04-17,2494.91,Urgent,2020-05-01,Ibuprofen,Abnormal


# Small Dataset

In [18]:
# Load the small dataset. This rebinds 'data', which referred to the large dataset until now.
data = pd.read_csv("small_dataset.csv")

In [19]:
# Number of unique values in the sensitive attribute (SA) column, 'Income'. ESS builds one stratum per
# unique SA value, so the column needs some temporary generalization before it can be used.
data['Income'].nunique()

7667

In [20]:
# Boolean masks that temporarily generalize 'Income' for the ESS function. The distribution is skewed,
# so two sets of masks are made: narrow ranges for low income and wide ranges for high income.
# Low income: (-inf, 66000], (66000, 69000], (69000, 72000], (72000, 75000]
low_income_mask_1 = data['Income'].le(66000)
low_income_mask_2 = data['Income'].between(66000, 69000, inclusive='right')
low_income_mask_3 = data['Income'].between(69000, 72000, inclusive='right')
low_income_mask_4 = data['Income'].between(72000, 75000, inclusive='right')

low_income_masks = [
    low_income_mask_1,
    low_income_mask_2,
    low_income_mask_3,
    low_income_mask_4,
]

# High income: (75000, 875000], (875000, 1675000], (1675000, 2475000], (2475000, inf)
high_income_mask_1 = data['Income'].between(75000, 875000, inclusive='right')
high_income_mask_2 = data['Income'].between(875000, 1675000, inclusive='right')
high_income_mask_3 = data['Income'].between(1675000, 2475000, inclusive='right')
high_income_mask_4 = data['Income'].gt(2475000)

high_income_masks = [
    high_income_mask_1,
    high_income_mask_2,
    high_income_mask_3,
    high_income_mask_4,
]

In [21]:
# Write one generalized value per mask into the new column 'New Income Amount'.
# Low-income masks get 66000, 69000, 72000, 75000 (steps of 3000);
# high-income masks get 875000, 1675000, 2475000, 3275000 (steps of 800000).
val = 66000

for low_mask in low_income_masks:
    data.loc[low_mask, 'New Income Amount'] = val
    val += 3000

val = 875000

for high_mask in high_income_masks:
    data.loc[high_mask, 'New Income Amount'] = val
    val += 800000

## ESS On Small Dataset

In [22]:
# Function call for ESS on the small dataset, with 'Age' as the quasi identifier (QI) column and
# the generalized 'New Income Amount' as the sensitive attribute (SA) column.
# e_prime is the set of ECs that should be used in publishing; the SA column is removed from it.
e_prime = ESS(data, 'Age', 'New Income Amount')
print("Sample equivalence class in E':")
e_prime[0]

Loss metrics for 2-anonymous equivalence classes:
Privacy Loss: 1.4
Information Loss: 0.14

Loss metrics for 3-anonymous equivalence classes:
Privacy Loss: 1.11
Information Loss: 0.18

Loss metrics for 4-anonymous equivalence classes:
Privacy Loss: 0.82
Information Loss: 0.26

Loss metrics for 5-anonymous equivalence classes:
Privacy Loss: 0.61
Information Loss: 0.29

Loss metrics for 6-anonymous equivalence classes:
Privacy Loss: 0.45
Information Loss: 0.31

Loss metrics for 7-anonymous equivalence classes:
Privacy Loss: 0.49
Information Loss: 0.34

Loss metrics for 8-anonymous equivalence classes:
Privacy Loss: 0.49
Information Loss: 0.34

The best k-value to generate ECs on is k=6
Sample equivalence class in E':


Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,18,Master's,Healthcare,5,Urban,14,Married,Part-time,1,Rent,Apartment,Female,Biking,74840
0,18,High School,Education,2,Rural,28,Single,Self-employed,3,Own,Apartment,Male,Public transit,528755
0,18,High School,Others,0,Urban,17,Single,Full-time,1,Own,Single-family home,Male,Public transit,71674
0,18,Master's,Healthcare,5,Rural,48,Married,Self-employed,2,Rent,Apartment,Female,Biking,3165470
0,18,High School,Healthcare,2,Urban,47,Married,Part-time,2,Rent,Apartment,Female,Walking,67192
0,18,Doctorate,Technology,5,Rural,16,Married,Self-employed,6,Own,Single-family home,Male,Car,56924
