In [None]:
## Acknowledgements

# This project uses the following libraries:
# - Anjana (https://github.com/IFCA-Advanced-Computing/anjana) by IFCA — licensed by Judith Sainz-Pardo Diaz under the Apache License.

# pip install anjana

In [41]:
import pandas as pd
from anjana.anonymity import utils
from anjana.anonymity import k_anonymity

In [None]:
# Load adult.csv and strip stray whitespace from the column names
data = pd.read_csv("../data/adult.csv")
data.columns = data.columns.str.strip()

# Text columns whose values also get surrounding whitespace removed
cols = ["workclass", "education", "marital.status", "occupation", "sex", "native.country"]
data[cols] = data[cols].apply(lambda column: column.str.strip())

# Roles of the columns for the anonymization:
#   quasi_ident - quasi-identifiers
#   ident       - identifiers
#   sens_att    - sensitive attribute
quasi_ident = ["age", "education", "marital.status", "occupation", "sex", "native.country"]
ident = ["race"]
sens_att = "income"


In [43]:
# Import the hierarchy of each quasi-identifier from its CSV file (read
# without a header row) and gather them in a dictionary keyed by column name.
hierarchies = {
    column: dict(pd.read_csv(csv_path, header=None))
    for column, csv_path in (
        ("age", "hierarchies/age.csv"),
        ("education", "hierarchies/education.csv"),
        ("marital.status", "hierarchies/marital.csv"),
        ("occupation", "hierarchies/occupation.csv"),
        ("sex", "hierarchies/sex.csv"),
        ("native.country", "hierarchies/country.csv"),
    )
}

In [58]:

# Suppression limit passed to k_anonymity for this sweep
supp_level = 50

# Example k values - change as needed
k_values = [2, 3, 4, 5, 10, 50, 100]

# Anonymize once per k and report the share of rows missing from the result
for k in k_values:
    data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
    n_suppressed = len(data) - len(data_anon)
    suppression_percentage = 100 * n_suppressed / len(data)
    print(f"For k = {k}: Percentage of records suppressed: {suppression_percentage:.2f} %")


For k = 2: Percentage of records suppressed: 30.75 %
For k = 3: Percentage of records suppressed: 42.53 %
For k = 4: Percentage of records suppressed: 27.31 %
For k = 5: Percentage of records suppressed: 31.26 %
For k = 10: Percentage of records suppressed: 43.71 %
For k = 50: Percentage of records suppressed: 37.78 %
For k = 100: Percentage of records suppressed: 38.42 %


In [60]:
# Suppression limit allowed for this sweep
supp_level = 70

# Example k values - adjust as needed
k_values = [2, 3, 4, 5, 10, 50, 100]

# For every k: anonymize, then report the share of rows missing from the
# result together with the transformation returned by utils.get_transformation
for k in k_values:
    data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
    transformation_anon = utils.get_transformation(data_anon, quasi_ident, hierarchies)
    suppression_percentage = 100 * (len(data) - len(data_anon)) / len(data)

    print(
        f"\nFor k = {k}:",
        f"Percentage of records suppressed: {suppression_percentage:.2f} %",
        "Transformation applied:",
        transformation_anon,
        sep="\n",
    )



For k = 2:
Percentage of records suppressed: 30.75 %
Transformation applied:
[0, 0, 0, 0, 0, 0]

For k = 3:
Percentage of records suppressed: 42.53 %
Transformation applied:
[0, 0, 0, 0, 0, 0]

For k = 4:
Percentage of records suppressed: 50.39 %
Transformation applied:
[0, 0, 0, 0, 0, 0]

For k = 5:
Percentage of records suppressed: 55.91 %
Transformation applied:
[0, 0, 0, 0, 0, 0]

For k = 10:
Percentage of records suppressed: 43.71 %
Transformation applied:
[1, 0, 0, 0, 0, 0]

For k = 50:
Percentage of records suppressed: 62.71 %
Transformation applied:
[1, 1, 0, 0, 0, 1]

For k = 100:
Percentage of records suppressed: 51.60 %
Transformation applied:
[1, 1, 0, 1, 0, 1]


In [59]:
# Apply k-anonymity with one explicit configuration.
# Both parameters are pinned here so the result does not depend on which of
# the earlier sweep cells last assigned `supp_level` (50 in one, 70 in the
# other) or on the value `k` was left with after their loops.
k = 10
supp_level = 70  # same limit as the sweep cell directly above in notebook order
data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)

# Preview the first rows of the anonymized data
data_anon.head()

Unnamed: 0,index,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,5,"[30, 35]",Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,*,Female,0,3770,45,United-States,<=50K
1,20,"[35, 40]",Private,188774,Bachelors,13,Never-married,Exec-managerial,Not-in-family,*,Male,0,2824,40,United-States,>50K
2,23,"[50, 55]",Private,153870,Some-college,10,Married-civ-spouse,Transport-moving,Husband,*,Male,0,2603,40,United-States,<=50K
3,24,"[60, 65]",?,135285,HS-grad,9,Married-civ-spouse,?,Husband,*,Male,0,2603,32,United-States,<=50K
4,33,"[50, 55]",Private,123011,Bachelors,13,Divorced,Exec-managerial,Not-in-family,*,Male,0,2559,50,United-States,>50K
