In [1]:
# Install and import the libraries (pandas, anjana)
import pandas as pd

!pip install anjana
import anjana
from anjana.anonymity import k_anonymity, l_diversity, t_closeness

Collecting anjana
  Downloading anjana-1.1.0-py3-none-any.whl.metadata (11 kB)
Collecting beartype==0.19.0 (from anjana)
  Downloading beartype-0.19.0-py3-none-any.whl.metadata (32 kB)
Collecting numpy==2.0.2 (from anjana)
  Downloading numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m890.4 kB/s[0m eta [36m0:00:00[0m
Collecting pycanon==1.0.3 (from anjana)
  Downloading pycanon-1.0.3-py3-none-any.whl.metadata (8.5 kB)
Collecting reportlab==4.2.5 (from pycanon==1.0.3->anjana)
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Collecting tabulate==0.8.10 (from pycanon==1.0.3->anjana)
  Downloading tabulate-0.8.10-py3-none-any.whl.metadata (25 kB)
Collecting typer==0.15.1 (from pycanon==1.0.3->anjana)
  Downloading typer-0.15.1-py3-none-any.whl.metadata (15 kB)
Downloading anjana-1.1.0-py3-none-any.whl (22 kB)
Downloading beartype-0.19.0-py3-none-an

In [3]:
# Load the Adult dataset and normalise whitespace in the header and in the text columns
data = pd.read_csv("adult.csv")
data.columns = data.columns.str.strip()

# Categorical columns whose values may carry leading/trailing blanks
cols = ["workclass", "education", "marital-status", "occupation", "sex", "native-country"]
data[cols] = data[cols].apply(lambda column: column.str.strip())

# Keep the frame as the cell's last expression so the notebook renders it
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# Role of each column in the anonymization process.

# Sensitive attribute: the single column handed to l-diversity and t-closeness
sens_att = "salary-class"

# Identifiers (shown as "*" in the anonymized output further down)
ident = ["race"]

# Quasi-identifiers: the columns that get generalized through their hierarchies
quasi_ident = ["age", "education", "marital-status", "occupation", "sex", "native-country"]

In [5]:
# Target privacy levels, in order: k for k-anonymity, l for l-diversity, t for t-closeness
k, l_div, t = 10, 2, 0.5

In [6]:
# Upper bound on record suppression handed to every anonymization call below
# (presumably a percentage of the rows -- confirm against the anjana documentation)
supp_level: int = 50

In [None]:
# Load one generalization hierarchy per quasi-identifier from its header-less CSV file.
# dict(DataFrame) maps each column position (0, 1, 2, ...) to that column's values, so every
# CSV column becomes one entry of the hierarchy.
hierarchy_files = {
    "age": "age.csv",
    "education": "education.csv",
    "marital-status": "marital.csv",
    "occupation": "occupation.csv",
    "sex": "sex.csv",
    "native-country": "country.csv",
}
hierarchies = {
    attribute: dict(pd.read_csv(f"hierarchies/{file_name}", header=None))
    for attribute, file_name in hierarchy_files.items()
}

In [9]:
# Anonymize in three successive passes, each one fed with the result of the previous pass:
# k-anonymity first, then l-diversity, then t-closeness.
data_k = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
data_kl = l_diversity(
    data_k, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
)
data_anon = t_closeness(
    data_kl, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
)

# Display the final anonymized table
data_anon

The data verifies k-anonymity with k=10
The data verifies k-anonymity with k=72
The data verifies t-closeness with t=0.4737011422127644


Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,1,"[40, 80[",Self-emp-not-inc,83311,Higher education,13,Spouse present,*,Husband,*,Male,0,0,13,United-States,<=50K
1,7,"[40, 80[",Self-emp-not-inc,209642,Secondary education,9,Spouse present,*,Husband,*,Male,0,0,45,United-States,>50K
2,8,"[0, 40[",Private,45781,Higher education,14,Spouse not present,*,Not-in-family,*,Female,14084,0,50,United-States,>50K
3,9,"[40, 80[",Private,159449,Higher education,13,Spouse present,*,Husband,*,Male,5178,0,40,United-States,>50K
4,10,"[0, 40[",Private,280464,Higher education,10,Spouse present,*,Husband,*,Male,0,0,80,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18322,32554,"[40, 80[",Private,321865,Higher education,14,Spouse present,*,Husband,*,Male,0,0,40,United-States,>50K
18323,32555,"[0, 40[",Private,310152,Higher education,10,Spouse not present,*,Not-in-family,*,Male,0,0,40,United-States,<=50K
18324,32557,"[40, 80[",Private,154374,Secondary education,9,Spouse present,*,Husband,*,Male,0,0,40,United-States,>50K
18325,32558,"[40, 80[",Private,151910,Secondary education,9,Spouse not present,*,Unmarried,*,Female,0,0,40,United-States,<=50K


In [10]:
# Export the anonymized table. The file name is derived from `k` so it cannot drift out of
# sync with the anonymity level chosen earlier (k = 10 still produces "adult_k10.csv").
# NOTE(review): the DataFrame index is written as an unnamed first column; pass index=False
# if downstream consumers do not need it.
data_anon.to_csv(f"adult_k{k}.csv")

In [14]:
# Instead of importing the hierarchies from .csv files, we can build the corresponding
# dictionaries ourselves. Level 0 holds the original column and each higher level holds a
# more generalized label for the same records, so every level needs one entry per row.
import numpy as np


def _fixed_width_intervals(values, width):
    """Return, for each value, the label of the fixed-width interval that contains it.

    Intervals are half-open and aligned on multiples of ``width``: with ``width=5`` the
    value 29 maps to "[25, 30)" and 17 maps to "[15, 20)".

    Parameters
    ----------
    values : array-like of integers
        The raw values to generalize.
    width : int
        Size of each interval.

    Returns
    -------
    list of str
        One "[low, high)" label per input value, in the same order.
    """
    lower_bounds = (np.asarray(values, dtype=int) // width) * width
    return [f"[{low}, {low + width})" for low in lower_bounds.tolist()]


age = data['age'].values
# The labels are generated from the data instead of being typed by hand, so both levels
# always contain exactly one label per record and stay aligned with level 0.
age_5years = _fixed_width_intervals(age, 5)
age_10years = _fixed_width_intervals(age, 10)

hierarchies = {
    "age": {0: age,
            1: age_5years,
            2: age_10years},
    "marital-status": {
        0: data["marital-status"].values,
        1: np.array(["*"] * len(data["marital-status"].values)) # Suppression
    },
}

In [15]:
# Alternatively, the interval levels can be produced with generate_intervals() from
# anjana's utils module instead of being written out by hand:
import numpy as np
from anjana.anonymity import utils

age = data['age'].values
marital_status = data["marital-status"].values

hierarchies = {
    "age": {
        0: age,
        # The numeric arguments are presumably (lower bound, upper bound, step) -- verify
        # against the anjana documentation
        1: utils.generate_intervals(age, 0, 100, 5),
        2: utils.generate_intervals(age, 0, 100, 10),
    },
    "marital-status": {
        0: marital_status,
        1: np.array(["*"] * len(marital_status)),  # Suppression
    },
}

In [None]:
# Use what you have learned in this lab to solve exercises 5-1 and 5-2 from the Data Anonymization Exercises.