## 3.2 Anonymizing your dataset
- Goals: The goal of k-anonymity is to modify a dataset such that any given record cannot be distinguished from at least k−1 other records regarding certain "quasi-identifier" attributes.
- Our identified quasi-identifiers from 3.1: 'region', 'gender', 'age', 'height', 'weight', 'eat', 'schedule', 'howlong'

### Starting point: get a k-anonymized data set
- To k-anonymize our given data set, we are using the Mondrian Multidimensional K-Anonymity approach
- similar to other k-anonymity approaches, a simple and efficient greedy approximation algorithm is implemented to reduce complexity.

In [125]:
#Some adjustments on the dataset
import pandas as pd

df = pd.read_csv("kleine_athletes.csv", index_col=False, low_memory=False)

df['age'].fillna(0, inplace=True)
df['height'].fillna(0, inplace=True)
df['weight'].fillna(0, inplace=True)

df['region'].fillna('', inplace=True)
df['gender'].fillna('', inplace=True)

df.head()

Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong,retrieved_datetime
0,2554,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,...,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,
1,3517,Derek Abdella,,,,Male,42.0,70.0,190.0,,...,,,,,I have a coach who determines my programming|I...,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,
2,4691,,,,,,0.0,0.0,0.0,,...,,,,,,,,,,
3,5164,Abo Brandon,Southern California,LAX CrossFit,LAX CrossFit,Male,40.0,67.0,0.0,211.0,...,375.0,325.0,25.0,I eat 1-3 full cheat meals per week|,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|,4+ years|,
4,5286,Bryce Abbey,,,,Male,32.0,65.0,149.0,206.0,...,,325.0,50.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I inc...,I played college sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I strictly s...,1-2 years|,


In [126]:
#Calculates and returns the spans (range of values) for each column in a specified partition of a dataframe, with an option to scale these spans by provided values.
def get_spans(df, partition, scale=None):
    spans = {}
    for feature_column in quasi_identifiers:
        if feature_column in categorical:
            span = len(df[feature_column][partition].unique())
        else:
            span = df[feature_column][partition].max() - df[feature_column][partition].min()
        if scale is not None:
            span = span / scale[feature_column]
        spans[feature_column] = span
    return spans

In [127]:
#Divides a specified partition of a dataframe into two parts based on the median or unique values of a given column, returning a tuple with the indices of these two parts.
def split(df, partition, column):
    dfp = df[column][partition]
    if column in categorical:
        values = dfp.unique()
        lv = set(values[:len(values) // 2])
        rv = set(values[len(values) // 2:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
    else:
        median = dfp.median()
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        return (dfl, dfr)

In [128]:
#Checks if a partition is k-anonymous by comparing its amount of entries with the required (k).
def is_k_anonymous(df, partition, sensitive_column, k=3):
    if len(partition) < k:
        return False
    return True

In [129]:
#Partitions a dataframe into valid subsets based on specified feature columns, a sensitive column, and span scales, using a validity function to ensure each partition meets certain criteria.
def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    finished_partitions_temp = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        spans = get_spans(df[feature_columns], partition, scale)
        for column, span in sorted(spans.items(), key=lambda x: -x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            finished_partitions_temp.append(partition)
    return finished_partitions_temp

In [130]:
#Aggregates the values of a series with categorical values by concatenating them.
def agg_categorical_column(series):
    # Check if the series is empty or if mode() returns an empty series
    if series.empty or series.mode().empty:
        return None  # or some default value or placeholder
    else:
        return series.mode().iloc[0]  # Safely access the first element of mode


In [131]:
def agg_numerical_column(series):
    #return series.mean()
    #print(series)
    return series.mean()

In [132]:
#Constructs an anonymized dataset by aggregating feature columns and sensitive columns separately for each partition.
def build_anonymized_dataset(df, partitions, feature_columns, sensitive_columns,
                                                  max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        values = grouped_columns.to_dict()
        # Iterate through each sensitive column and aggregate counts
        for sensitive_column in sensitive_columns:
            sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column: 'count'})
            for sensitive_value, count in sensitive_counts[sensitive_column].items():
                if count == 0:
                    continue
                sensitive_values = values.copy()
                sensitive_values.update({
                    sensitive_column: sensitive_value,
                    'count': count,
                })
                rows.append(sensitive_values)
    return pd.DataFrame(rows)

In [133]:
#categorical attributes that need to be treated in another way than numerical
categorical = {'schedule', 'howlong', 'region', 'gender', 'eat'}
for name in categorical:
    df[name] = df[name].astype('category')
    
#Our quasi identifiers that we identified earlier
quasi_identifiers = ['region', 'gender', 'age', 'height', 'weight', 'eat', 'schedule', 'howlong']

# sensitive-values that should be taken into account
sensitive_columns = ['athlete_id', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400', 'run5k', 'candj', 'snatch', 'deadlift',
                     'backsq', 'pullups']

In [134]:
full_spans = get_spans(df, df.index)

In [135]:
finished_partitions = partition_dataset(df, quasi_identifiers, sensitive_columns, full_spans, is_k_anonymous)
len(finished_partitions)

78

In [136]:
dfn = build_anonymized_dataset(df, finished_partitions, quasi_identifiers, sensitive_columns)
# we sort the resulting dataframe using the feature columns and the sensitive attribute
#dfn.sort_values()

dfn.to_csv("k3_anon_klein.csv")

Finished 1 partitions...


## Extending the k-anonymized dataset to also be l-diverse


In [137]:
def diversity(df, partition, column):
    print(f"Partition indices: {partition}")
    
    data = df.loc[partition, column].squeeze()
    #print(f"Partition: {partition}")
    #print(f"Data: {data}")
    unique_values = data.unique() if isinstance(data, pd.Series) else [data]
    #print(f"Unique values: {unique_values}")
    return len(unique_values)

def is_l_diverse(df, partition, sensitive_column, l=2):
    return diversity(df, partition, sensitive_column) >= l

In [138]:
df_ldiverse = pd.read_csv("k3_anon_klein.csv", index_col=False, low_memory=False)
full_spans = get_spans(df_ldiverse, df_ldiverse.index)

finished_l_diverse_partitions = partition_dataset(
    df, quasi_identifiers, sensitive_columns, full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))

dfldiverse_finished = build_anonymized_dataset(df_ldiverse, finished_l_diverse_partitions, quasi_identifiers, sensitive_columns)
dfldiverse_finished.to_csv("l2_diverse.csv")

Partition indices: Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       488, 489, 490, 491, 493, 494, 495, 496, 497, 498],
      dtype='int64', length=469)
Partition indices: Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       489, 490, 491, 492, 493, 494, 495, 496, 497, 498],
      dtype='int64', length=475)
Partition indices: Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       489, 490, 491, 492, 493, 494, 495, 496, 497, 498],
      dtype='int64', length=476)
Partition indices: Index([  2,   3,   7,  11,  14,  19,  24,  26,  28,  29,
       ...
       480, 482, 483, 485, 486, 488, 493, 494, 497, 498],
      dtype='int64', length=249)
Partition indices: Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       487, 488, 489, 490, 492, 493, 494, 495, 497, 498],
      dtype='int64', length=434)
Partition indices: Index([  0,   2,   6,   7,  11,  15,  17,  19,  24,  29,
       ...
       479, 480, 481, 4

## Extending the k-anonymized dataset to also achieve t-closeness

In [139]:
from scipy.stats import ks_2samp

def t_closeness(df, partition, sensitive_columns, global_freqs, t=0.3, categorical=None):
    # Helper function for numerical t-closeness
    def t_closeness_numerical(full_data, partition_data):
        ks_stat, _ = ks_2samp(full_data, partition_data)
        return ks_stat

    # Helper function for categorical t-closeness
    def t_closeness_categorical(partition_data, global_freqs):
        total_count = float(len(partition_data))
        d_max = None
        group_counts = partition_data.groupby(column)[column].agg('count')
        for value, count in group_counts.to_dict().items():
            p = count / total_count
            d = abs(p - global_freqs[value])
            if d_max is None or d > d_max:
                d_max = d
        return d_max

    # Main t-closeness logic
    for column in sensitive_columns:
        full_data = df[column]
        partition_data = df.loc[partition, column]
        
        if categorical and column in categorical:
            distance = t_closeness_categorical(partition_data, global_freqs[column])
        else:
            distance = t_closeness_numerical(full_data, partition_data)

        if distance > t:
            return False
    return True


In [140]:
df_tclose = pd.read_csv("k3_anon_klein.csv", index_col=False, low_memory=False)
full_spans = get_spans(df_tclose, df_tclose.index)

# Get the global frequencies for the sensitive column
global_frequencies = {sensitive_column: {} for sensitive_column in sensitive_columns}
total_count = len(df)

# Determine frequency for every sensitive attribute
for sensitive_column in sensitive_columns:
    group_counts = df_tclose.groupby(sensitive_column, observed=False)[sensitive_column].agg('count')
    for value, count in group_counts.to_dict().items():
        p = count / total_count
        global_frequencies[sensitive_column][value] = p

finished_l_closepartitions = partition_dataset(
    df_tclose, quasi_identifiers, sensitive_columns, full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))

dfclose_finished = build_anonymized_dataset(df_tclose, finished_l_closepartitions, quasi_identifiers, sensitive_columns)
dfclose_finished.to_csv("t_close.csv")

Partition indices: Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133],
      dtype='int64', length=1654)
Partition indices: Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       172, 173, 174, 175, 176, 177, 178, 179, 180, 181],
      dtype='int64', length=182)
Partition indices: Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2112],
      dtype='int64', length=1054)
Partition indices: Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133],
      dtype='int64', length=1046)
Partition indices: Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2124, 2125, 2126, 2127, 2128, 2129, 2130, 2131, 2132, 2133],
      dtype='int64', length=1052)
Partition indices: Index([