## 3.2 Anonymising your dataset
Goal: k-anonymity modifies a dataset so that every record is indistinguishable from at least k−1 other records with respect to a chosen set of "quasi-identifier" attributes.

quasi-identifiers: 'region', 'gender', 'age', 'height', 'weight', 'eat', 'schedule', 'howlong'

In [578]:
import pandas as pd

df = pd.read_csv("athletes.csv", index_col=False, low_memory=False)

# Fill the gaps in the quasi-identifiers that are cleaned here: the numeric
# columns get 0 and the text columns 'region' / 'gender' get ''.
# One DataFrame.fillna call with a per-column mapping replaces the former
# `df[col].fillna(..., inplace=True)` lines: that form fills an intermediate
# Series and, under pandas Copy-on-Write, leaves `df` itself unchanged.
df = df.fillna({'age': 0, 'height': 0, 'weight': 0, 'region': '', 'gender': ''})

df.head()


Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong,retrieved_datetime
0,2554.0,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,...,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,
1,3517.0,Derek Abdella,,,,Male,42.0,70.0,190.0,,...,,,,,I have a coach who determines my programming|I...,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|,
2,4691.0,,,,,,0.0,0.0,0.0,,...,,,,,,,,,,
3,5164.0,Abo Brandon,Southern California,LAX CrossFit,LAX CrossFit,Male,40.0,67.0,0.0,211.0,...,375.0,325.0,25.0,I eat 1-3 full cheat meals per week|,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|,4+ years|,
4,5286.0,Bryce Abbey,,,,Male,32.0,65.0,149.0,206.0,...,,325.0,50.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I inc...,I played college sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I strictly s...,1-2 years|,


In [579]:
def get_spans(df, partition, scale=None):
    """
    Calculates and returns the span of every quasi-identifier column of `df` inside one partition, with an option to
    scale these spans by provided values.

    The span of a column listed in `categorical` is its number of distinct values in the partition; for every other
    column it is the difference between the largest and the smallest value. Quasi-identifiers that are not columns
    of `df` are skipped, so the function also works on a frame that holds only some of them.

    :param        df: the dataframe for which to calculate the spans
    :param partition: the partition (index labels) for which to calculate the spans
    :param     scale: if given, the spans of each column will be divided
                      by the value in scale for that column
    :returns        : A dict mapping column name to its (optionally scaled) span
    """
    spans = {}
    for feature_column in quasi_identifiers:
        if feature_column not in df.columns:
            # The caller handed in a sub-frame that does not carry this column.
            continue
        # Slice the partition once instead of once per max()/min()/unique() call.
        values = df.loc[partition, feature_column]
        if feature_column in categorical:
            span = len(values.unique())
        else:
            span = values.max() - values.min()
        if scale is not None:
            full_span = scale[feature_column]
            # A zero reference span would divide by zero (NaN or ZeroDivisionError);
            # report a relative span of 0 for such a column instead.
            span = span / full_span if full_span else 0
        spans[feature_column] = span
    return spans

In [580]:
def split(df, partition, column):
    """
    Cuts one partition of a dataframe in two along a single column.

    A column listed in `categorical` is split by giving the first half of the distinct values found in the partition
    to the left part and the remaining values to the right part. Any other column is split at its median: values
    below the median go left, values at or above it go right.

    :param        df: The dataframe that owns the partition
    :param partition: The partition to cut
    :param    column: The column to cut along
    :returns        : A pair (left, right) holding the index of each part
    """
    column_values = df[column][partition]
    if column not in categorical:
        cut = column_values.median()
        below = column_values.index[column_values < cut]
        at_or_above = column_values.index[column_values >= cut]
        return (below, at_or_above)
    distinct = column_values.unique()
    half = len(distinct) // 2
    left_values = set(distinct[:half])
    right_values = set(distinct[half:])
    in_left = column_values.isin(left_values)
    in_right = column_values.isin(right_values)
    return column_values.index[in_left], column_values.index[in_right]

In [581]:
def is_k_anonymous(df, partition, sensitive_column, k=2):
    """
    Tells whether a partition holds enough entries to count as k-anonymous.

    Only the size of the partition is inspected; `df` and `sensitive_column` are accepted but not used.

    :param               df: The dataframe the partition belongs to (unused).
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column (unused).
    :param                k: The minimum number of entries a partition must have.
    :returns               : True if the partition has at least k entries, False otherwise.
    """
    return len(partition) >= k

In [582]:
def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    Repeatedly splits the dataframe's index into smaller partitions until no partition can be split any further
    without producing a part that the validity function rejects.

    Each pending partition is tried column by column, starting with the column whose scaled span is largest. The
    first column that yields two valid parts is used and both parts are queued again; a partition for which no
    column works is final.

    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (passed on to the is_valid function)
    :param            scale: The column spans used to scale the spans of each partition.
    :param         is_valid: A function called as is_valid(df, partition, sensitive_column) that returns True if
                             the partition is valid.
    :returns               : The list of partitions that could not be split any further.
    """
    done = []
    pending = [df.index]
    while pending:
        current = pending.pop(0)
        relative_spans = get_spans(df[feature_columns], current, scale)
        # Widest column first: the key negates the span to sort in descending order.
        ranked = sorted(relative_spans.items(), key=lambda item: -item[1])
        was_split = False
        for column, _ in ranked:
            left, right = split(df, current, column)
            if is_valid(df, left, sensitive_column) and is_valid(df, right, sensitive_column):
                pending.extend((left, right))
                was_split = True
                break
        if not was_split:
            done.append(current)
    return done

In [583]:
def agg_categorical_column(series):
    """
    Aggregates the values of a series with categorical values by concatenating them.

    The values are converted to strings, de-duplicated and sorted before joining. Sorting makes the result
    deterministic: joining a bare set would order the values differently from one interpreter run to the next,
    so the same group of values could end up with different labels.

    :param           series: A series of categorical values that need to be aggregated.
    :returns               : A string with the distinct values of the series, sorted and joined with a ',' (comma).
    """
    distinct_values = sorted(set(series.astype(str)))
    return ','.join(distinct_values)


In [584]:
def agg_numerical_column(series):
    """
    Aggregates the values of a series with numerical values by averaging them.

    :param           series: A series of numerical values that need to be aggregated.
    :returns               : The result of the series' `mean()` method.
    """
    average = series.mean()
    return average

In [585]:
def build_anonymized_dataset(df, partitions, feature_columns, sensitive_columns,
                                                  max_partitions=None):
    """
    Constructs an anonymized dataset by aggregating the feature columns of each partition and attaching the value
    counts of every sensitive column.

    For each partition the feature columns are collapsed to one value each (joined distinct values for columns in
    `categorical`, the mean otherwise). One output row is then emitted per sensitive column and per distinct value
    of that column inside the partition, holding the aggregated feature values, the sensitive value and its count.

    :param                df: The dataframe to be anonymized.
    :param        partitions: A list of indices for each partition of the dataframe.
    :param   feature_columns: A list of feature column names to aggregate.
    :param sensitive_columns: A list of sensitive column names to aggregate.
    :param    max_partitions: Optional, maximum number of partitions to process.
    :returns                : An anonymized dataframe with aggregated feature and sensitive columns.
    """
    aggregations = {
        column: agg_categorical_column if column in categorical else agg_numerical_column
        for column in feature_columns
    }
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        # `i` partitions are already done at this point, so `>=` stops after exactly
        # max_partitions of them (the former `>` processed one partition too many).
        if max_partitions is not None and i >= max_partitions:
            break
        # Select the partition's rows once and reuse them for every aggregation below.
        partition_rows = df.loc[partition]
        # No extra keyword arguments: DataFrame.agg documents them as passed to the
        # aggregation functions, which only accept the series.
        grouped_columns = partition_rows.agg(aggregations)
        values = grouped_columns.to_dict()
        # Iterate through each sensitive column and aggregate counts
        for sensitive_column in sensitive_columns:
            sensitive_counts = partition_rows.groupby(sensitive_column).agg({sensitive_column: 'count'})
            for sensitive_value, count in sensitive_counts[sensitive_column].items():
                if count == 0:
                    continue
                rows.append({
                    **values,
                    sensitive_column: sensitive_value,
                    'count': count,
                })
    return pd.DataFrame(rows)

In [586]:
# Quasi-identifiers that should be taken into account when partitioning
quasi_identifiers = [
    'region', 'gender', 'age', 'height', 'weight',
    'eat', 'schedule', 'howlong',
]

# Quasi-identifiers that hold categories rather than numbers
categorical = {'schedule', 'howlong', 'region', 'gender', 'eat'}

# Sensitive values that should be taken into account
# NOTE(review): 'athlete_id' looks like a per-athlete identifier rather than a
# sensitive attribute - confirm that it belongs in this list.
sensitive_columns = [
    'athlete_id', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'run400',
    'run5k', 'candj', 'snatch', 'deadlift', 'backsq', 'pullups',
    'retrieved_datetime',
]

# Store every categorical quasi-identifier with the pandas 'category' dtype
for name in categorical:
    df[name] = df[name].astype('category')

In [587]:
# Spans of the quasi-identifiers over the whole dataset (every index label is in the partition)
full_spans = get_spans(df, partition=df.index)

In [588]:
# Partition on the quasi-identifiers, scaling spans by the full-dataset spans and
# using is_k_anonymous (with its default k) as the validity check
finished_partitions = partition_dataset(
    df,
    feature_columns=quasi_identifiers,
    sensitive_column=sensitive_columns,
    scale=full_spans,
    is_valid=is_k_anonymous,
)
len(finished_partitions)

75326

In [589]:
# Aggregate every finished partition into the anonymised table and write it to disk
dfn = build_anonymized_dataset(
    df=df,
    partitions=finished_partitions,
    feature_columns=quasi_identifiers,
    sensitive_columns=sensitive_columns,
)

dfn.to_csv("k_anon.csv")

Finished 1 partitions...
Finished 101 partitions...
Finished 201 partitions...
Finished 301 partitions...
Finished 401 partitions...
Finished 501 partitions...
Finished 601 partitions...
Finished 701 partitions...
Finished 801 partitions...
Finished 901 partitions...
Finished 1001 partitions...
Finished 1101 partitions...
Finished 1201 partitions...
Finished 1301 partitions...
Finished 1401 partitions...
Finished 1501 partitions...
Finished 1601 partitions...
Finished 1701 partitions...
Finished 1801 partitions...
Finished 1901 partitions...
Finished 2001 partitions...
Finished 2101 partitions...
Finished 2201 partitions...
Finished 2301 partitions...
Finished 2401 partitions...
Finished 2501 partitions...
Finished 2601 partitions...
Finished 2701 partitions...
Finished 2801 partitions...
Finished 2901 partitions...
Finished 3001 partitions...
Finished 3101 partitions...
Finished 3201 partitions...
Finished 3301 partitions...
Finished 3401 partitions...
Finished 3501 partitions...
Fini