In [1]:
import numpy as np
import pandas as pd


In [2]:
def preproceesing(df):

    def group_edu(x):
            if x <= 5:
                return "<6"
            elif x >= 13:
                return ">12"
            else:
                return x

    def age_cut(x):
        if x >= 70:
            return ">=70"
        else:
            return x

    def group_race(x):
        if x == "White":
            return 1.0
        else:
            return 0.0

    # Cluster education and age attributes.
    # Limit education range
    df["education-num"] = df["education-num"].apply(lambda x: group_edu(x))
    df["education-num"] = df["education-num"].astype("category")

    # Limit age range
    df["age"] = df["age"].astype(int)
    df["age"] = df["age"].apply(lambda x: x // 10 * 10)
    df["age"] = df["age"].apply(lambda x: age_cut(x))

    # Group race
    df["race"] = df["race"].apply(lambda x: group_race(x))

    return df

In [3]:
def preproceesing_v2(train,test,
    protected_attribute_name="",
    privileged_classes=[],
    missing_value=[],
    features_to_drop=[],
    categorical_features=[],
    favorable_classes=[],
    normalize=True,
):
    cols = [
        x
        for x in train.columns
        if x
        not in (
            features_to_drop
            + [protected_attribute_name]
            + categorical_features
            + ["result"]
        )
    ]

    result = []
    for df in [train, test]:
        # drop useless features
        df = df.drop(columns=features_to_drop)

        # create one-hot encoding of categorical features
        df = pd.get_dummies(df, columns=categorical_features, prefix_sep="=")

        # map protected attributes to privileged or unprivileged
        pos = np.logical_or.reduce(
            np.equal.outer(privileged_classes, df[protected_attribute_name].values)
        )
        df.loc[pos, protected_attribute_name] = 1
        df.loc[~pos, protected_attribute_name] = 0
        df[protected_attribute_name] = df[protected_attribute_name].astype(int)

        # set binary labels
        pos = np.logical_or.reduce(
            np.equal.outer(favorable_classes, df["result"].values)
        )
        df.loc[pos, "result"] = 1
        df.loc[~pos, "result"] = 0
        df["result"] = df["result"].astype(int)

        result.append(df)

    # standardize numeric columns
    for col in cols:
        data = result[0][col].tolist()
        mean = np.mean(data)
        std = np.std(data)
        result[0][col] = (result[0][col] - mean) / std
        result[1][col] = (result[1][col] - mean) / std

    train = result[0]
    test = result[1]
    for col in train.columns:
        if col not in test.columns:
            test[col] = 0
    cols = train.columns
    test = test[cols]
    assert all(
        train.columns[i] == test.columns[i] for i in range(len(train.columns))
    )

    return train, test

In [4]:
column_names = [
                "age",
                "workclass",
                "fnlwgt",
                "education",
                "education-num",
                "marital-status",
                "occupation",
                "relationship",
                "race",
                "sex",
                "capital-gain",
                "capital-loss",
                "hours-per-week",
                "native-country",
                "result",
            ]

dataframe_train = pd.read_csv('./adult.data', names=column_names)
dataframe_test = pd.read_csv('./adult.test', names=column_names)
# dataframe_test
# dataframe_train

dataframe_test.isin(['?']).sum(axis=0)
dataframe_test['native-country'] = dataframe_test['native-country'].replace('?', np.nan)
dataframe_test['workclass'] = dataframe_test['workclass'].replace('?', np.nan)
dataframe_test['occupation'] = dataframe_test['occupation'].replace('?', np.nan)
dataframe_test.dropna(how='any', inplace=True)

preproceesing(dataframe_train)
preproceesing(dataframe_test)
# dataframe_train

categorical_features = [
                "workclass",
                "education",
                "age",
                "race",
                "education-num",
                "marital-status",
                "occupation",
                "relationship",
                "native-country",
            ]

protected_attribute_name = "sex"
privileged_classes = ["Male"]
missing_value=["?"]
features_to_drop=["fnlwgt"]
categorical_features=categorical_features
favorable_classes=[">50K", ">50K."]
col = dataframe_train.columns.values
fair_variables = [ele for ele in col if "occupation" in ele]

train,test=preproceesing_v2(dataframe_train,dataframe_test,
                protected_attribute_name,privileged_classes,
                missing_value,features_to_drop,categorical_features,
                favorable_classes)


display(train)
display(test)


Unnamed: 0,sex,capital-gain,capital-loss,hours-per-week,result,workclass= ?,workclass= Federal-gov,workclass= Local-gov,workclass= Never-worked,workclass= Private,...,native-country= Portugal,native-country= Puerto-Rico,native-country= Scotland,native-country= South,native-country= Taiwan,native-country= Thailand,native-country= Trinadad&Tobago,native-country= United-States,native-country= Vietnam,native-country= Yugoslavia
0,0,0.148453,-0.21666,-0.035429,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,-0.145920,-0.21666,-2.222153,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,-0.145920,-0.21666,-0.197409,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32557,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32558,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32559,0,-0.145920,-0.21666,-1.655225,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,sex,capital-gain,capital-loss,hours-per-week,result,workclass= ?,workclass= Federal-gov,workclass= Local-gov,workclass= Never-worked,workclass= Private,...,native-country= Portugal,native-country= Puerto-Rico,native-country= Scotland,native-country= South,native-country= Taiwan,native-country= Thailand,native-country= Trinadad&Tobago,native-country= United-States,native-country= Vietnam,native-country= Yugoslavia
1,0,-0.145920,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,-0.145920,-0.21666,0.774468,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,-0.145920,-0.21666,-0.035429,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0.895083,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,0,-0.145920,-0.21666,-0.845327,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16277,0,-0.145920,-0.21666,-0.359389,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
16278,0,-0.145920,-0.21666,-0.035429,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,0,-0.145920,-0.21666,0.774468,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
16280,0,0.592721,-0.21666,-0.035429,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
