# Preprocessing and splitting the dataset

## Load the data

In [None]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold

In [None]:
import warnings
# Suppress every RuntimeWarning for the rest of the session; per the original
# author these are triggered by NaN values in the data.
warnings.simplefilter(action = "ignore", category = RuntimeWarning) #ignore runtime warnings, which occur due to the presence of NaN values

In [None]:
# Load the raw dataset; the path is relative to the current working directory.
data = pd.read_csv('adult.csv')

In [None]:
# Display the (rows, columns) dimensions of the loaded table.
data.shape

In [None]:
# Recode cells equal to the '?' placeholder as NaN, modifying `data` in place,
# so that the pandas missing-value tools used below can see them.
data.replace('?', np.nan, inplace=True)

In [None]:
# Display the number of missing values in each column.
data.isna().sum()

In [None]:
# Visualize the features: draw one histogram per column on a 10x10 inch figure.
data.hist(figsize=(10, 10))

In [None]:
# Print the frequency of every level for a selection of the categorical
# features, each preceded by a banner naming the feature.
features_to_inspect = ('workclass', 'education', 'marital-status', 'occupation', 'relationship')
for feature in features_to_inspect:
    print(f'------------ {feature} ---------------')
    print(data[feature].value_counts())

## Encode categorical and ordinal columns

Currently, we encode education as categorical: there is no obvious way to rank "some college" against the two associate's degrees, so the levels cannot be placed on a single ordinal scale.

In [None]:
# Names of the columns that are treated as categorical features.
cols_cat = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'native-country',
    'race',
    'sex',
]

# The prediction target.
outcome = data['income']

# Categorical features are kept as labels for now; every remaining column
# except the outcome is treated as numerical and converted to the best
# matching pandas dtype.
features_cat = data[cols_cat]
features_numerical = data.drop(columns=[*cols_cat, 'income']).convert_dtypes()

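For categorical values in the input features, we use one-hot encoding. We first check how many values are missing in these columns; any missing values are encoded as a separate `nan` indicator column, which is dealt with further below.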

In [None]:
# Display the number of missing values in each categorical feature.
features_cat.isnull().sum()

In [None]:
# Dense integer one-hot encoder for the categorical features.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, dtype=int)
except TypeError:
    ohe = OneHotEncoder(sparse=False, dtype=int)


In [None]:
# Learn the levels of every categorical feature and encode them in one step.
cat_array = ohe.fit_transform(features_cat)

In [None]:
# Display the names the encoder assigned to the generated one-hot columns.
ohe.get_feature_names_out()

In [None]:
# Wrap the encoded array in a DataFrame labelled with the encoder's column names.
feature_labels = ohe.get_feature_names_out()
features_cat_onehot = pd.DataFrame(cat_array, columns=feature_labels)

In [None]:
# Display the one-hot encoded categorical features.
features_cat_onehot

In [None]:
# features_cat_onehot.hist(figsize=(30, 30))

In [None]:
# Columns that flag a missing value are named "<feature>_nan". Match on that
# exact suffix rather than on the substring "nan", so that a category whose
# name merely contains those letters (e.g. "Finance") is never mistaken for a
# missing-value indicator.
nan_indicators = features_cat_onehot.columns[features_cat_onehot.columns.str.endswith('_nan')]

In [None]:
# For every row in which a feature was missing, blank out all one-hot columns
# of that feature so the row reads as "unknown" instead of as all zeros.
for indicator in nan_indicators:
    feature = indicator.split('_nan')[0]
    # Select by the literal "<feature>_" prefix: a substring/regex match could
    # also pick up columns of another feature whose name contains this one.
    other_indicators = features_cat_onehot.columns[
        features_cat_onehot.columns.str.startswith(feature + '_')
    ]
    missing_mask = features_cat_onehot[indicator] == 1
    # Integer columns cannot hold NaN. Cast the affected columns to float
    # explicitly instead of relying on pandas silently upcasting during the
    # assignment, which is deprecated in recent pandas releases.
    features_cat_onehot = features_cat_onehot.astype(
        {column: float for column in other_indicators}
    )
    features_cat_onehot.loc[missing_mask, other_indicators] = np.nan

In [None]:
#TODO: verify - would imputation drop these indicators, or keep them in addition?
# Remove all missing-value indicator columns from the one-hot frame in a
# single in-place call.
features_cat_onehot.drop(columns=nan_indicators, inplace=True)

### Collect into a single DataFrame

In [None]:
# One-hot encoded categorical features plus the outcome column.
df_onehot = pd.concat([features_cat_onehot, outcome], axis=1)

# Numerical features plus the categorical features in their original label
# form, followed by the outcome column.
features = features_numerical.join([features_cat])
df = pd.concat([features, outcome], axis=1)

# Numerical features, one-hot categorical features and the outcome together.
full_df = pd.concat([features_numerical, features_cat_onehot, outcome], axis=1)

In [None]:
# Display the frame that keeps the categorical features as labels.
df

In [None]:
# Display the frame holding the one-hot columns and the outcome.
df_onehot

Some of the code requires encoding the categorical columns (factors) with numerical levels. To ensure consistency, we save a list of all the levels of these columns.

In [None]:
# For every categorical column (in alphabetical order) record the sorted list
# of levels that actually occur, ignoring missing values.
levels = []
for col in sorted(cols_cat):
    observed = df[col].dropna().unique()
    levels.append((col, sorted(observed)))

# Persist the levels so other code can encode the factors consistently.
with open("factor_levels.json", "w", encoding="UTF-8") as levelsfile:
    levelsfile.write(json.dumps(levels))

Some of the imputation code requires knowing which columns are categorical and ordinal, so we store this information.  We now include yes/no (or similar) columns in the list of categorical columns.

One of the imputation methods (MissForest) requires encoding the one-hot columns as a single ordinal column; we also determine the column numbers of the categorical and ordinal columns for this encoded version.  For this purpose, we use a standalone variant of the `onehot_to_ord_multicat` method from `data_loader.py` that just returns the columns in the encoded version.  It does more than strictly needed, but that is to ensure it behaves as the `data_loader.py` method does.  Furthermore, the imputation methods only see the non-outcome columns, so we remove the outcome column before performing the encoding.  (Note: the cell below currently passes `df_onehot`, which still contains the `income` column, so the saved column names include the outcome; this should be checked against the intent described here.)

In [None]:
def get_encoders(factor_levels):
    """Build one-hot and ordinal encoders for a fixed set of factor levels.

    Parameters
    ----------
    factor_levels
        Sequence of ``(column name, list of levels)`` pairs; this should be the
        `levels` variable created above. Every list of levels must be
        non-empty, because its first entry is used to build a dummy row.

    Returns
    -------
    dict
        ``"cat_colnames"``: the factor (column) names, in input order;
        ``"onehot_encoder"``: a fitted ``OneHotEncoder`` with dense output;
        ``"encoded_colnames"``: the one-hot column names the encoder produces;
        ``"ordinal_encoder"``: a fitted ``OrdinalEncoder`` using the same levels.
    """
    factors = [fl[0] for fl in factor_levels]
    levels = [fl[1] for fl in factor_levels]

    # sklearn requires us to fit a non-empty DataFrame even if we specify all
    # the levels, so fit on a single row holding the first level of each factor
    dummy_df = pd.DataFrame({fl[0]: [fl[1][0]] for fl in factor_levels})
    cat_colnames = factors

    # building the model for transformations
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the current name first and fall back for older releases.
    try:
        ohe = OneHotEncoder(categories=levels, sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(categories=levels, sparse=False)
    onehot_encoder = ohe.fit(dummy_df)
    encoded_colnames = ohe.get_feature_names_out(factors)

    # building the ordinal encoder with the same level ordering
    orde = OrdinalEncoder(categories=levels)
    ordinal_encoder = orde.fit(dummy_df)

    return {
        "cat_colnames": cat_colnames,
        "onehot_encoder": onehot_encoder,
        "encoded_colnames": encoded_colnames,
        "ordinal_encoder": ordinal_encoder,
    }


def onehot_to_ord_columns(df, factor_levels):
    """Return the column names `df` has after ordinal re-encoding.

    The one-hot columns described by `factor_levels` are decoded back to
    labels, re-encoded as one ordinal column per factor, and appended after
    the untouched columns of `df`. Only the resulting column names are
    returned, as a list.
    """
    enc = get_encoders(factor_levels)
    factor_names = enc["cat_colnames"]
    onehot_names = enc["encoded_colnames"]

    # Collapse every group of one-hot columns back into a single label.
    labels = enc["onehot_encoder"].inverse_transform(df[onehot_names])
    # Wrapping the labels in a named DataFrame silences a warning in
    # ordinal_encoder.transform
    labels_df = pd.DataFrame(labels, columns=factor_names, index=df.index)

    codes = enc["ordinal_encoder"].transform(labels_df)
    codes_df = pd.DataFrame(codes, columns=factor_names, index=df.index)

    # Columns that were not one-hot encoded come first, then the ordinal ones.
    remaining = df.drop(onehot_names, axis=1)
    return list(pd.concat([remaining, codes_df], axis=1).columns)

In [None]:
# No column is treated as ordinal at the moment.
cols_ord = []

# --- one-hot layout -------------------------------------------------------
# NOTE(review): df_onehot is built with the outcome column attached, so it is
# part of both column-name lists stored below - confirm this is intended.
columns = list(df_onehot.columns)

# Position of every one-hot column whose name starts with a categorical
# feature name (one entry per matching feature name).
idx_cat = [
    idx
    for idx, col in enumerate(columns)
    for cat in cols_cat
    if col.startswith(cat)
]
idx_ord = [columns.index(col) for col in cols_ord]
idxs = {"onehot": [idx_cat, idx_ord]}

# --- ordinal-encoded layout -----------------------------------------------
# Rows with missing values are dropped before decoding the one-hot columns.
encoded_cols = onehot_to_ord_columns(df_onehot.dropna(), levels)#todo: more nuanced nan handling
idx_cat = [encoded_cols.index(col) for col in cols_cat]
idx_ord = [encoded_cols.index(col) for col in cols_ord]
idxs["encoded"] = [idx_cat, idx_ord]

# Keep the column names of both layouts next to the indices.
idxs["colnames"] = {"onehot": columns, "encoded": encoded_cols}

In [None]:
# Store the column indices and names as JSON for the imputation code.
with open("adult_cols.json", "w", encoding="UTF-8") as colsfile:
    colsfile.write(json.dumps(idxs))

We also save the complete resulting dataset for later use.

In [None]:
# Write both variants of the complete dataset (label-encoded and one-hot
# categorical columns) to CSV without the row index.
df.to_csv("complete_used.csv", index=False)
df_onehot.to_csv("complete_used_onehot.csv", index=False)

In [None]:
# Display the dtype of every column in the combined frame.
full_df.dtypes

In [None]:
# Write the combined frame (numerical + one-hot + outcome) to CSV without the
# row index. The path has no placeholders, so a plain string is used.
full_df.to_csv('./test.csv', index=False)

### Create training, validation and holdout sets

We use the one-hot encoded data to create the standard datasets.

In [None]:
# Directory that receives all split files.
outdir = Path('.')
outdir.mkdir(exist_ok=True)

n_splits = 10  # number of outer (holdout) splits
n_folds = 5    # number of inner train/validation folds per outer split
idx = np.arange(len(df))

kf_splits = KFold(n_splits=n_splits, random_state=1896, shuffle=True)

for holdout_num, (train_pos, test_pos) in enumerate(kf_splits.split(idx)):
    # Outer split: the test part becomes the holdout set, the rest is the
    # development set that is split again below.
    idx_train, idx_test = idx[train_pos], idx[test_pos]
    devel_fold = full_df.iloc[idx_train]
    test_fold = full_df.iloc[idx_test]

    test_fold.to_csv(outdir / f'holdout_{holdout_num}.csv', index=False)

    # Inner split: each holdout gets its own seed derived from its number.
    kf_folds = KFold(n_splits=n_folds, random_state=165782 * holdout_num, shuffle=True)
    idx_folds = np.arange(len(devel_fold))
    for fold_num, (fit_pos, val_pos) in enumerate(kf_folds.split(idx_folds)):
        train_fold = devel_fold.iloc[fit_pos]
        val_fold = devel_fold.iloc[val_pos]
        train_fold.to_csv(outdir / f'devel_{holdout_num}_train_{fold_num}.csv', index=False)
        val_fold.to_csv(outdir / f'devel_{holdout_num}_val_{fold_num}.csv', index=False)