In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import skew, boxcox


# Root folder for the artifacts this notebook writes; created here if it does
# not exist yet (exist_ok=True makes re-running this cell harmless).
out_dir = './outlier_removal_panelwise/'
os.makedirs(out_dir, exist_ok=True)


# ---- helper functions -------
def save_pickle(obj, filepath):
    """
    Serialize an object to disk with pickle.

    Args:
        obj: any picklable Python object
        filepath: destination path; opened in binary write mode, so an
            existing file at that path is overwritten
    """
    with open(filepath, 'wb') as handle:
        pickle.Pickler(handle).dump(obj)


def load_pickle(filepath):
    """
    Read back an object that was stored with pickle.

    Args:
        filepath: path of the pickle file; opened in binary read mode
    Returns:
        the first object stored in the file
    """
    # Unpickling can execute arbitrary code: only load files you trust.
    with open(filepath, 'rb') as handle:
        restored = pickle.Unpickler(handle).load()
    return restored


def flag_outliers(data, iqr_factor=1.5, skew_threshold=1.0):
    """
    Flags outliers using Tukey's rule, applied per variable:
        < q1 - iqr_factor*iqr
        > q3 + iqr_factor*iqr

    Each variable is Box-Cox transformed (scipy.stats.boxcox, lambda fitted
    by scipy) before the quartiles and fences are computed. The skewness
    that decides which tail is filtered is measured on the untransformed
    variable. A sample is flagged as soon as it is an outlier in at least
    one variable.

    Args:
        data: numpy array (n_samples, n_variables); a 1d array is treated as
            a single variable. scipy.stats.boxcox requires strictly positive
            input.
        iqr_factor: multiplier of the interquartile range used for the
            fences (default 1.5, Tukey's rule)
        skew_threshold: absolute skewness from which only one tail is
            filtered (default 1.0)
    Returns:
        is_outlier: boolean array (n_samples,) of predicted outliers
    Raises:
        Whatever scipy.stats.boxcox raises for input it rejects (it is
        documented to require positive data).
    """
    data = np.asarray(data, dtype=float)
    if data.ndim == 1:
        # a single variable: view it as one column
        data = data[:, np.newaxis]

    is_outlier = np.zeros(data.shape[0], dtype=bool)
    for i in range(data.shape[1]):
        column = data[:, i]
        skewness = skew(column)  # skewness of the raw (untransformed) values
        tdata, _ = boxcox(column)
        q1 = np.quantile(tdata, 0.25)
        q3 = np.quantile(tdata, 0.75)
        iqr = q3 - q1
        below = tdata < (q1 - (iqr * iqr_factor))
        above = tdata > (q3 + (iqr * iqr_factor))
        if skewness >= skew_threshold:  # if positively skewed, only remove from right tail
            outlier_i = above
        elif skewness <= -skew_threshold:  # if negatively skewed, only remove from left tail
            outlier_i = below
        else:
            outlier_i = below | above
        is_outlier = is_outlier | outlier_i
    return is_outlier


# load data
# Read the preprocessed liver table from CSV into `df`; the trailing
# expression displays its first rows as the cell output.
df = pd.read_csv('./liver_preprocessed.csv')
df.head()


Unnamed: 0,gender,age,label,alanine aminotransferase,albumin,alkaline phosphatase,aspartate aminotransferase,bilirubin,cholesterol,cholinesterase,creatinine,gamma-glutamyl transferase,total protein
0,M,19,hepatitis,87.0,4.1,,67.0,0.70164,0.100854,7.55,0.70122,65.0,7.5
1,M,23,hepatitis,38.9,4.7,19.1,164.2,0.99399,0.082752,7.09,0.896883,90.4,7.01
2,M,25,hepatitis,63.3,4.2,38.2,187.7,0.81858,0.110681,6.0,0.756639,40.2,7.05
3,M,27,hepatitis,10.5,4.5,27.5,37.8,0.5847,0.082752,8.77,0.624312,35.9,7.45
4,M,29,fibrosis,2.4,4.1,43.1,83.5,0.35082,0.140161,11.49,0.624312,130.0,6.65


In [2]:
# Number of rows in the table as loaded (before NaN / outlier filtering).
len(df)


612

In [3]:
# Drop every row that has a missing value in any column. `.copy()` makes
# `df_or` an independent frame, so `df` keeps all of its rows.
df_or = df.dropna().copy()


In [4]:
# Columns from position 3 onward are used as the analytes.
# NOTE(review): the slice is positional - it assumes the first three columns
# of `df` are non-analyte metadata; confirm against the CSV layout.
analytes = df.columns[3:]
# Distinct values present in the 'gender' column.
genders = df['gender'].unique()


In [5]:
# remove outliers across panel
# Rebuild the NaN-free frame from `df` first, so that re-running this cell
# does not apply the outlier filter to an already filtered `df_or`.
df_or = df.dropna().copy()
# A row is dropped when flag_outliers marks it for at least one of the
# columns from position 3 onward.
mask = flag_outliers(df_or.iloc[:, 3:].values)
df_or = df_or.iloc[~mask]


In [6]:
# create 1d test dataset
# Three independent accumulators, filled in parallel (one entry per
# analyte/gender combination) by the loop further down.
samples, targets, metadata = [], [], []


def direct_method(data):
    """
    Direct estimate of a reference interval: the 2.5% and 97.5% quantiles.

    Missing values (NaN) are ignored, so a few missing measurements do not
    turn the whole interval into NaN. For NaN-free input the result equals
    np.quantile at the same levels.

    Args:
        data: array-like of measurements (flattened by numpy)
    Returns:
        numpy array [lower, upper]
    """
    return np.nanquantile(data, [0.025, 0.975])


# Everything produced for the 1d dataset goes into this sub-folder.
out_dir_sub = f"{out_dir}/1d"
os.makedirs(out_dir_sub, exist_ok=True)

# One entry per (analyte, gender) combination, appended in lock-step to
# targets / samples / metadata.
for an in analytes:
    for g in genders:
        # target: direct estimate from the rows of `df` labelled 'reference'
        # for this gender
        is_reference = (df['gender'] == g) & (df['label'] == 'reference')
        reference_values = df[is_reference][an].to_numpy()
        targets.append(direct_method(reference_values))

        # sample: every remaining row of this gender in the filtered frame
        sample = df_or[df_or['gender'] == g][an].to_numpy()
        samples.append(sample)

        metadata.append({'analyte': an, 'gender': g})

# Persist the three aligned collections.
save_pickle(samples, f"{out_dir_sub}/samples.pkl")
save_pickle(targets, f"{out_dir_sub}/targets.pkl")
pd.DataFrame(metadata).to_csv(f"{out_dir_sub}/metadata.csv")


In [7]:
# create 2d test dataset
# Fresh accumulators for the 2d dataset (one entry per analyte pair/gender
# combination); these replace the lists used for the 1d dataset.
samples, targets, metadata = [], [], []


def direct_method(data):
    """
    Direct estimate of the mean vector and covariance matrix of the data.

    Observations that contain a missing value (NaN) are dropped first, so a
    single missing measurement does not turn the whole estimate into NaN.
    For NaN-free input the result equals np.mean(data, axis=0) and
    np.cov(data.T).

    Args:
        data: numpy array (n_samples, n_variables)
    Returns:
        list [mean, covariance]
    """
    data = np.asarray(data, dtype=float)
    if data.ndim == 1:
        complete = ~np.isnan(data)
    else:
        # keep only the rows in which every variable is present
        complete = ~np.isnan(data).any(axis=1)
    data = data[complete]
    return [np.mean(data, axis=0), np.cov(data.T)]


# Everything produced for the 2d dataset goes into this sub-folder.
out_dir_sub = f"{out_dir}/2d"
os.makedirs(out_dir_sub, exist_ok=True)

# One entry per (analyte pair, gender) combination, appended in lock-step to
# targets / samples / metadata.
for pair in combinations(analytes, 2):
    pair_columns = list(pair)
    for g in genders:
        # target: direct estimate from the rows of `df` labelled 'reference'
        # for this gender
        # NOTE(review): the target is computed from the untransformed values
        # while the sample below is log-transformed - confirm that the
        # downstream evaluation expects this.
        is_reference = (df['gender'] == g) & (df['label'] == 'reference')
        reference_values = df[is_reference][pair_columns].to_numpy()
        targets.append(direct_method(reference_values))

        # for 2d liver data, all prediction and evaluation is done in log space
        sample = np.log(df_or[df_or['gender'] == g][pair_columns].to_numpy())
        samples.append(sample)

        metadata.append({'analyte_pair': pair, 'gender': g})

# Persist the three aligned collections.
save_pickle(samples, f"{out_dir_sub}/samples.pkl")
save_pickle(targets, f"{out_dir_sub}/targets.pkl")
pd.DataFrame(metadata).to_csv(f"{out_dir_sub}/metadata.csv")

