In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from itertools import combinations


# Root folder for everything this notebook writes; the per-dataset sub-folders
# are created beneath it further down.
out_dir = './outlier_removal_none/'
# Create it up front; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(out_dir, exist_ok=True)


# ---- helper functions -------
def save_pickle(obj, filepath):
    """Serialise ``obj`` with pickle and write it to ``filepath``.

    The file is opened in binary write mode, so an existing file at that
    location is overwritten. Returns ``None``.
    """
    with open(filepath, mode='wb') as handle:
        pickle.dump(obj, handle)


def load_pickle(filepath):
    """Read the pickle stored at ``filepath`` and return the restored object.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filepath, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored


# Load the preprocessed liver data; the path is relative to the working directory.
df = pd.read_csv('./liver_preprocessed.csv')
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()


Unnamed: 0,gender,age,label,alanine aminotransferase,albumin,alkaline phosphatase,aspartate aminotransferase,bilirubin,cholesterol,cholinesterase,creatinine,gamma-glutamyl transferase,total protein
0,M,19,hepatitis,87.0,4.1,,67.0,0.70164,0.100854,7.55,0.70122,65.0,7.5
1,M,23,hepatitis,38.9,4.7,19.1,164.2,0.99399,0.082752,7.09,0.896883,90.4,7.01
2,M,25,hepatitis,63.3,4.2,38.2,187.7,0.81858,0.110681,6.0,0.756639,40.2,7.05
3,M,27,hepatitis,10.5,4.5,27.5,37.8,0.5847,0.082752,8.77,0.624312,35.9,7.45
4,M,29,fibrosis,2.4,4.1,43.1,83.5,0.35082,0.140161,11.49,0.624312,130.0,6.65


In [2]:
# Every column from position 3 onward is treated as an analyte.
# NOTE(review): this positional slice presumably relies on the first three
# columns being gender, age and label -- confirm against the CSV, since an
# extra leading column would shift the selection.
analytes = df.columns[3:]
# Distinct values of the 'gender' column; both dataset loops below iterate over them.
genders = df['gender'].unique()


In [3]:
# Show the distinct gender codes found in the data.
genders


array(['M', 'F'], dtype=object)

In [3]:
# create 1d test dataset
# Three parallel accumulators filled by the loop further down in this cell:
# entry i of each list belongs to the same (analyte, gender) combination.
samples, targets, metadata = [], [], []


def direct_method(data):
    """Return the 2.5% and 97.5% quantiles of ``data`` as a NumPy array.

    The result is ordered ``[lower, upper]``.

    Note: a later cell of this notebook rebinds the name ``direct_method`` to
    a different (2d) estimator, so this definition is only in effect until
    that cell has run.
    """
    # A single call evaluates both cut points at once.
    return np.quantile(data, [0.025, 0.975])


# Destination folder for the 1d dataset. out_dir already ends with a slash, so
# the path is assembled with os.path.join to avoid a doubled separator ("//").
out_dir_sub = os.path.join(out_dir, "1d")
os.makedirs(out_dir_sub, exist_ok=True)

# The rows of each gender do not depend on the analyte, so filter the frame
# once per gender here instead of twice per (analyte, gender) iteration.
rows_by_gender = {g: df[df['gender'] == g] for g in genders}

# Analyte-major, gender-minor order. samples, targets and metadata are appended
# in lockstep, so position i in each of them describes the same combination.
for an in analytes:
    for g, subset in rows_by_gender.items():

        # get values and labels
        sample = subset[an].to_numpy()
        labels = subset['label'].to_numpy()

        # drop nan (the labels are filtered with the same mask to stay aligned)
        mask = ~np.isnan(sample)
        sample = sample[mask]
        labels = labels[mask]

        # the stored sample contains every label ...
        samples.append(sample)
        metadata.append({
            'analyte': an,
            'gender': g
        })
        # ... while the target is computed from the rows labelled 'reference' only
        targets.append(direct_method(sample[labels == 'reference']))

# samples and targets are pickled as Python lists; the metadata goes to CSV
# (with the default DataFrame index as its first column).
save_pickle(samples, os.path.join(out_dir_sub, "samples.pkl"))
save_pickle(targets, os.path.join(out_dir_sub, "targets.pkl"))
pd.DataFrame(metadata).to_csv(os.path.join(out_dir_sub, "metadata.csv"))


In [4]:
# create 2d test dataset
# Fresh parallel accumulators for this cell's loop: entry i of each list
# belongs to the same (analyte pair, gender) combination.
samples, targets, metadata = [], [], []


def direct_method(data):
    """Return ``[mean, covariance]`` of ``data``.

    The mean is taken along axis 0 and the covariance is computed on the
    transposed input, i.e. rows of ``data`` are observations and columns are
    variables. The result is a two-element Python list.

    Note: this replaces the quantile-based ``direct_method`` defined in an
    earlier cell of this notebook.
    """
    centre = np.mean(data, axis=0)
    spread = np.cov(data.T)
    return [centre, spread]


# Destination folder for the 2d dataset. out_dir already ends with a slash, so
# the path is assembled with os.path.join to avoid a doubled separator ("//").
out_dir_sub = os.path.join(out_dir, "2d")
os.makedirs(out_dir_sub, exist_ok=True)

# The rows of each gender do not depend on the analyte pair, so filter the
# frame once per gender here instead of twice per (pair, gender) iteration.
rows_by_gender = {g: df[df['gender'] == g] for g in genders}

# Pair-major, gender-minor order. samples, targets and metadata are appended
# in lockstep, so position i in each of them describes the same combination.
for pair in combinations(analytes, 2):
    for g, subset in rows_by_gender.items():
        # get values and labels
        sample = subset[list(pair)].to_numpy()
        labels = subset['label'].to_numpy()

        # drop every row in which either analyte is nan (labels stay aligned)
        mask = ~np.isnan(sample).any(axis=1)
        sample = sample[mask]
        labels = labels[mask]

        # for 2d liver data, all prediction and evaluation is done in log space
        # NOTE(review): zero or negative values would become -inf/nan here and
        # propagate into the mean/covariance target -- confirm none occur.
        sample = np.log(sample)

        # the stored sample contains every label ...
        samples.append(sample)
        metadata.append({
            'analyte_pair': pair,
            'gender': g
        })
        # ... while the target is computed from the rows labelled 'reference' only
        targets.append(direct_method(sample[labels == 'reference']))

# samples and targets are pickled as Python lists; the metadata goes to CSV
# (with the default DataFrame index as its first column).
save_pickle(samples, os.path.join(out_dir_sub, "samples.pkl"))
save_pickle(targets, os.path.join(out_dir_sub, "targets.pkl"))
pd.DataFrame(metadata).to_csv(os.path.join(out_dir_sub, "metadata.csv"))

