In [1]:
import logging
import os
import re
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from scipy.io.arff import loadarff

In [2]:
input_dir = "../data/input/dami-base-processed-2000"
file_regex = re.compile('(.*withoutdupl_norm((_\d\d)|_v01|_catremoved)?.arff$)')

os.listdir(input_dir)

target_files = []

for root, dirs, files in os.walk(input_dir):
#     print(root)
    for file in files:
        target_files += [os.path.join(root, file)]

In [3]:
def load_file(file_name):
    data, meta = loadarff(file_name)
    data = pd.DataFrame(data, columns=meta.names())
    data['label'] = data['outlier'].apply(lambda x: 'outlier' if x == b"'yes'" else 'inlier')
    data = data.drop(columns=['id', 'outlier'])
    # reorder columns
    data = data[np.append([x for x in data.columns if x != 'label'], ['label'])]
    return data

In [4]:
res = []
for t in target_files:
    data_name = t.split("\\")[-1].split("_")[0]
    data = pd.read_csv(t, header=None)
    data.rename(columns={data.columns[-1]: "label" }, inplace = True)
    n, m = data.shape
    out_ratio = data['label'].value_counts(normalize=True)['outlier']
    out_n = data['label'].value_counts(normalize=False)['outlier']
    res.append([data_name, n, m, out_n, out_ratio])
    print(f'{data_name} & {n} & {m - 1} & {out_n} & {out_ratio:.2f} \\\\')

../data/input/dami-base-processed-2000/ALOI/ALOI & 2000 & 27 & 61 & 0.03 \\
../data/input/dami-base-processed-2000/Annthyroid/Annthyroid & 2000 & 21 & 150 & 0.07 \\
../data/input/dami-base-processed-2000/Arrhythmia/Arrhythmia & 450 & 259 & 206 & 0.46 \\
../data/input/dami-base-processed-2000/Cardiotocography/Cardiotocography & 2000 & 21 & 441 & 0.22 \\
../data/input/dami-base-processed-2000/Glass/Glass & 214 & 7 & 9 & 0.04 \\
../data/input/dami-base-processed-2000/HeartDisease/HeartDisease & 270 & 13 & 120 & 0.44 \\
../data/input/dami-base-processed-2000/Hepatitis/Hepatitis & 80 & 19 & 13 & 0.16 \\
../data/input/dami-base-processed-2000/InternetAds/InternetAds & 1966 & 1555 & 368 & 0.19 \\
../data/input/dami-base-processed-2000/Ionosphere/Ionosphere & 351 & 32 & 126 & 0.36 \\
../data/input/dami-base-processed-2000/KDDCup99/KDDCup99 & 2000 & 40 & 8 & 0.00 \\
../data/input/dami-base-processed-2000/Lymphography/Lymphography & 148 & 3 & 6 & 0.04 \\
../data/input/dami-base-processed-2000/Pa

In [5]:
data_stats = pd.DataFrame(res, columns = ['data_set_name', 'N', 'M', 'out_n', 'outlier_ratio'])
data_stats.to_pickle("../data/output/data_stats.pkl")

In [6]:
data_stats.to_latex('tables/data_stats.tex', float_format="{:0.2f}".format)