In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
from collections import defaultdict

# Base path where to find the dataset
base_path = "../data/Thyroid"
# Base path where processed dataset will be stored
export_path = "../data/Thyroid"
# Name of the file summarizing the preprocessing
info_fname = "thyroid_info.csv"
# File name of the original dataset
fname = "thyroid.mat"
# Used to track preprocessing steps
stats = defaultdict()

In [2]:
data = sio.loadmat(base_path + "/" + fname)
X = np.concatenate((data['X'], data['y']), axis=1)
stats["n_instances"] = X.shape[0]
stats["n_features"] = X.shape[1] - 1
stats["anomaly_ratio"] = "{:2.4f}".format((X == 1).sum() / len(X))
X

array([[7.74193548e-01, 1.13207547e-03, 1.37571157e-01, ...,
        2.95774648e-01, 2.36065574e-01, 0.00000000e+00],
       [2.47311828e-01, 4.71698113e-04, 2.79886148e-01, ...,
        5.35211268e-01, 1.73770492e-01, 0.00000000e+00],
       [4.94623656e-01, 3.58490566e-03, 2.22960152e-01, ...,
        5.25821596e-01, 1.24590164e-01, 0.00000000e+00],
       ...,
       [9.35483871e-01, 2.45283019e-02, 1.60341556e-01, ...,
        3.75586854e-01, 2.00000000e-01, 0.00000000e+00],
       [6.77419355e-01, 1.47169811e-03, 1.90702087e-01, ...,
        3.23943662e-01, 1.95081967e-01, 0.00000000e+00],
       [4.83870968e-01, 3.56603774e-03, 1.90702087e-01, ...,
        3.38028169e-01, 1.63934426e-01, 0.00000000e+00]])

## Check unique values

In [3]:
df = pd.DataFrame(X)
uniq_cols = df.columns[df.nunique() <= 1].tolist()
stats["n_unique_cols"] = len(uniq_cols)
if uniq_cols:
    stats["unique_cols"] = uniq_cols
assert len(uniq_cols) == 0, "Found columns with unique values: {}".format(uniq_cols)
print("Columns are valid with more than one distinct value")

Columns are valid with more than one distinct value


## Check for NaN/invalid values

In [4]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)
nan_cols = df.columns[df.isna().sum() > 0].tolist()
stats["n_nan_cols"] = len(nan_cols)
if nan_cols:
    stats["nan_cols"] = nan_cols
assert len(nan_cols) == 0, "Found NaN columns: {}".format(nan_cols)
print("Dataset has no NaN or +- INF values")

Dataset has no NaN or +- INF values


## Check if scaling is required

In [5]:
assert np.allclose(df.max(axis=0).to_numpy(), 1.), "Found values greater than 1."
assert np.allclose(df.min(axis=0).to_numpy(), 0.), "Found values lesser than 0."
print("Data is already scaled between 0 and 1")

Data is already scaled between 0 and 1


## Store basic information

In [6]:
stats_df = pd.DataFrame(stats, index=[0])
stats_df.to_csv(export_path + "/" + info_fname, index=False)
print("Processing summary saved under: {}".format(base_path + "/" + info_fname))
stats_df

Processing summary saved under: ../data/Thyroid/thyroid_info.csv


Unnamed: 0,n_instances,n_features,anomaly_ratio,n_unique_cols,n_nan_cols
0,3772,6,0.0268,0,0
