In [40]:
import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

# Base path where to find the dataset
base_path = "../data/Arrhythmia"
# Base path where processed dataset will be stored
export_path = "../data/Arrhythmia"
# Name of the file summarizing the preprocessing
info_fname = "arrhythmia_info.csv"
# File name of the original dataset
fname = "arrhythmia.mat"
# File name of the cleaned/processed dataset
export_fname = "arrhythmia.npy"
# Used to track preprocessing steps
stats = defaultdict()

In [41]:
data = sio.loadmat(base_path + "/" + fname)
X = np.concatenate((data['X'], data['y']), axis=1)
stats["n_dropped_cols"] = 0
stats["n_instances"] = X.shape[0]
stats["n_features"] = X.shape[1] - 1
stats["anomaly_ratio"] = "{:2.4f}".format((X == 1).sum() / len(X))
X

array([[ 75. ,   0. , 190. , ...,  23.3,  49.4,   1. ],
       [ 56. ,   1. , 165. , ...,  20.4,  38.8,   0. ],
       [ 54. ,   0. , 172. , ...,  12.3,  49. ,   0. ],
       ...,
       [ 36. ,   0. , 166. , ..., -44.2, -33.2,   0. ],
       [ 32. ,   1. , 155. , ...,  25. ,  46.6,   0. ],
       [ 78. ,   1. , 160. , ...,  21.3,  32.8,   0. ]])

## Check unique values

In [42]:
df = pd.DataFrame(X)
uniq_cols = df.columns[df.nunique() <= 1].tolist()
stats["n_unique_cols"] = len(uniq_cols)
if uniq_cols:
    print("Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols))
    stats["unique_cols"] = ", ".join([str(col) for col in uniq_cols])
    df.drop(uniq_cols, axis=1, inplace=True)
    stats["n_dropped_cols"] += len(uniq_cols)
    uniq_cols = df.columns[df.nunique() <= 1].tolist()
assert len(uniq_cols) == 0, "Found columns with unique values: {}".format(uniq_cols)
print("Columns are valid with more than one distinct value")

Found 17 columns with unique values: [14, 62, 64, 78, 126, 127, 134, 136, 138, 140, 146, 151, 152, 159, 199, 259, 269]
Columns are valid with more than one distinct value


## Check for NaN/invalid values

In [43]:
df.replace([-np.inf, np.inf], np.nan, inplace=True)
nan_cols = df.columns[df.isna().sum() > 0].tolist()
stats["n_nan_cols"] = len(nan_cols)
if nan_cols:
    print("Found {} columns with NaN values: {}".format(len(nan_cols), nan_cols))
    stats["nan_cols"] = ", ".join([str(col) for col in nan_cols])
assert len(nan_cols) == 0, "Found NaN columns: {}".format(nan_cols)
print("Dataset has no NaN or +- INF values")

Dataset has no NaN or +- INF values


## Store processed dataset in a compressed file

In [45]:
np.save(base_path + "/" + export_fname, df.to_numpy())
print("Processed data saved under: {}".format(base_path + "/" + export_fname))

Processed data saved under: ../data/Arrhythmia/arrhythmia.npy


## Store basic information

In [46]:
stats_df = pd.DataFrame(stats, index=[0])
stats_df.to_csv(export_path + "/" + info_fname, index=False)
print("Processing summary saved under: {}".format(base_path + "/" + info_fname))
stats_df

Processing summary saved under: ../data/Arrhythmia/arrhythmia_info.csv


Unnamed: 0,n_dropped_cols,n_instances,n_features,anomaly_ratio,n_unique_cols,unique_cols,n_nan_cols
0,17,452,274,2.3916,17,"14, 62, 64, 78, 126, 127, 134, 136, 138, 140, ...",0
