In [1]:
from argparse import Namespace
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Gaussian Datasets

In [26]:
# Settings for the penguin dataset: the short tag used in output file names,
# the name of the target column, and the location of the raw CSV.
name = "penguin"
y_name = "species"
path = "./../Datasets/Gaussian/Raw/penguins_size.csv"

In [15]:
# Read the raw penguin CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [17]:
# Count the missing values in each column before any cleaning.
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [18]:
# Remove rows in which every one of the four body measurements is missing.
# Rows that are missing only some of these measurements are retained.
measurement_cols = ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"]
df = df.dropna(subset=measurement_cols, how="all")

In [19]:
# Re-count missing values per column to confirm the effect of the row filter.
df.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  8
dtype: int64

In [22]:
# Separate the target from the features. The target column plus the "sex" and
# "island" columns are excluded from the feature matrix.
drop_columns = [y_name, "sex", "island"]

species_categorical = pd.Categorical(df[y_name])
y = species_categorical.codes  # integer code for each row's category
X = df.drop(drop_columns, axis="columns")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [23]:
# Standardise the features with a one-step pipeline. The scaler is fitted on
# the training rows only; the already-fitted pipeline is then applied to the
# test rows.
scaling_steps = [("scaler", StandardScaler())]
pipeline = Pipeline(steps=scaling_steps)

X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

In [24]:
# Display the integer-coded training labels produced by the stratified split.
y_train

array([0, 0, 0, 2, 1, 0, 2, 2, 1, 2, 0, 2, 1, 2, 0, 0, 2, 2, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0,
       0, 0, 0, 0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 1, 1,
       0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 1, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2,
       1, 1, 2, 2, 0, 1, 1, 2, 0, 1, 0, 0, 0, 2, 0, 2, 1, 2, 1, 2, 2, 1,
       0, 1, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 2, 0, 2, 0, 0, 0, 0, 0, 2,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 1, 1, 2, 2, 0, 0, 0, 0, 2, 2, 0, 2,
       0, 1, 2, 2, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0,
       2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 2, 2, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 2, 0, 1, 0,
       2, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0,
       2, 2, 0, 0, 1, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0,
       1, 2, 1, 0, 0, 2, 0, 0, 1], dtype=int8)

In [28]:
# Write the prepared penguin features and labels to disk as .npy files.
# np.save does not create missing folders, so the output directory is created
# first; this keeps the cell working on a fresh checkout where the
# Processed/<name> folder does not exist yet.
out_dir = Path(f"./../Datasets/Gaussian/Processed/{name}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir / f"X_train_{name}.npy", X_train_prepared)
np.save(out_dir / f"y_train_{name}.npy", y_train)
np.save(out_dir / f"X_test_{name}.npy", X_test_prepared)
np.save(out_dir / f"y_test_{name}.npy", y_test)

# Mushroom

In [2]:
# Load the raw mushroom table. Note that `df` is rebound here and no longer
# refers to the penguin data from this point on.
mushroom_csv = "./../Datasets/Bernoulli/Raw/mushroom.csv"
df = pd.read_csv(mushroom_csv)

In [3]:
# Preview the first five rows of the mushroom table.
df.head(n=5)


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises%3F,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,b'x',b's',b'n',b't',b'p',b'f',b'c',b'n',b'k',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b's',b'u',b'p'
1,b'x',b's',b'y',b't',b'a',b'f',b'c',b'b',b'k',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'n',b'n',b'g',b'e'
2,b'b',b's',b'w',b't',b'l',b'f',b'c',b'b',b'n',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'n',b'n',b'm',b'e'
3,b'x',b'y',b'w',b't',b'p',b'f',b'c',b'n',b'n',b'e',...,b'w',b'w',b'p',b'w',b'o',b'p',b'k',b's',b'u',b'p'
4,b'x',b's',b'g',b'f',b'n',b'f',b'w',b'b',b'k',b't',...,b'w',b'w',b'p',b'w',b'o',b'e',b'n',b'a',b'g',b'e'


In [63]:
# Count the distinct values in each column; a count of 1 marks a constant column.
df.nunique(axis=0)

cap-shape                    6
cap-surface                  4
cap-color                   10
bruises%3F                   2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
class                        2
dtype: int64

In [64]:
# Count the missing values in each mushroom column.
df.isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises%3F                  0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
class                       0
dtype: int64

In [None]:
# Separate the target from the features. Besides the target itself,
# 'veil-type' is excluded from the feature matrix.
drop_columns = ["class", "veil-type"]

y = pd.Categorical(df["class"]).codes  # integer code for each row's class
X = df.drop(drop_columns, axis="columns")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# One-hot encode every remaining column into a dense array. The encoder is
# fitted on the training rows only; categories that were not seen during
# fitting are ignored at transform time instead of raising an error.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pipeline = Pipeline(steps=[("onehot", encoder)])

X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

In [66]:
# Write the one-hot encoded mushroom features and labels to disk as .npy files.
# np.save does not create missing folders, so the output directory is created
# first; this keeps the cell working on a fresh checkout where the
# Processed/<name> folder does not exist yet.
name = "mushroom"
out_dir = Path(f"./../Datasets/Bernoulli/Processed/{name}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir / f"X_train_{name}.npy", X_train_prepared)
np.save(out_dir / f"y_train_{name}.npy", y_train)
np.save(out_dir / f"X_test_{name}.npy", X_test_prepared)
np.save(out_dir / f"y_test_{name}.npy", y_test)

---

# MNIST

In [2]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd

# Fetch the 'mnist_784' dataset from OpenML.
mnist = fetch_openml("mnist_784", parser="auto")

# Binarise the pixels: True wherever the stored value exceeds 0.5.
# NOTE(review): if the pixel values are on a 0-255 scale rather than 0-1, this
# threshold marks every non-zero pixel as "on" -- confirm that is intended.
X_mnist = np.array(mnist.data) > 0.5
Y_mnist = np.array(mnist.target, dtype="int")

# The first 4000 images form the training set and the following 1000 the test set.
train_rows = slice(0, 4000)
test_rows = slice(4000, 5000)

X_mnist_train, Y_mnist_train = X_mnist[train_rows, :], Y_mnist[train_rows]
X_mnist_test, Y_mnist_test = X_mnist[test_rows, :], Y_mnist[test_rows]

In [3]:
# Imported in this cell because the MNIST section carries its own imports and
# may be run without the notebook's first cell.
from pathlib import Path

# Write the binarised MNIST subsets and their labels to disk as .npy files.
# np.save does not create missing folders, so the output directory is created
# first; this keeps the cell working on a fresh checkout where the
# Processed/<name> folder does not exist yet.
name = "mnist"
out_dir = Path(f"./../Datasets/Bernoulli/Processed/{name}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir / f"X_train_{name}.npy", X_mnist_train)
np.save(out_dir / f"y_train_{name}.npy", Y_mnist_train)
np.save(out_dir / f"X_test_{name}.npy", X_mnist_test)
np.save(out_dir / f"y_test_{name}.npy", Y_mnist_test)