In [2]:
from math import inf
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


def build_AM(x,y):
    """Build the A matrix of per-feature, per-class density peaks.

    For every (feature, class) pair a one-dimensional kernel density
    estimate is fitted to that feature's values among the samples of that
    class. The bandwidth is selected by a cross-validated grid search over
    30 evenly spaced values in [0.01, 3]. The matrix entry is the observed
    sample value at which the fitted density is highest.

    Keyword arguments:
    x -- the input data numpy array in the form nxm (samplesxfeatures).
    y -- the numpy array that represents the classes for each sample.

    Returns:
    A numpy array of shape (m, k), where k is the number of distinct labels
    in y. Column j belongs to the j-th label in sorted order, i.e.
    np.unique(y)[j], so labels are not required to be the integers 1..k.
    """
    x = np.asarray(x)
    y = np.asarray(y).ravel()
    classes = np.unique(y)
    Am = np.zeros((x.shape[1], len(classes)))
    # The search grid does not depend on the feature or class; build it once.
    params = {'bandwidth': np.linspace(0.01, 3, 30)}
    for col, lab in enumerate(classes):
        # Index columns by position in `classes`, not by the label value:
        # `lab - 1` is only valid when labels are exactly 1..k.
        rows = np.flatnonzero(y == lab)
        # Cross-validation cannot use more folds than there are samples.
        n_folds = min(5, len(rows))
        for feat in range(x.shape[1]):
            x_fit = x[rows, feat].reshape(-1, 1)
            if len(rows) == 1:
                # A single sample cannot be cross-validated; it is trivially
                # the location of its own density peak.
                Am[feat, col] = x_fit[0, 0]
                continue
            grid = GridSearchCV(KernelDensity(), params, cv=n_folds)
            grid.fit(x_fit)
            kde = grid.best_estimator_
            # score_samples returns log-densities; the logarithm is
            # monotonic, so its argmax is the density argmax and no
            # exponentiation (which can underflow to 0) is needed.
            peak = np.argmax(kde.score_samples(x_fit))
            # Take the scalar explicitly instead of assigning a length-1 array.
            Am[feat, col] = x_fit[peak, 0]
    return Am

In [19]:
import numpy as np
import pandas as pd

# Read the Iris CSV; the file has no header row, so columns are numbered 0-4.
df = pd.read_csv('../data/iris', header=None)

# Column 4 holds the species name; swap each name for its integer class code.
class_dict = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}
df[4] = df[4].map(class_dict)
print(df)

# The last column is the class label; every column before it is a feature.
y = df.iloc[:, -1].to_numpy()
X = df.iloc[:, :-1].to_numpy()

# A has one row per feature and one column per class.
A = build_AM(X, y)

print(A)


       0    1    2    3  4
0    5.1  3.5  1.4  0.2  1
1    4.9  3.0  1.4  0.2  1
2    4.7  3.2  1.3  0.2  1
3    4.6  3.1  1.5  0.2  1
4    5.0  3.6  1.4  0.2  1
..   ...  ...  ...  ... ..
145  6.7  3.0  5.2  2.3  3
146  6.3  2.5  5.0  1.9  3
147  6.5  3.0  5.2  2.0  3
148  6.2  3.4  5.4  2.3  3
149  5.9  3.0  5.1  1.8  3

[150 rows x 5 columns]
[[5.  5.7 6.4]
 [3.4 2.9 3. ]
 [1.5 4.4 5.4]
 [0.2 1.4 1.9]]


In [6]:
# Load the wine data: comma-separated, no header row.
df = pd.read_csv("../data/wine/wine", header=None, sep=',')
# Column 0 carries the class label; the remaining columns are the features.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()
A = build_AM(X, y)
print(A)

[[1.377e+01 1.221e+01 1.311e+01]
 [1.730e+00 1.470e+00 3.240e+00]
 [2.420e+00 2.240e+00 2.350e+00]
 [1.680e+01 2.000e+01 2.050e+01]
 [1.010e+02 8.600e+01 9.000e+01]
 [2.880e+00 2.110e+00 1.590e+00]
 [2.910e+00 1.940e+00 6.300e-01]
 [2.800e-01 3.500e-01 4.800e-01]
 [1.970e+00 1.560e+00 1.060e+00]
 [5.250e+00 2.800e+00 5.880e+00]
 [1.070e+00 1.000e+00 6.500e-01]
 [2.870e+00 2.960e+00 1.640e+00]
 [1.285e+03 6.800e+02 5.200e+02]]


In [18]:
# Load the red and white wine-quality tables (semicolon-separated). The first
# line of each file is skipped and columns are addressed by integer position.
df = pd.read_csv("../data/winequality/winequality-red", header=None, sep=';', skiprows=1)
df1 = pd.read_csv("../data/winequality/winequality-white", header=None, sep=';', skiprows=1)

# Tag every row with the wine colour it came from, then stack both tables.
df.insert(len(df.columns), "label", 'red')
df1.insert(len(df1.columns), "label", 'white')
class_dict = {'red': 1, 'white': 2}
df = pd.concat([df, df1], ignore_index=True)

# Replace the colour names with their integer codes by assigning the column
# as a whole. A positional `df.iloc[:, -1] = ...` write may be applied in
# place (depending on the pandas version), which would leave the column with
# object dtype and hand an object array to build_AM.
df["label"] = df["label"].map(class_dict)

# Discard rows with missing values before extracting the arrays.
df = df.dropna()

# The label is the last column; everything before it is a feature.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

A = build_AM(X, y)
print(A)

[[7.4000e+00 6.7000e+00]
 [5.1000e-01 2.5000e-01]
 [0.0000e+00 3.1000e-01]
 [2.0500e+00 1.7000e+00]
 [7.8000e-02 4.2000e-02]
 [6.0000e+00 3.1000e+01]
 [1.8000e+01 1.1500e+02]
 [9.9674e-01 9.9398e-01]
 [3.3100e+00 3.1500e+00]
 [6.1000e-01 4.5000e-01]
 [9.5000e+00 9.4000e+00]
 [5.0000e+00 6.0000e+00]]
