In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from datetime import datetime

In [None]:
# Read the comma-separated dataset from the project-relative data directory.
df = pd.read_csv(
    "data/dataset.csv",
    sep=",",
)
# Bare trailing expression so the notebook renders the loaded frame.
df

In [None]:
# Number of columns in the frame minus one (the last column is left out of the count).
n_features = df.shape[1] - 1

In [None]:
# Split the frame positionally: every column but the last goes into X,
# the last column goes into y. Both are extracted as raw array values.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
import optuna
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
import numpy as np

In [None]:
def exponential_dilation(r):
    """Build the element-wise map ``x -> 1 - exp(-r * x)`` for a fixed ``r``.

    Parameters
    ----------
    r : number
        Coefficient that multiplies the input inside the exponential.

    Returns
    -------
    callable
        One-argument function applying the transform via ``np.exp``, so it
        accepts anything ``np.exp`` accepts (scalars or arrays).
    """
    def dilate(x):
        return 1 - np.exp(-r * x)

    return dilate

In [None]:
def objective(trial):
    """Optuna objective: cross-validated MCC of a dilation -> PCA -> KNN pipeline.

    Reads the module-level ``X`` and ``y`` arrays. For the hyperparameters
    sampled from ``trial`` it applies the exponential dilation to ``X``, runs a
    shuffled 10-fold cross-validation in which PCA is fitted on the training
    fold only, and scores a KNN classifier on the projected test fold with the
    Matthews correlation coefficient.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``dilation_coefficient``, ``pca_components``,
        ``n_neighbors`` and ``knn_metric``.

    Returns
    -------
    float
        Mean MCC over the folds minus ``0.01`` per PCA component, so that
        smaller projections are preferred when scores are close.
    """
    # Sample every hyperparameter once per trial, before the fold loop.
    dilation_coefficient = trial.suggest_float("dilation_coefficient", 0, 1)
    # NOTE(review): the upper bound of 90 assumes the data has at least 90
    # features and 90 rows per training fold, otherwise PCA will reject it — confirm.
    n_pca_components = trial.suggest_int("pca_components", 3, 90)
    n_neighbors = trial.suggest_int("n_neighbors", 3, 10)
    # NOTE(review): 'minkowski' is used with the classifier's default power
    # parameter; if that default is 2 it duplicates 'euclidean' — confirm.
    knn_metric = trial.suggest_categorical("knn_metric", ['minkowski', 'euclidean', 'cosine'])

    X_dilated = exponential_dilation(dilation_coefficient)(X)
    kf = KFold(n_splits=10, shuffle=True)
    corr = []
    for ix_train, ix_test in kf.split(X_dilated):
        # Slice the *dilated* features; slicing the raw X here would make the
        # dilation coefficient irrelevant to the score being optimised.
        X_train, y_train = X_dilated[ix_train], y[ix_train]
        X_test, y_test = X_dilated[ix_test], y[ix_test]

        # Fresh PCA per fold, fitted on the training part only.
        pca = PCA(n_components=n_pca_components)
        U_train = pca.fit_transform(X_train)
        U_test = pca.transform(X_test)

        model = KNeighborsClassifier(n_neighbors=n_neighbors, metric=knn_metric)
        model.fit(U_train, y_train)
        y_pred = model.predict(U_test)
        corr.append(matthews_corrcoef(y_test, y_pred))
    return np.mean(corr) - 0.01*n_pca_components

In [None]:
# Second-resolution timestamp embedded in the study name so each run registers
# a new study in the SQLite storage (load_if_exists is False).
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
study = optuna.create_study(
    study_name=f'KNN_with_PCA_timestamp_{timestamp}',
    storage="sqlite:///data/optuna.sqlite",
    direction='maximize',
    load_if_exists=False,
)
# Run 100 trials of the objective; the study keeps the highest-scoring one.
study.optimize(objective, n_trials=100)

In [None]:
# Display the hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Rebuild the pipeline with the best hyperparameters and score it on a
# stratified 80/20 split.
# NOTE(review): the rows in this split were also seen by the cross-validation
# that selected the hyperparameters, so the score is not from untouched data.
best_params = study.best_params  # read once instead of querying the study repeatedly
pca = PCA(n_components=best_params['pca_components'])
X_transformed = exponential_dilation(best_params['dilation_coefficient'])(X)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, stratify=y)
# Fit PCA on the training part only, then project the test part with it.
U_train = pca.fit_transform(X_train)
U_test = pca.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], metric=best_params['knn_metric'])
# Train and predict in PCA space, matching the pipeline the objective tuned;
# fitting on X_train/X_test would silently ignore the tuned pca_components.
knn.fit(U_train, y_train)
y_pred = knn.predict(U_test)
matthews_corrcoef(y_true=y_test,y_pred=y_pred)

In [None]:
# Confusion matrix of the held-out labels against the KNN predictions.
confusion_matrix(y_test, y_pred)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Wrap the projected training data in a frame and attach the training labels
# as a 'labels' column.
udf_train = pd.DataFrame(U_train).assign(labels=y_train)

In [None]:
# Pairwise scatter plots of the columns of udf_train, coloured by the 'labels' column.
# sns.pairplot is figure-level and builds its own figure, so no plt.figure()
# call precedes it (that would only leave an extra empty figure behind).
sns.pairplot(udf_train, hue='labels')
plt.show()