In [1]:
import autorootcwd

In [2]:
import numpy as np
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [3]:
from evaluation import search_knn, evaluate

In [4]:
# load files
df_test = pd.read_csv("homeworks/homework2/artificial_test.data", sep=' ', header=None)
df_train = pd.read_csv("homeworks/homework2/artificial_train.data", sep=' ', header=None)
df_train_labels = pd.read_csv("homeworks/homework2/artificial_train.labels", sep=' ', header=None)

# drop last column because its NaN for some reason
df_train = df_train.iloc[:, :-1]
df_test = df_test.iloc[:, :-1]

# convert df_train_labels to 0, 1
df_train_labels = df_train_labels.replace(-1, 0)

# convert to numpy
X_train = df_train.values
X_test = df_test.values
y_train = df_train_labels.values.ravel()

# show shapes
X_train.shape, X_test.shape, y_train.shape

((2000, 500), (600, 500), (2000,))

In [5]:
def generate_feature_subsets(X_train, y_train, num_features_list):
    feature_index_subsets = []
    selected_features = set() 

    for num_features in tqdm(num_features_list):
        cf = KNeighborsClassifier(n_neighbors=11, weights="distance", p=2)

        sfs1 = SFS(
            cf,
            k_features=num_features,
            forward=True,
            floating=True,
            verbose=0,
            scoring="balanced_accuracy",
            cv=5,
            n_jobs=-1,
            fixed_features=list(selected_features),
        )

        sfs1 = sfs1.fit(X_train, y_train)

        feature_index_subsets.append(sfs1.k_feature_idx_)

        # Update the selected features using the current iteration's selected features
        selected_features.update(sfs1.k_feature_idx_)

    return feature_index_subsets

In [6]:
num_features_list = [
    20, 22, 24, 26, 28, 30
]

feature_index_subsets = generate_feature_subsets(X_train, y_train, num_features_list)

In [12]:
# print(feature_index_subsets)

# # save best feature subsets in file
# with open("best_feature_subsets2.txt", "w") as f:
#     for feature_index_subset in feature_index_subsets:
#         f.write(str(feature_index_subset) + "\n")
        
        
# load list of best feature subsets from file
# with open("best_feature_subsets2.txt", "r") as f:
#     feature_index_subsets = f.readlines()
#     feature_index_subsets = [eval(x) for x in feature_index_subsets]
    
# print(feature_index_subsets)


In [8]:
hist_knn = []
hist_rf = []

for feature_indexes in feature_index_subsets:
    X_train_subset = X_train[:, feature_indexes]
    
    params_knn, _ = search_knn(X_train, y_train, verbose=0)
    print("knn done")
    
    scores = evaluate(X_train_subset, y_train, num_folds=10, params_knn=params_knn)
    print("num features:", len(feature_indexes))
    print(params_knn)
    print("scores:", scores)
    print("---")
    
    hist_knn.append((len(feature_indexes), scores["KNN"]))

knn done
num features: 20
{'n_neighbors': 18, 'p': 2, 'weights': 'distance'}
scores: {'KNN': 0.8899999999999999}
---
knn done
num features: 22
{'n_neighbors': 18, 'p': 2, 'weights': 'distance'}
scores: {'KNN': 0.8919999999999998}
---
knn done
num features: 24
{'n_neighbors': 18, 'p': 2, 'weights': 'distance'}
scores: {'KNN': 0.8940000000000001}
---
knn done
num features: 26
{'n_neighbors': 18, 'p': 2, 'weights': 'distance'}
scores: {'KNN': 0.8939999999999999}
---


In [11]:
# generate predictions for X_test using KNN with 26 features, and {'n_neighbors': 15, 'p': 2, 'weights': 'uniform'}
best_features = feature_index_subsets[2]
assert len(best_features) == 24

print(X_train.shape, X_test.shape)
X_train_subset = X_train[:, best_features]
X_test_subset = X_test[:, best_features]
print(X_train_subset.shape, X_test_subset.shape)

params_knn = {'n_neighbors': 18, 'p': 2, 'weights': 'distance'}
knn = KNeighborsClassifier(**params_knn)
knn.fit(X_train_subset, y_train)
y_pred = knn.predict(X_test_subset)

# save predictions to file, use float format, add custom first row
np.savetxt("predictions.txt", y_pred, fmt="%.2f", header="329532", comments="")

(2000, 500) (600, 500)
(2000, 24) (600, 24)
