In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
import pandas as pd
import scipy
import scipy.io
import os
import math
import pdb

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, KFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import RFE, SelectKBest, f_classif, mutual_info_classif
from skfeature.function.sparse_learning_based import RFS

import matplotlib.pyplot as plt
import matplotlib
import subprocess
from multiprocessing import Pool

import fourier_learning
import time
from datetime import datetime
import pickle
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from collections import defaultdict

In [6]:
from skfeature.function.similarity_based import reliefF
from skfeature.function.sparse_learning_based import MCFS
from skfeature.utility import construct_W
# from skfeature.utility.sparse_learning import feature_ranking
from skfeature.utility.sparse_learning import construct_label_matrix, feature_ranking
from skfeature.function.similarity_based import lap_score
from skfeature.function.sparse_learning_based import UDFS
from skfeature.function.sparse_learning_based import NDFS

from tqdm import tqdm

# Constants

In [7]:
NO_RUNS = 5
CLF_NAME = "kernel SVM with RBF"  # {"kernel SVM with RBF", "random forest"}

UNSUPERVISED_FEAT_SEL = False
SUPERVISED_FEAT_SEL = True

# Existing supervised algorithms
FEAT_SEL_ALGS_SUPERVISED = {
    'FL-Depth-2': False,
    'FL-Depth-1': False,
    'FL': False,  # Fourier Learning exhaustive search
    'SFFS': False,
    'SFFS (exhaustive)': False,  # Fourier Learning exhaustive search
    'UFFS + SFFS (t=3)': True,
    'UFFS + SFFS (t=2)': True,
    'UFFS + SFFS (t=1)': True,
    'UFFS + SFFS (exhaustive)': False,  # Fourier Learning exhaustive search
    'CCM': True,
    'ReliefF': True,
    'mRMR': True,
    'MI': True,
    'RFE': False,
    'RFS': True,
    'F-Value': False
}

# Existing unsupervised algorithms
FEAT_SEL_ALGS_UNSUPERVISED = {
    'UFFS': False,
    'NO_FS': False,
    'NDFS': False,
    'UDFS': False,
    'MCFS': False,
    'LS': False
}

UFFS_CALCN_CV = True
PARALLELIZE_CV = True
UFFS_K = 200
UFFS_FOLDS = True
UFFS_SINGLE = False

FIGSIZE_SCALE_REQD = 0.5

# =============================================================================
if FEAT_SEL_ALGS_SUPERVISED['CCM']:
    sys.path.append('../lib/CCM/core')
    import ccm_v1 as ccm

# DON'T CHANGE IT
ORTHOGONALIZE = False
N_UNIQUE_CLASSES_Y = 20

FEAT_SEL_ALGS_FN_LABEL = {
    "FL": "fourier-learning",
    "FL-Depth-2": "fourier-learning_depth_2",
    "FL-Depth-1": "fourier-learning_depth_1",
    "UFFS + SFFS (t=3)": "UFFS + SFFS (t=3)",
    "UFFS + SFFS (t=2)": "UFFS + SFFS (t=2)",
    "UFFS + SFFS (t=1)": "UFFS + SFFS (t=1)",
    # Existing supervised algorithms
    "CCM": "ccm",
    "ReliefF": "relieff",
    "mRMR": "mrmr",
    "RFE": "rfe",
    "RFS": "rfs",
    "MI": "mi",
    "F-Value": "fval",
    # Existing unsupervised algorithms
    "NDFS": True,
    "UDFS": True,
    "MCFS": True,
    "LS": True
}

UNSUPERVISED_FEAT_SEL_ALGS_FN_LABEL = {
    'NDFS': True,
    'UDFS': True,
    'MCFS': True,
    "LAP_SCORE": "lap_score"
}

Instructions for updating:
non-resource variables are not supported in the long term


# Figure settings

In [8]:
# Figure settings =============================================================

def figsize(scale):
    fig_width_pt = 503.295     # Get this from LaTeX using \the\textwidth
    inches_per_pt = 1.0/72.27   # Convert pt to inch
    # Aesthetic ratio (you could change this)
    golden_mean = (np.sqrt(5.0)-1.0)/2.0
    fig_width = fig_width_pt*inches_per_pt*scale  # width in inches
    fig_height = fig_width*golden_mean  # height in inches
    fig_size = [fig_width, fig_height]
    return fig_size


pgf_with_latex = {                      # setup matplotlib to use latex for output
    "pgf.texsystem": "pdflatex",        # change this if using xetex or lautex
    "text.usetex": True,                # use LaTeX to write all text
    "font.family": "serif",
    # blank entries should cause plots to inherit fonts from the document
    "font.serif": [],
    "font.sans-serif": [],
    "font.monospace": [],
    "axes.labelsize": 10,               # LaTeX default is 10pt font.
    "font.size": 8,
    "legend.fontsize": 8,               # Make the legend/label fonts a little smaller
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "figure.figsize": figsize(0.9),     # default fig size of 0.9 textwidth
}
matplotlib.rcParams.update(pgf_with_latex)

cust_color = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3",
              "#ff7f00", "#ffff33", "#a65628", "#f781bf"]
matplotlib.rcParams['savefig.dpi'] = 125
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath,amssymb,amsfonts}"
# =============================================================================

# Load data

In [9]:
def load_data(data_name):
    try:
        file = "../data/icml_paper/{0}.mat".format(data_name)
        temp_data = scipy.io.loadmat(file)
        X, y = temp_data['X'], temp_data['Y'].squeeze()
        if len(np.unique(y)) > 2:
            type_y = 'categorical'
        else:
            type_y = 'binary'
    except:
        print("No valid data_name entered")
        sys.exit(2)
        
    if type_y == 'binary' or type_y == 'categorical':
        if y.dtype != np.int:
            y = y.astype(np.int, copy=False)

    return X, y, type_y

# Utilities

In [8]:
# Helper function to call supervised fourier selection
def fourier_feature_selection(X, y, k, approx="depth_based", depth=2):
    mean_emp = np.mean(X, axis=0)
    std_emp = np.std(X, ddof=1, axis=0)
    fourier_featsel = fourier_learning.SupervisedFourierFS(
        k, mean_emp, std_emp, approx, depth)
    feats_selected = fourier_featsel.fit(X, y)
    return feats_selected


# Helper function to call the classifier using only the selected features
def clf_score_with_feature_selection(X_train, y_train, X_test, y_test, clf, feats_selected):
    X_sel_train = X_train[:, feats_selected]
    X_sel_test = X_test[:, feats_selected]

    clf.fit(X_sel_train, y_train)
    y_sel_pred = clf.predict(X_sel_test)
    return accuracy_score(y_test, y_sel_pred)


# Condiational co-variance feature selection
def ccm_feature_selection(X, y, type_y):
    epsilon = 0.001
    _, d = X.shape
    if d <= 100:
        num_features = math.ceil(d / 5)
    else:
        num_features = 100
    rank_ccm = ccm.ccm(X, y, num_features, type_y, epsilon,
                       iterations=100, verbose=False)
    all_feats_selected = np.argsort(rank_ccm)
    return all_feats_selected

In [9]:
# def supervised_fs(X_train, y_train, type_y, feat_selection, sel_features_UFFS_X=[]):
def supervised_fs(X_train, y_train, type_y, feat_selection):
    '''
    Feature selection with single fold of cross validation: 
    for parallel implementation of cross validation

    Arguments:
        X: input data of one fold with columns as features and rows as samples
        y: output lables of one fold
        type_y: type of y - binary, categorical or real, for CCM feature selection
        feat_selection: feature selection algorithm name
        clf: classifier function
        partitions: the set of all cross-validation partitions
        k: partition index
        sel_features_UFFS_X: set of all features selected by unsupervised Fourier feature selection 
                             if we use UFFS_SINGLE = True 
    '''

    # Variance threshold --------------------------------------------------
#     _, d = X_train.shape
#     mask = (np.std(X_train, ddof=1, axis=0) > 1e-5)
#     valid_features = np.arange(d)
#     valid_features = valid_features[mask]

#     if feat_selection in ["fourier-learning_depth_1", "fourier-learning_depth_2","UFFS + SFFS (t=2)", "UFFS + SFFS (t=1)"]:
#         X_train = X_train[:, sel_features_UFFS_X]
    # ---------------------------------------------------------------------
    _, d = X_train.shape

    if feat_selection == "ccm":
        all_feats_selected = ccm_feature_selection(X_train, y_train, type_y)
    elif feat_selection == "relieff":
        score_reliefF_temp = reliefF.reliefF(X_train, y_train)
        # rank features in descending order according to score
        all_feats_selected = reliefF.feature_ranking(score_reliefF_temp)[:d]
    elif feat_selection == "mrmr":
        # Prepare data
        y_train = y_train.reshape(-1, 1)
        a = ["class"] + [str(i) for i in range(0, d)]
        df_mRMR = pd.DataFrame(data=np.concatenate(
            (y_train, X_train), axis=1), columns=a)
        df_mRMR.to_csv('../data/temp_mRMR_data.csv', index=False)
        y_train = y_train.ravel()

        # Call the C++ executable
        command_mRMR = '../lib/mrmr_c_Peng/./mrmr -i ../data/temp_mRMR_data.csv -n {0}'.format(
            d)
        out_code = subprocess.call([command_mRMR], shell=True)
        if out_code != 0:
            print("something wrong")
            sys.exit(2)
        all_feats_selected = np.loadtxt('out_temp.txt', dtype=np.int)
        # ==================================================================
    elif feat_selection == "mi":
        feat_sel_alg = SelectKBest(score_func=mutual_info_classif, k=d)
        feat_sel_alg.fit(X_train, y_train)
        feat_sel_alg.scores_
#         all_feats_selected = feat_sel_alg.get_support(indices=True)
        all_feats_selected = np.argsort(-feat_sel_alg.scores_)
    elif feat_selection == "fval":
        feat_sel_alg = SelectKBest(score_func=f_classif, k=d)
        feat_sel_alg.fit(X_train, y_train)
        all_feats_selected = np.argsort(-feat_sel_alg.scores_)
#         all_feats_selected = feat_sel_alg.get_support(indices=True)
    elif feat_selection == "UFFS + SFFS (t=3)":
        all_feats_selected = fourier_feature_selection(X_train, y_train, d,
                                                       approx="depth_based", depth=3)
    elif feat_selection == "UFFS + SFFS (t=2)":
        all_feats_selected = fourier_feature_selection(X_train, y_train, d,
                                                       approx="depth_based", depth=2)
    elif feat_selection == "UFFS + SFFS (t=1)":
        all_feats_selected = fourier_feature_selection(X_train, y_train, d,
                                                       approx="depth_based", depth=1)
    elif feat_selection == "fourier-learning":
        all_feats_selected = fourier_feature_selection(
            X_train, y_train, d, approx="none")
    elif feat_selection == "rfe":
        clf_rfe = LinearSVC(loss="squared_hinge",
                            penalty="l1", dual=False, max_iter=2000)
        feat_sel_alg = RFE(clf_rfe, n_features_to_select=d, step=1)
        X_train_1 = StandardScaler().fit_transform(X_train)
        feat_sel_alg.fit(X_train_1, y_train)
        all_feats_selected = feat_sel_alg.get_support(indices=True)
    elif feat_selection == "rfs":
        Y_train = construct_label_matrix(y_train)
        Weight = RFS.rfs(X_train, Y_train, gamma=0.1)
        # sort the feature scores in an ascending order according to the feature scores
        all_feats_selected = feature_ranking(Weight)[:d]
        # obtain the dataset on the selected features
    else:
        print("none selected")
    return all_feats_selected

In [10]:
# from sklearn.datasets import load_digits
# from sklearn.feature_selection import SelectKBest, chi2
# X, y = load_digits(return_X_y=True)
# X.shape

In [11]:
# mm = SelectKBest(chi2, k=20).fit(X, y)

In [12]:
# mm.scores_

In [13]:
# aaa = np.argsort(-mm.scores_)

In [14]:
# len(mm.scores_[aaa])

In [15]:
def plot_ccm_comparison(xaxis, data_name, scores, file_name=None):
    '''
    Plot the results
    arguments:
        xaxis: xaxis vector
        data_name: name of the dataset
        scores: dictionary with key as the feature selection algorithm name and value as the vector 
                of cross-validation scores for different k's
        file_nmae: save to a specific file, otherwise create a time-stamped file
    '''

    fig, ax = plt.subplots(figsize=figsize(FIGSIZE_SCALE_REQD))
    # ax = plt.axes(frameon=1)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    plt.grid(alpha=1, linestyle='dotted')
    ax.minorticks_on()
    ax.tick_params(axis='x', which='minor', bottom=True)
    ax.tick_params(axis='y', which='minor', left=False)

    Fourier_FS_length = min(len(scores["UFFS + SFFS (t=2)"]), len(scores["UFFS + SFFS (t=1)"]))
    # Fourier_FS_length = len(scores["UFFS + SFFS (t=2)"])
    for k, v in scores.items():
        plt.plot(xaxis[:Fourier_FS_length],np.array(v[:Fourier_FS_length])*100, label=k, alpha=0.7)
        # plt.legend(loc='best')
        plt.xlabel(r"$k$")
        plt.ylabel("Mean accuracy")
    title_str = r"Dataset: {0}".format(data_name)
    title_str = title_str.replace('_', '\_')
    plt.title(title_str)

    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles, labels, loc='upper center',
                    bbox_to_anchor=(0.45, -0.2), ncol=3)
    
    plt.ylim(25,100)
    if file_name == None:
        timestr = time.strftime("%Y%m%d_%H%M%S")
        file_name = "comparison_feat_sel_{0}".format(timestr)
    plt.savefig('../results/{0}.pdf'.format(file_name),
                bbox_extra_artists=(lgd,), bbox_inches='tight', pad_inches=0.02)
    return file_name

In [25]:
def xaxis_fn(d):
    if d <= 50:
        xaxis = list(range(1, min(20, d)+1))
    elif 50 < d <= 100:
        xaxis = list(range(5, 51, 5))
    elif d > 100:
        xaxis = list(range(10, 101, 10))
    return xaxis

# Main

In [20]:
DATA_NAME = "Musk"
# -1 is a proxy for "d" in cluster_sizes
# UFFS_SETTINGS = {
#     "max_depth": 3,
#     "cluster_sizes": [-1, 50, 25],
#     "selection_thresholds":[0.95,0.95,0.95],
#     "norm_epsilon":[0.001, 0.001, 0.001],
#     "shuffle": False,
#     "preranking": "non"
# }

UFFS_SETTINGS = {
    "max_depth": 3,
    "cluster_sizes": [-1, 51, 31],
    "selection_thresholds":[0.95,0.95,0.95],
    "norm_epsilon":[0.001, 0.001, 0.001],
    "shuffle": False,
    "preranking": "non"
}

In [18]:
DATA_NAME = "data1"
# -1 is a proxy for "d" in cluster_sizes

UFFS_SETTINGS = {
    "max_depth": 3,
    "cluster_sizes": [-1, 50, 31],
    "selection_thresholds":[0.95,0.95,0.95],
    "norm_epsilon":[0.001, 0.001, 0.001],
    "shuffle": False,
    "preranking": "non"
}

In [19]:
DATA_NAME = "E1"
# -1 is a proxy for "d" in cluster_sizes

UFFS_SETTINGS = {
    "max_depth": 3,
    "cluster_sizes": [-1, 40, 20],
    "selection_thresholds":[0.98,0.98,0.98],
    "norm_epsilon":[0.001, 0.001, 0.001],
    "shuffle": False,
    "preranking": "non"
}

In [20]:
DATA_NAME = "E2"
# -1 is a proxy for "d" in cluster_sizes

UFFS_SETTINGS = {
    "max_depth": 3,
    "cluster_sizes": [-1, 40, 20],
    "selection_thresholds":[0.999,0.9995,0.99],
    "norm_epsilon":[0.001, 0.001, 0.001],
    "shuffle": False,
    "preranking": "non"
}

In [21]:
X, y, type_y = load_data(DATA_NAME)
no_samples, d = X.shape
random_state = [42, 100, 0, 25, 80]
X_train, X_test, y_train, y_test = [], [], [], []
for state in random_state:
    t = train_test_split(X, y, test_size=0.2, random_state=state, stratify=y, shuffle=True)
    for i, j in zip((X_train,X_test,y_train, y_test),t):
        i.append(j)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if y.dtype != np.int:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = y.astype(np.int, copy=False)


## Orthogonalize

In [22]:
X_train.shape

AttributeError: 'list' object has no attribute 'shape'

In [22]:
X_train = X_train[0].copy()
X_test = X_test[0].copy()
y_train = y_train[0].copy()
y_test = y_test[0].copy()

In [23]:
orig_features = np.arange(X_train.shape[1])
mask = (np.std(X_train, ddof=1, axis=0) > 1e-5)
valid_features = orig_features[mask]

valid_features = valid_features[mask]
X_train_orig = X_train.copy()
X_train = X_train[:, valid_features]

mean_emp_X = np.mean(X_train, axis=0)
std_emp_X = np.std(X_train, ddof=1, axis=0)

if UFFS_SETTINGS["cluster_sizes"][0] == -1:
    UFFS_SETTINGS["cluster_sizes"][0] = d
UFFS_options = fourier_learning.OptionsUnsupervisedFourierFS(**UFFS_SETTINGS)
sel_features_UFFS_X_train = fourier_learning.UnsupervisedFourierFS(X_train,UFFS_options)

depth:  0
[0.99868334 0.9710501  0.64421057 0.89672784 0.99000723 0.97482371
 0.70423542 0.95386502 0.48423835 0.79241115 0.67746419 0.70574558
 0.44544536 0.44594552 0.38030003 0.84996018 0.36059562 0.58869414
 0.36594369 0.59152161 0.46081196 0.17797768 0.31417809 0.50133138
 0.394726   0.30968279 0.50796447 0.74744512 0.50929486 0.27753776
 0.85931071 0.78904285 0.53825143 0.29975257 0.49428611 0.81361097
 0.60308818 0.41282839 0.42539331 0.26077712 0.18058911 0.47661637
 0.12365986 0.14129585 0.16418419 0.20679685 0.57266691 0.24433804
 0.18988494 0.18903479 0.13127156 0.06073971 0.1186435  0.54240867
 0.41176349 0.35419974 0.09325358 0.34721867 0.18789464 0.14407153
 0.40785392 0.26914886 0.34098081 0.14617715 0.23854168 0.38189255
 0.87839818 0.2531599  0.28823995 0.13383104 0.0417486  0.19225816
 0.10854906 0.08102534 0.19865666 0.56580855 0.09581191 0.15397509
 0.0410147  0.23492078 0.10661343 0.02692175 0.13174079 0.28535862
 0.11625185 0.03989501 0.16753968 0.39521145 0.15742

In [None]:
len(sel_features_UFFS_X_train)

In [None]:
sel_features_UFFS_X_train

In [None]:
feat_sel_algs_chosen = [k for k, v in FEAT_SEL_ALGS_SUPERVISED.items() if v == True]

In [None]:
seltd_features = {}
time_taken_algs = {}
for alg in feat_sel_algs_chosen:
    print(alg)
    start_time = datetime.now()
    if alg in ["fourier-learning_depth_1", "fourier-learning_depth_2","UFFS + SFFS (t=2)","UFFS + SFFS (t=1)"]:
        relevant_features = supervised_fs(X_train[:, sel_features_UFFS_X_train], y_train, type_y, FEAT_SEL_ALGS_FN_LABEL[alg])
        seltd_features[alg] = orig_features[mask][sel_features_UFFS_X_train][relevant_features]
    else:
        relevant_features = supervised_fs(X_train, y_train, type_y, FEAT_SEL_ALGS_FN_LABEL[alg])
        seltd_features[alg] = orig_features[mask][relevant_features]
    end_time = datetime.now()
    time_taken_algs[alg] = end_time - start_time

In [None]:
seltd_features

In [None]:
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1,8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
    n_iter=32,
    random_state=0,
    n_jobs = 5
)

# executes bayesian optimization
_ = opt.fit(X_train, y_train)

print("Score on test set without feature selection:",opt.score(X_test, y_test))

In [None]:
xaxis = xaxis_fn(len(sel_features_UFFS_X_train))
print(xaxis)

In [None]:
scores = defaultdict(list)
params = {
    'C': Real(1e-6, 100, prior='log-uniform'),
    'gamma': Real(1e-6, 100, prior='log-uniform'),
    'degree': Integer(1,5),
    'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid'])
}

params_grid_search = { 
    'C':[0.1,1,100,1000],
    'kernel':['rbf'],
#     'kernel':['rbf','poly','sigmoid','linear'],
    'degree':[1,2,3,4,5,6],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

params_grid_search_1 = { 
    'svm_clf__C':[0.1,1,100,1000],
    'svm_clf__kernel':['rbf'],
#     'kernel':['rbf','poly','sigmoid','linear'],
    'svm_clf__degree':[1,2,3,4,5,6],
    'svm_clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", C=1, gamma='auto'))
])

for alg, features in seltd_features.items():
    print(alg)
    score_temp = np.zeros(len(xaxis))
    for i, k in enumerate(xaxis):
        if k > d:
            break
        feats_selected = features[:k]
        X_train_orig_sel = X_train_orig[:, feats_selected]
        X_test_sel = X_test[:, feats_selected]
        # executes bayesian optimization
#         opt = BayesSearchCV(estimator = SVC(), search_spaces=params, n_jobs=-1, random_state=20)
#         opt = GridSearchCV(SVC(),params_grid_search, n_jobs = 6)
        opt = GridSearchCV(clf,params_grid_search_1, n_jobs = 6)
        opt.fit(X_train_orig_sel, y_train)
        
        print(opt.score(X_test_sel, y_test))
        print(opt.best_params_)
        scores[alg].append(opt.score(X_test_sel, y_test))

In [None]:
scores_no_opt = defaultdict(list)
for alg, features in seltd_features.items():
    print(alg)
    score_temp = np.zeros(len(xaxis))
    for i, k in enumerate(xaxis):
        if k > d:
            break
        feats_selected = features[:k]
        X_train_orig_sel = X_train_orig[:, feats_selected]
        X_test_sel = X_test[:, feats_selected]
        
        clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", C=1, gamma='auto'))
        ])
        clf.fit(X_train_orig_sel, y_train)
        
        print(clf.score(X_test_sel, y_test))
        scores_no_opt[alg].append(clf.score(X_test_sel, y_test))

In [None]:
plot_ccm_comparison(xaxis, DATA_NAME, scores, file_name=None)

In [None]:
# On the entire dataset
params_grid_search = { 
    'C':[0.1,1,100,1000],
    'kernel':['rbf','poly','sigmoid','linear'],
    'degree':[1,2,3,4,5,6],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

opt = GridSearchCV(SVC(),params_grid_search, n_jobs = 6)

# executes bayesian optimization
_ = opt.fit(X_train, y_train)

print("Score on test set without feature selection:",opt.score(X_test, y_test))

# Saving the results as Pickle files

In [None]:
fpath = os.path.join("runs_ICML","{0}_seltd_features_train_seed-{1}.pickle".format(DATA_NAME, random_state))
with open(fpath, 'wb') as handle:
    pickle.dump(seltd_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
scores["xaxis"] = xaxis
fpath = os.path.join("runs_ICML","{0}_scores_test_seed-{1}.pickle".format(DATA_NAME, random_state))
with open(fpath, 'wb') as handle:
    pickle.dump(scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Running on multiple seeds

In [None]:
def run_supervised_algs(X_train, X_test, y_train, y_test):
    orig_features = np.arange(X_train.shape[1])
    mask = (np.std(X_train, ddof=1, axis=0) > 1e-5)
    valid_features = orig_features[mask]

    valid_features = valid_features[mask]
    X_train_orig = X_train.copy()
    X_train = X_train[:, valid_features]

    mean_emp_X = np.mean(X_train, axis=0)
    std_emp_X = np.std(X_train, ddof=1, axis=0)

    if UFFS_SETTINGS["cluster_sizes"][0] == -1:
        UFFS_SETTINGS["cluster_sizes"][0] = d
    UFFS_options = fourier_learning.OptionsUnsupervisedFourierFS(**UFFS_SETTINGS)
    sel_features_UFFS_X_train = fourier_learning.UnsupervisedFourierFS(X_train,UFFS_options)
    
    feat_sel_algs_chosen = [k for k, v in FEAT_SEL_ALGS_SUPERVISED.items() if v == True]
    
    seltd_features = {}
    time_taken_algs = {}
    for alg in feat_sel_algs_chosen:
        print(alg)
        start_time = datetime.now()
        if alg in ["fourier-learning_depth_1", "fourier-learning_depth_2","UFFS + SFFS (t=2)","UFFS + SFFS (t=1)"]:
            relevant_features = supervised_fs(X_train[:, sel_features_UFFS_X_train], y_train, type_y, FEAT_SEL_ALGS_FN_LABEL[alg])
            seltd_features[alg] = orig_features[mask][sel_features_UFFS_X_train][relevant_features]
        else:
            relevant_features = supervised_fs(X_train, y_train, type_y, FEAT_SEL_ALGS_FN_LABEL[alg])
            seltd_features[alg] = orig_features[mask][relevant_features]
        end_time = datetime.now()
        time_taken_algs[alg] = end_time - start_time

    xaxis = xaxis_fn(len(sel_features_UFFS_X_train))

    scores = defaultdict(list)
    params_grid_search_1 = { 
        'svm_clf__C':[0.1,1,100,1000],
        'svm_clf__kernel':['rbf'],
        'svm_clf__degree':[1,2,3,4,5,6],
        'svm_clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001]
    }

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", C=1, gamma='auto'))
    ])

    for alg, features in seltd_features.items():
        print(alg)
        for i, k in enumerate(xaxis):
            if k > d:
                break
            feats_selected = features[:k]
            X_train_orig_sel = X_train_orig[:, feats_selected]
            X_test_sel = X_test[:, feats_selected]
#             opt = GridSearchCV(clf,params_grid_search_1, n_jobs = 6)
#             opt.fit(X_train_orig_sel, y_train)
#             print(opt.score(X_test_sel, y_test))
#             print(opt.best_params_)
#             scores[alg].append(opt.score(X_test_sel, y_test))

            clf = Pipeline([
                ("scaler", StandardScaler()),
                ("svm_clf", SVC(kernel="rbf", C=1, gamma='auto'))
            ])
            clf.fit(X_train_orig_sel, y_train)
            scores[alg].append(clf.score(X_test_sel, y_test))
    return scores

In [None]:
scores_collection = defaultdict(list)
for i, (X_train_i, X_test_i, y_train_i, y_test_i) in enumerate(zip(X_train,X_test,y_train, y_test)):
    scores_temp =  run_supervised_algs(X_train_i, X_test_i, y_train_i, y_test_i)
    for k, v in scores_temp.items():
        scores_collection[k].append(v)

In [None]:
scores_ensemble = {}
for alg_i, scores_i in scores_collection.items():
    scores_i = np.array(scores_i)
#     if alg_i in ['UFFS + SFFS (t=2)', 'UFFS + SFFS (t=1)']:
    min_len = min([len(score_ii) for score_ii in scores_i])
#     else:
#         min_len = None
    avg = np.array([0]*min_len, dtype=np.float64)
    for score_ii in scores_i:
        avg += np.array(score_ii[:min_len])
    avg /= len(scores_i)
    scores_ensemble[alg_i] = avg

In [None]:
scores_ensemble

In [None]:
xaxis = xaxis_fn(len(scores_ensemble["UFFS + SFFS (t=3)"]))
plot_ccm_comparison(xaxis, DATA_NAME, scores_ensemble, file_name=None)

In [None]:
xaxis

In [None]:
fpath = os.path.join("runs_ICML","{0}_seltd_features_train_seed-{1}.pickle".format(DATA_NAME, random_state))
with open(fpath, 'wb') as handle:
    pickle.dump(seltd_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
scores["xaxis"] = xaxis
fpath = os.path.join("runs_ICML","{0}_scores_test_seed-{1}.pickle".format(DATA_NAME, random_state))
with open(fpath, 'wb') as handle:
    pickle.dump(scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Checking Fourier-Orth

In [10]:
import ICML_final

In [11]:
config = {
    "data_name": "Musk",
    "random_states": [42],
    "fourier_orth_settings": {
        "max_depth": 3,
        "cluster_sizes": [166, 50, 31],
        "selection_thresholds": [0.999,0.9995,0.999],
        "norm_epsilon": [0.001, 0.001, 0.001],
        "shuffle": False,
        "preranking": "non"
    }
}

In [30]:
X, y, type_y = load_data(config["data_name"])
no_samples, d = X.shape
random_state = config["random_states"]
X_train, X_test, y_train, y_test = [], [], [], []
for state in random_state:
    t = train_test_split(X, y, test_size=0.2, random_state=state, stratify=y, shuffle=True)
    for i, j in zip((X_train,X_test,y_train, y_test),t):
        i.append(j)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if y.dtype != np.int:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = y.astype(np.int, copy=False)


In [31]:
y

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [13]:
for i, (X_train_i, X_test_i, y_train_i, y_test_i) in enumerate(zip(X_train,X_test,y_train, y_test)):
    sel_features_UFFS_X_train = ICML_final.fourier_orth(X_train_i, config)
    print("len(sel_features_UFFS_X_train):", len(sel_features_UFFS_X_train))

depth:  0
No. of selected features in UnsupervisedFourierFS, depth 0: 150
depth:  1
No. of selected features in UnsupervisedFourierFS, depth 1: 74
depth:  2
No. of selected features in UnsupervisedFourierFS, depth 2: 37
len(sel_features_UFFS_X_train): 37


In [26]:
sel_features_UFFS_X_train = ICML_final.fourier_orth(X, config)
print("len(sel_features_UFFS_X_train):", len(sel_features_UFFS_X_train))

depth:  0
No. of selected features in UnsupervisedFourierFS, depth 0: 164
depth:  1
No. of selected features in UnsupervisedFourierFS, depth 1: 120
depth:  2
No. of selected features in UnsupervisedFourierFS, depth 2: 55
len(sel_features_UFFS_X_train): 55


In [25]:
sel_features_UFFS_X_train

# Temp

In [1]:
import pickle

In [33]:
scores_ensemble =  pickle.load(open("/home/jithin/fourier_learning/results/Musk/sel_features_train.pickle", "rb"))

In [34]:
aaa =  pickle.load(open("/home/jithin/fourier_learning/results/Musk/sel_features_train.pickle", "rb"))

In [36]:
aaa["UFFS + SFFS (t=3)"]

[array([  9, 115,  42, 108,   7,  22,  82,  50,  30,  90, 124,  31, 165,
         25,  52,  62, 163,  33,  94,   1,  69,  35, 158,  15,  75,  89,
         39,   5,  68,   0, 159,   3,  92,  66,  27,   4,  83]),
 array([  9, 115,  42, 108, 110,   7,  30,  90, 124,  47,  50, 114,  31,
         80,  22, 165,  33,   0,   4,  35,  94,  15,  75, 158,  62,   1,
         25,  39,  69,  66,  68,   3,  89,   5,  83, 159,  27]),
 array([115, 108, 110,   9,   7,  30,  90, 124,  31,  50, 165,  22,  62,
         52,  25, 123,   5,   1,  33,  35,  63, 158,  94, 131,   3,  92,
         28,  83,  68,  69,  84,  15,   0, 159,  27,  66,   4]),
 array([  9, 115,  42, 110,   7,  30,  90, 124,  47,  50, 163, 164,  22,
         31, 165,  61,   0,   4,  35,  33, 159,  25,  84,  52,  94, 158,
         89,   1,  57,  69,  34,   5,  83,   3,  15,  66,  27]),
 array([ 18,  48, 132,   7,   9,  74,  72,  30,  31,  90, 108,  43,  45,
         14, 165, 125, 154, 101, 110, 157,   1,  40, 137,  57,  35,  98,
          

In [27]:
scores_collection =  pickle.load(open("/home/jithin/fourier_learning/results/Musk/scores_collection_train.pickle", "rb"))

In [19]:
[len(i) for i in aa["UFFS + SFFS (t=1)"]]

[37, 37, 37, 37, 49]

In [21]:
aa

defaultdict(list,
            {'UFFS + SFFS (t=3)': [array([ 12,  42, 128, 115,  93,  49,  17,  55, 132, 139,  72,   9,  19,
                      74, 133,  50, 140,  82, 102,  47,  10,  20,  11, 134,  48,  78,
                      45, 107, 103,   7,  43,  18, 108,  77,  73,  14, 126, 104, 162,
                     163, 164,  79,  13, 110, 105, 111,  53,  44, 109,  67,  69, 147,
                      80, 138, 157,  22, 156, 158,  25, 135, 148,  36,  64,  96, 129,
                      30,  90, 124,   2,  86, 131, 117,  91, 130,  52, 112,   8,  87,
                     114,  57,  85,  33, 123, 113,  51,  76,  59,  15,  75,  31,  21,
                      39,  70,  35, 151, 149,  16, 141,  58, 118, 137,  81,   6,  94,
                      66,  62, 165,  26,  40,  61,  63,  98, 101, 154,   1,  95,  34,
                      97,   5, 100,  37,  71,  84, 143, 119,  23,  28, 120,  29,   3,
                     125,  32, 159, 142, 136,  56, 161,  99,  38,  41, 127,  89, 153,
               

In [None]:
plot_ccm_comparison(xaxis, config["data_name"], scores_ensemble, file_name=dir_name+"/plot_comparison")