In [1]:
# Imports

# import utility modules
import pandas as pd
import numpy as np
import configparser
import os

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# import joblib
from joblib import dump, load

# helper functions and classes
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames, Gene_SPCA, EnetSPCA

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from multiprocessing import Pool

# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. An empty result therefore means config.ini was
# not found relative to the current working directory (this can happen, e.g.,
# when the cell is re-run after the os.chdir() below has moved the cwd).
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini not found in current working directory: {os.getcwd()}"
    )
os.chdir(config['PATH']['ROOT_DIR'])

# Read data
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load data files from a trusted source.
data = load(os.path.join(config['PATH']['DATA_DIR'], 'microarray-data-dict.lib'))

# Read parameters
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')


In [5]:
# Pull the train/test entries of the 'sorlie' / 'none' dataset once instead of
# repeating the nested lookup.
sorlie_none = data['sorlie']['none']
X_testing = sorlie_none['X_train']
X_testing_test = sorlie_none['X_test']

# Fit EnetSPCA on the training features only, then project both splits with
# the fitted transformer.
# NOTE(review): n_comps is hard-coded to 5 although N_COMPONENTS is read from
# config.ini -- confirm whether this is intentional.
spca = EnetSPCA(n_comps = 5, alpha = 0.05, tol = 0.0001, use_sklearn= True, n_jobs = 6)
spca.fit(X_testing, verbose = 1)

X_transformed_train = spca.transform(X_testing)
X_transformed_test = spca.transform(X_testing_test)

y_train = sorlie_none['y_train']
y_test = sorlie_none['y_test']
lr = LogisticRegression()
# scikit-learn expects 1-D labels; passing the stored labels unchanged emits a
# DataConversionWarning (column-vector y), so flatten them at the call site
# while leaving `y_train` itself untouched for any later cells.
lr.fit(X_transformed_train, np.ravel(y_train))

acc = lr.score(X_transformed_test, np.ravel(y_test))
# Fraction of loadings that are non-zero, as reported by the fitted EnetSPCA.
nzero_percent = spca.nonzero / spca.totloadings

print(f"Accuracy: {acc:.3f}, Nonzero Percent: {nzero_percent:.3f}")


Progress based on max iterations:


  1%|          | 70/10000 [01:07<2:40:19,  1.03it/s]

Accuracy: 0.897, Nonzero Percent: 0.151



  y = column_or_1d(y, warn=True)


In [3]:
# Number of logical CPUs reported by the OS (None if it cannot be determined).
os.cpu_count()

8

In [7]:
# Discover which transforms and which metrics appear in the results dictionary
def list_transforms_metrics(results_dict):
    """Collect the transform names and metric names present in ``results_dict``.

    Parameters
    ----------
    results_dict : dict
        Nested mapping ``dataset -> classifier -> transform -> metric -> value``.
        Only the transform and metric keys are inspected; values are ignored.

    Returns
    -------
    transform_win_counter : dict
        Every transform name found, mapped to ``0``, in first-seen order.
        Intended as a zeroed template for win counters.
    metrics : list
        Every metric name found, de-duplicated, in first-seen order.
    """
    transform_win_counter = {}
    # Dict keys act as an insertion-ordered set, keeping first-seen order
    # without a linear membership scan per metric.
    seen_metrics = {}
    for dobj in results_dict.values():
        for clf_obj in dobj.values():
            for tname, tobj in clf_obj.items():
                transform_win_counter.setdefault(tname, 0)
                for metric_name in tobj:
                    seen_metrics.setdefault(metric_name, None)
    return transform_win_counter, list(seen_metrics)

# NOTE(review): `results_dict` is not defined in the cells visible here; it
# must be created or loaded before this cell is run.
counter, metrics = list_transforms_metrics(results_dict)

# For every metric, count per (dataset, classifier) pair which transform
# achieved the strictly highest score; shared top scores are counted as ties.
count_results = {}
for metric in metrics:
    # Fresh zeroed tally per metric, with an extra bucket for ties.
    cur_counter = counter.copy()
    cur_counter['ties'] = 0
    count_results[metric] = cur_counter
    for dobj in results_dict.values():
        for clf_obj in dobj.values():
            if not clf_obj:
                # No transforms recorded for this classifier: nothing to count.
                continue
            # Determine the winner from this classifier's scores alone, so no
            # state leaks from the previous (dataset, classifier) pair and
            # scores <= 0 are handled like any other value.
            best_score = max(tobj[metric] for tobj in clf_obj.values())
            winners = [
                tname for tname, tobj in clf_obj.items()
                if tobj[metric] == best_score
            ]
            if len(winners) == 1:
                cur_counter[winners[0]] += 1
            else:
                cur_counter['ties'] += 1




In [8]:
# Turn the per-metric win counts into a table with one row per metric and one
# column per transform (plus 'ties'), then emit it as LaTeX.
df = pd.DataFrame(count_results).T
# Display labels are assigned positionally.
# NOTE(review): assumes count_results holds exactly these four metrics in this
# order -- confirm against the keys of the results dictionary.
df.index = ['Accuracy', 'F1', 'Recall', 'Precision']
print(df.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  none &  pca &  spca &  ties \\
\midrule
Accuracy  &    23 &   13 &    11 &    41 \\
F1        &    30 &   13 &    14 &    31 \\
Recall    &    28 &   13 &    13 &    34 \\
Precision &    29 &   15 &    13 &    31 \\
\bottomrule
\end{tabular}



  print(df.to_latex())
