In [19]:
# Imports

# import utility modules
import pandas as pd
import numpy as np
import configparser
import os

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# import joblib
from joblib import dump, load

# helper functions and classes
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames, Gene_SPCA

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
os.chdir(config['PATH']['ROOT_DIR'])

# Read data
data = load(config['PATH']['DATA_DIR'] + '/microarray-data-dict.lib')

# Read parameters
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')


In [24]:
from sklearn.base import clone

clf_dict = {
            'lr': LogisticRegression(random_state=SEED, max_iter = 10000),
            'lgbm': LGBMClassifier(random_state=SEED),
            'svc': SVC(random_state=SEED),
            'knn':KNeighborsClassifier()}

results_dict = {}
dataset_list = ['golub', 'christensen', 'chin', 'nakayama']

metrics = {'acc':accuracy_score, 'f1-macro':f1_score} #, 'recall-macro':recall_score, 'precision-macro':precision_score }

for key, dataset in data.items():
    # Skip if key not in dataset_list, only for testing!
    if key not in dataset_list:
        continue
    
    results_dict[key] = {}
    print("-" * 80)
    print(f"Author: {key}")
    print("-" * 80)

    for clf_name, clf_obj in clf_dict.items():
        results_dict[key][clf_name] = {}
        clf = clone(clf_obj)
        print(f"classifier: {clf_name}")
        print('-' * 30)

        for transform_name, transform_data in dataset.items():
            results_dict[key][clf_name][transform_name] = {}

            # Fit model
            clf.fit(transform_data['X_train'], transform_data['y_train'][0])

            # Calculate metrics
            X_test = transform_data['X_test']
            y_test = transform_data['y_test'][0]

            for metric_name, metric in metrics.items():
                results_dict[key][clf_name][transform_name][metric_name] = metric(y_test, clf.predict(X_test), average='macro')


# Count number of times a single tranform wins


--------------------------------------------------------------------------------
Author: nakayama
--------------------------------------------------------------------------------
classifier: lr
------------------------------


TypeError: got an unexpected keyword argument 'average'

In [13]:
for key, dataset in data.items():
    # Skip if key not in dataset_list, only for testing!
    if key not in dataset_list:
        continue
    for clf_name, clf_obj in clf_dict.items():
        trans_wins = {}
        for metric_name, metric in metrics.items():
            trans_wins[metric_name] = {}
            for transform_name, transform_data in clf_obj.items():
                






{'nakayama': {'lr': {'none': {'acc': 0.6285714285714286,
    'f1-macro': 0.3211764705882353,
    'recall-macro': 0.3952380952380952,
    'precision-macro': 0.31691919191919193},
   'pca': {'acc': 0.6285714285714286,
    'f1-macro': 0.3667366946778712,
    'recall-macro': 0.39761904761904765,
    'precision-macro': 0.4222222222222222},
   'spca': {'acc': 0.6,
    'f1-macro': 0.44496031746031744,
    'recall-macro': 0.5375,
    'precision-macro': 0.4357142857142858}},
  'lgbm': {'none': {'acc': 0.5428571428571428,
    'f1-macro': 0.38841880341880347,
    'recall-macro': 0.38511904761904764,
    'precision-macro': 0.47000000000000003},
   'pca': {'acc': 0.5714285714285714,
    'f1-macro': 0.3219135802469136,
    'recall-macro': 0.40939153439153436,
    'precision-macro': 0.3234126984126984},
   'spca': {'acc': 0.5428571428571428,
    'f1-macro': 0.2883791208791209,
    'recall-macro': 0.35178571428571426,
    'precision-macro': 0.30416666666666664}},
  'svc': {'none': {'acc': 0.4285714285

In [14]:
# Classification performance for each transform

# Setup transform win counter
transform_win_counter = {}
hasRun = False
for dname, dobj in results_dict.items():
    for clf_name, clf_obj in dobj.items():
        for tname, tobj in clf_obj.items():
            if tname not in transform_win_counter:
                transform_win_counter[tname] = 0

transform_win_counter

for dname, dobj in results_dict.items():
    for clf_name, clf_obj in dobj.items():
        acc_clf = []
        f1_clf = []
        recall_clf = []
        precision_clf = []
        tnames = []
        for tname, tobj in clf_obj.items():
            acc_clf.append(tobj['acc'])
            f1_clf.append(tobj['f1-macro'])
            recall_clf.append(tobj['recall-macro'])
            precision_clf.append(tobj['precision-macro'])
            tnames.append(tname)
        transform_win_counter[tnames[np.argmax(acc_clf)]] += 1
        

{'none': 0, 'pca': 0, 'spca': 0}

In [22]:
test_dict = {}
test_dict['idx1']['idx2'] = 2

KeyError: 'idx1'