In [2]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
import time

# helper functions
from helpers.helper_classes import Gene_SPCA

# sklearn
from sklearn.decomposition import PCA, SparsePCA

# joblib
from joblib import dump, load

# Read config.ini file
CONFIG_PATH = 'src/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
# Fail here with a clear message instead of a later KeyError on config['PATH'].
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read config file '{CONFIG_PATH}' "
        f"(current working directory: {os.getcwd()})"
    )
# All later relative paths are resolved against the configured root directory.
os.chdir(config['PATH']['ROOT_DIR'])

# Read parameters
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

# Load in data
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only point DATA_DIR at trusted files.
data = load(os.path.join(config['PATH']['DATA_DIR'], 'microarray-data-dict.lib'))

In [None]:
# Relevant transformations
# TODO: make the SPCA vs. Gene-SPCA comparison fair by having both use
# the same number of non-zero loadings

def get_gene_spca(n_components, random_state, l1=400):
    """Build an unfitted ``Gene_SPCA`` transformer.

    Parameters
    ----------
    n_components : int
        Forwarded to ``Gene_SPCA`` as ``n_comps``.
    random_state : int
        Accepted only so that every transformer factory shares the same
        ``(n_components, random_state)`` call signature; it is NOT forwarded
        to ``Gene_SPCA``.
    l1 : optional
        Forwarded to ``Gene_SPCA`` as ``l1``. Defaults to 400, the value that
        was previously hard-coded, so existing two-argument calls behave the
        same. Presumably the L1 (sparsity) penalty -- TODO confirm against
        helpers.helper_classes.

    Returns
    -------
    Gene_SPCA
        A new, unfitted transformer instance.
    """
    return Gene_SPCA(n_comps=n_components, l1=l1)

def get_spca(n_components, random_state):
    """Build an unfitted scikit-learn ``SparsePCA`` transformer.

    Both arguments are forwarded unchanged as the keyword arguments of the
    same name; every other ``SparsePCA`` setting keeps its library default.
    """
    spca_kwargs = {
        'n_components': n_components,
        'random_state': random_state,
    }
    return SparsePCA(**spca_kwargs)

def get_pca(n_components, random_state):
    """Build an unfitted scikit-learn ``PCA`` transformer.

    Both arguments are forwarded unchanged as the keyword arguments of the
    same name; every other ``PCA`` setting keeps its library default.
    """
    pca_kwargs = {
        'n_components': n_components,
        'random_state': random_state,
    }
    return PCA(**pca_kwargs)

In [2]:
### Runtime-table configuration

## Datasets to time

# Alternative selection (disabled): golub is the original dataset, christensen
# is small, chin is large, and nakayama has a large number of classes.
# dataset_list = ['golub', 'christensen', 'chin', 'nakayama']

# Selection that is quick to run
dataset_list = [
    'sorlie',
    'christensen',
    'alon',
]

## Transformations to time: name -> factory called as fn(n_components, random_state)
transforms_dict = dict(
    pca=get_pca,
    spca=get_spca,
    gene_spca=get_gene_spca,
)

In [3]:
# Loop to construct table of runtimes

# Initialize dictionary to store results
timed_results_dict = {}

# Loop through datasets
for data_name in dataset_list:
    
    X = data[data_name]['none']['X_train']
    timed_results_dict[data_name] = {}

    for transform_name, transform_fn in transforms_dict.items():
        print(f'{data_name} {transform_name}')

        # Instantiate transformer
        transformer_cur = transform_fn(N_COMPONENTS, SEED)

        # Time execution of fitting transformer
        timed_result = %timeit -o transformer_cur.fit(X)
        timed_results_dict[data_name][transform_name] = (timed_result.average, timed_result.stdev)

        


sorlie pca
9.73 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
--- 0.00972821125003975 seconds ---
sorlie gene_spca
50.7 ms ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
--- 0.05072041605682378 seconds ---
christensen pca
185 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
--- 0.18537338035613565 seconds ---
christensen gene_spca
4.89 s ± 495 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
--- 4.891852499850627 seconds ---
alon pca
199 ms ± 39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
--- 0.1988741101273003 seconds ---
alon gene_spca
7.72 s ± 2.88 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
--- 7.720449595429402 seconds ---


In [14]:
# Reform created dictionary into right format for dataframe
#
# One row per (dataset, statistic) pair and one column per transform.
# Each row is a {transform name: value} dict, so pandas aligns the columns by
# transform NAME rather than by position: datasets that were timed with a
# different set or order of transforms get NaN in the missing cells instead
# of silently mislabelled values, and an empty `timed_results_dict` yields an
# empty frame instead of a NameError.
STAT_NAMES = ('avg', 'stdev')  # positions 0 and 1 of each stored result tuple
reform = {
    (dname, stat): {tname: res[pos] for tname, res in res_dict.items()}
    for dname, res_dict in timed_results_dict.items()
    for pos, stat in enumerate(STAT_NAMES)
}

# Create dataframe (the tuple keys become a two-level row index)
res_runtimes = pd.DataFrame.from_dict(reform, orient='index')
# Kept for backward compatibility with code that reads `tnames`.
tnames = list(res_runtimes.columns)
res_runtimes





Unnamed: 0,Unnamed: 1,pca,gene_spca
sorlie,avg,0.009728,0.05072
sorlie,stdev,0.001029,0.002748
christensen,avg,0.185373,4.891852
christensen,stdev,0.014255,0.495207
alon,avg,0.198874,7.72045
alon,stdev,0.039008,2.882258


In [None]:
# Emit the runtime table as LaTeX source so it can be pasted into a document
latex_table = res_runtimes.to_latex()
print(latex_table)