In [34]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
import time
import sys

# helper functions
from helpers.helper_classes import Gene_SPCA, EnetSPCA

# sklearn
from sklearn.decomposition import PCA, SparsePCA

# joblib
from joblib import dump, load

# Read config.ini file (looked up relative to the current working directory)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of
# with a KeyError on config['PATH'] below.
if not config.read('config.ini'):
    raise FileNotFoundError(f"config.ini not found in {os.getcwd()}")
# All relative paths used further down are resolved against ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

# Read parameters
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

# Load in data
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load data files that come from a trusted source.
data = load(os.path.join(config['PATH']['DATA_DIR'], 'microarray-data-dict.lib'))

In [10]:
# Relevant transformations
# TODO: make SPCA and gene SPCA a fair comparison by making them use
# the same number of non-zero loadings

def get_gene_spca(n_components, random_state, alpha = 10):
    """Build an unfitted Gene_SPCA transform.

    n_components is forwarded as ``n_comps`` and alpha as ``l1``.
    random_state is accepted so that all transform factories can be called
    with the same keyword arguments; it is not forwarded.
    """
    gene_spca_kwargs = {'n_comps': n_components, 'l1': alpha}
    return Gene_SPCA(**gene_spca_kwargs)

def get_spca(n_components, random_state, alpha = 0.001):
    """Build an unfitted EnetSPCA transform.

    n_components is forwarded as ``n_comps`` and alpha as ``alpha``; the
    tolerance (0.001) and the number of jobs (6) are fixed here.
    random_state is accepted so that all transform factories can be called
    with the same keyword arguments; it is not forwarded.
    """
    enet_spca = EnetSPCA(
        n_comps = n_components,
        alpha = alpha,
        tol = 0.001,
        n_jobs = 6,
    )
    return enet_spca

def get_pca(n_components, random_state):
    """Build an unfitted scikit-learn PCA with the given size and seed."""
    pca_settings = dict(n_components=n_components, random_state=random_state)
    return PCA(**pca_settings)

In [23]:
### Config for runtime tables

## Which datasets to run

# Golub because original, Christensen because of small dataset, Chin because of large dataset, Nakayama because of large number of classes
# dataset_list = ['golub', 'christensen', 'chin', 'nakayama']

# Easy running datasets
# dataset_list = ['sorlie', 'christensen', 'alon']
# datasets = ['chin', 'chowdary', 'gravier', 'west']
datasets = ['sorlie', 'christensen']

## Which transformations to run
# Maps each transform name to the factory of the same name. The keys (in
# insertion order) are also used as the column names of the runtime table.
transforms_dict = {'pca': get_pca, 'spca': get_spca, 'gene_spca': get_gene_spca}

## Parameter settings
n_components_list = [5]
# percentage_nzero_loadings = [0.2]
fixed_alpha = None#{'spca': 40, 'gene_spca': 20}
# Number of timed fits per (dataset, transform, n_components) combination
N_TIMINGS = 2


In [24]:
# Bisection search for regularization parameter that sets nonzero loadings to a certain percentage
def get_regularisation_value(X, n_components, percentage_nzero_loadings, get_transform, lower_bound = 0.0001, upper_bound = 1000, verbose = 0, random_state = None, max_iter = 50):
    """Bisect on alpha until the fitted transform hits a target sparsity.

    The search assumes that the fraction of nonzero loadings does not grow
    when alpha grows: a fit with too many nonzero loadings moves the lower end
    of the bracket up, any other miss moves the upper end down.

    Parameters
    ----------
    X : data passed unchanged to ``fit`` of every candidate transform.
    n_components : forwarded to ``get_transform``.
    percentage_nzero_loadings : target value for ``nonzero / totloadings``
        (a fraction, not a value in percent).
    get_transform : factory called as
        ``get_transform(alpha=..., n_components=..., random_state=...)``; the
        returned object must offer ``fit(X)`` and, after fitting, the
        attributes ``nonzero`` and ``totloadings``.
    lower_bound, upper_bound : initial bracket for alpha.
    verbose : if truthy, print the bracket and the result of every step.
    random_state : seed forwarded to ``get_transform``; when None the
        notebook-level ``SEED`` is used.
    max_iter : maximum number of candidate fits before the search gives up.

    Returns
    -------
    The first midpoint whose nonzero fraction lies within 0.02 of the target.

    Raises
    ------
    ValueError
        If the lower end of the bracket has climbed to within 0.001 of
        ``upper_bound``, or if no midpoint meets the tolerance within
        ``max_iter`` fits.
    """
    if random_state is None:
        # Looked up lazily so callers that pass a seed do not need the global.
        random_state = SEED

    alpha_lower = lower_bound
    alpha_upper = upper_bound

    # Every candidate is fitted before the tolerance is checked, so a target
    # close to zero is actually searched for instead of being accepted up front.
    for _ in range(max_iter):
        # Even (almost) the largest allowed alpha leaves too many nonzero loadings.
        if upper_bound - alpha_lower < 0.001:
            raise ValueError("Correct alpha likely not in interval")

        alpha_cur = (alpha_lower + alpha_upper) / 2
        cur_transform = get_transform(alpha = alpha_cur, n_components = n_components, random_state = random_state)
        cur_transform.fit(X)
        percent_nz = cur_transform.nonzero / cur_transform.totloadings

        if verbose:
            print(f"lower: {alpha_lower}, upper: {alpha_upper}, cur: {alpha_cur}")
            print(f"percentage nonzero: {percent_nz}")
            print('-' * 40)

        if abs(percent_nz - percentage_nzero_loadings) <= 0.02:
            return alpha_cur

        if percent_nz > percentage_nzero_loadings:
            # Too dense: more regularisation is needed.
            alpha_lower = alpha_cur
        else:
            # Too sparse: less regularisation is needed.
            alpha_upper = alpha_cur

    # The bracket kept shrinking (typically towards its lower end) without ever
    # meeting the tolerance; stop instead of bisecting forever.
    raise ValueError("Correct alpha likely not in interval")


In [30]:

def _time_fits(make_transform, X, n_timings, **fit_kwargs):
    """Return the wall-clock duration in seconds of ``n_timings`` separate fits.

    A fresh transform is built by ``make_transform()`` for every repetition,
    outside the timed region, so only ``fit(X, **fit_kwargs)`` is measured.
    """
    durations = []
    for _ in range(n_timings):
        transform = make_transform()
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the system clock, which can be adjusted while a fit is running.
        start = time.perf_counter()
        transform.fit(X, **fit_kwargs)
        durations.append(time.perf_counter() - start)
    return durations


# Maps (dataset name, transform name, n_components) -> list of fit durations in seconds
results_dict = {}
# Regularisation strength of the reference SPCA fit; reused when timing SPCA
spca_alpha = 0.01

for n_components in n_components_list:
    for dname in datasets:
        print('-' * 40)
        print(f"Dataset: {dname}, n_components: {n_components}")

        X_cur = data[dname]['none']['X_train']

        # Fit SPCA once (untimed) to obtain the fraction of nonzero loadings to match
        spca_transform = get_spca(n_components = n_components, random_state = SEED, alpha = spca_alpha)
        spca_fitted = spca_transform.fit(X_cur)
        spca_nzero_percentage = spca_fitted.nonzero / spca_fitted.totloadings
        print(f"non zero % target: {spca_nzero_percentage}")

        # Find lambda value such that gene_spca has same percentage of nonzero loadings as spca.
        # The search bracket runs from 0 to four times the number of columns of X_cur.
        lambda_genespca = get_regularisation_value(X_cur, n_components, spca_nzero_percentage, get_gene_spca, lower_bound = 0, upper_bound = X_cur.shape[1] * 4, verbose = 1)

        # (printed label, key used in results_dict, factory for an unfitted transform).
        # The lambdas are called within this iteration, so they see the current
        # values of n_components and lambda_genespca.
        timing_plan = [
            ('pca', 'pca', lambda: get_pca(n_components = n_components, random_state = SEED)),
            ('spca', 'spca', lambda: get_spca(n_components = n_components, random_state = SEED, alpha = spca_alpha)),
            ('gene spca', 'gene_spca', lambda: get_gene_spca(n_components = n_components, random_state = SEED, alpha = lambda_genespca)),
        ]
        for label, tname, make_transform in timing_plan:
            print(f"Timing {label}...")
            results_dict[(dname, tname, n_components)] = _time_fits(make_transform, X_cur, N_TIMINGS)

                

            
            



----------------------------------------
Dataset: sorlie, n_components: 5
non zero % target: 0.19210526315789472


  0%|          | 1/10000 [00:00<04:18, 38.65it/s]


lower: 0, upper: 4000, cur: 2000.0
percentage nonzero: 0.0
----------------------------------------


  0%|          | 1/10000 [00:00<54:40,  3.05it/s]


lower: 0, upper: 2000.0, cur: 1000.0
percentage nonzero: 0.0
----------------------------------------


  0%|          | 18/10000 [00:00<00:55, 178.43it/s]


lower: 0, upper: 1000.0, cur: 500.0
percentage nonzero: 0.02587719298245614
----------------------------------------


  0%|          | 37/10000 [00:00<01:48, 91.81it/s]


lower: 0, upper: 500.0, cur: 250.0
percentage nonzero: 0.19429824561403508
----------------------------------------
Timing pca...
Timing spca...
Timing gene spca...
----------------------------------------
Dataset: christensen, n_components: 5
non zero % target: 0.1578202406227884


  0%|          | 37/10000 [00:00<01:38, 101.46it/s]


lower: 0, upper: 4000, cur: 2000.0
percentage nonzero: 0.15003538570417552
----------------------------------------
Timing pca...
Timing spca...
Timing gene spca...


In [46]:
# Reform created dictionary into right format for dataframe
time_dir = config['LOGGING']['TIME_DIR']
# Fixed order of the transforms; it defines the column order of the table.
transform_names = list(transforms_dict.keys())

for n_components in n_components_list:
    # One table per n_components: rows are (dataset, 'avg' | 'stdev'),
    # columns are the transforms.
    reform = {}
    for dname in datasets:
        timings = [results_dict[(dname, tname, n_components)] for tname in transform_names]
        reform[(dname, 'avg')] = [np.mean(t) for t in timings]
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        reform[(dname, 'stdev')] = [np.std(t) for t in timings]

    # Create dataframe
    res_runtimes = pd.DataFrame.from_dict(reform).T
    res_runtimes.columns = transform_names

    # Save to file; os.path.join copes with TIME_DIR given with or without a
    # trailing separator.
    fname = os.path.join(time_dir, f"runtime_table_{n_components}.txt")

    # Mode 'w' truncates an existing file, so no separate delete is needed.
    # The table holds every dataset, so caption and label are derived from the
    # full dataset list and n_components rather than from the last loop value
    # of dname.
    with open(fname, 'w') as f:
        f.write(res_runtimes.to_latex(caption = f"Runtime for {', '.join(datasets)} data, {n_components} components", label = f"tab:runtime_{n_components}"))
    print(res_runtimes)

dump(results_dict, os.path.join(time_dir, "runtime_dict.joblib"))




                        pca        spca  gene_spca
sorlie      avg    0.133314   26.534121   0.024038
            stdev  0.075122    0.808690   0.005266
christensen avg    0.045619  160.145590   0.264853
            stdev  0.016029    1.426293   0.035340


  f.write(res_runtimes.to_latex(caption = f"Runtime for {dname} data, {n_components} components", label = f"tab:runtime_{dname}"))


['./logs/runtime_dict.joblib']

In [36]:
# Write the most recently built runtime table to a standalone LaTeX file.
# Mode 'w' overwrites an existing file (it does not append).
# NOTE: relies on res_runtimes, datasets and n_components left in the kernel
# by the cells above.
with open('runtime_table.txt', 'w') as f:
    # The table holds every dataset, so caption and label do not use the
    # leftover loop variable dname.
    f.write(res_runtimes.to_latex(caption = f"Runtime for {', '.join(datasets)} data, {n_components} components", label = f"tab:runtime_{n_components}"))


  f.write(res_runtimes.to_latex(caption = f"Runtime for {dname} data, {n_components} components", label = f"tab:runtime_{dname}"))


In [42]:
# Debug check: show the path of the runtime table file; fname is assigned in
# the cell that saves the table, so that cell has to run first.
print(fname)

./logsruntime_table_christensen_5.txt


In [1]:
################################################################################
##### 1.0 IMPORT MODULES
################################################################################
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
import time

# helper functions
from helpers.helper_classes import Gene_SPCA, EnetSPCA
from helpers.helper_functions import get_regularisation_value

# sklearn
from sklearn.decomposition import PCA

# joblib
from joblib import dump, load

# Read config.ini file (looked up relative to the current working directory)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of
# with a KeyError on config['PATH'] below.
if not config.read('config.ini'):
    raise FileNotFoundError(f"config.ini not found in {os.getcwd()}")
# All relative paths used further down are resolved against ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

# Read parameters
SEED = config.getint('PARAMS', 'SEED')

# Load in data
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load data files that come from a trusted source.
data = load(os.path.join(config['PATH']['DATA_DIR'], 'microarray-data-dict.lib'))

# Relevant transformations
def get_gene_spca(n_components, random_state, alpha = 10):
    """Build an unfitted Gene_SPCA transform with a tolerance of 0.001.

    n_components is forwarded as ``n_comps`` and alpha as ``l1``.
    random_state is accepted so that all transform factories share the same
    keyword arguments; it is not forwarded.
    """
    gene_spca_kwargs = {'n_comps': n_components, 'l1': alpha, 'tol': 0.001}
    return Gene_SPCA(**gene_spca_kwargs)

def get_spca(n_components, random_state, alpha = 0.001):
    """Build an unfitted EnetSPCA transform with a tolerance of 0.001.

    n_components is forwarded as ``n_comps`` and alpha as ``alpha``.
    random_state is accepted so that all transform factories share the same
    keyword arguments; it is not forwarded.
    """
    enet_spca = EnetSPCA(
        n_comps = n_components,
        alpha = alpha,
        tol = 0.001,
    )
    return enet_spca

def get_pca(n_components, random_state):
    """Build an unfitted scikit-learn PCA with the given size and seed."""
    pca_settings = dict(n_components=n_components, random_state=random_state)
    return PCA(**pca_settings)

################################################################################
##### 2.0 Config of timing script
################################################################################

# Set datasets
# Alternative selection: ['chin', 'chowdary', 'gravier', 'west']
datasets = ['sorlie', 'christensen']
# Number of timed fits per (dataset, transform, n_components) combination
N_TIMINGS = 3
n_components_list = [5]
# The keys (in insertion order) are used as the column names of the runtime table.
transforms_dict = {'pca': get_pca, 'spca': get_spca, 'gene_spca': get_gene_spca}
################################################################################
##### 3.0 Obtain results
################################################################################


def _time_fits(make_transform, X, n_timings, **fit_kwargs):
    """Return the wall-clock duration in seconds of ``n_timings`` separate fits.

    A fresh transform is built by ``make_transform()`` for every repetition,
    outside the timed region, so only ``fit(X, **fit_kwargs)`` is measured.
    """
    durations = []
    for _ in range(n_timings):
        transform = make_transform()
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the system clock, which can be adjusted while a fit is running.
        start = time.perf_counter()
        transform.fit(X, **fit_kwargs)
        durations.append(time.perf_counter() - start)
    return durations


# Maps (dataset name, transform name, n_components) -> list of fit durations in seconds
results_dict = {}
# Settings of the reference SPCA fit; reused when timing SPCA
spca_alpha = 0.01
spca_fit_kwargs = {'n_jobs': 6}

for n_components in n_components_list:
    for dname in datasets:
        print('-' * 40)
        print(f"Dataset: {dname}, n_components: {n_components}")

        X_cur = data[dname]['none']['X_train']

        # Fit SPCA once (untimed) to obtain the fraction of nonzero loadings to match
        spca_transform = get_spca(n_components = n_components, random_state = SEED, alpha = spca_alpha)
        spca_fitted = spca_transform.fit(X_cur, **spca_fit_kwargs)
        spca_nzero_percentage = spca_fitted.nonzero / spca_fitted.totloadings
        print(f"non zero % target: {spca_nzero_percentage}")

        # Find lambda value such that gene_spca has same percentage of nonzero loadings as spca.
        # The search bracket runs from 0 to four times the number of columns of X_cur.
        lambda_genespca = get_regularisation_value(X_cur, n_components, spca_nzero_percentage, get_gene_spca, lower_bound = 0, upper_bound = X_cur.shape[1] * 4, verbose = 1, random_state = SEED)

        # (printed label, key used in results_dict, factory for an unfitted
        # transform, extra keyword arguments for fit). The lambdas are called
        # within this iteration, so they see the current values of
        # n_components and lambda_genespca.
        timing_plan = [
            ('pca', 'pca', lambda: get_pca(n_components = n_components, random_state = SEED), {}),
            ('spca', 'spca', lambda: get_spca(n_components = n_components, random_state = SEED, alpha = spca_alpha), spca_fit_kwargs),
            ('gene spca', 'gene_spca', lambda: get_gene_spca(n_components = n_components, random_state = SEED, alpha = lambda_genespca), {}),
        ]
        for label, tname, make_transform, fit_kwargs in timing_plan:
            print(f"Timing {label}...")
            results_dict[(dname, tname, n_components)] = _time_fits(make_transform, X_cur, N_TIMINGS, **fit_kwargs)

                
################################################################################
##### 4.0 Save results to table
################################################################################
            
time_dir = config['LOGGING']['TIME_DIR']
# Fixed order of the transforms; it defines the column order of the table.
transform_names = list(transforms_dict.keys())

for n_components in n_components_list:
    # One table per n_components: rows are (dataset, 'avg' | 'stdev'),
    # columns are the transforms.
    reform = {}
    for dname in datasets:
        timings = [results_dict[(dname, tname, n_components)] for tname in transform_names]
        reform[(dname, 'avg')] = [np.mean(t) for t in timings]
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        reform[(dname, 'stdev')] = [np.std(t) for t in timings]

    # Create dataframe
    res_runtimes = pd.DataFrame.from_dict(reform).T
    res_runtimes.columns = transform_names

    # Save to file; os.path.join copes with TIME_DIR given with or without a
    # trailing separator.
    fname = os.path.join(time_dir, f"runtime_table_{n_components}.txt")

    # Mode 'w' truncates an existing file, so no separate delete is needed.
    # The table holds every dataset, so caption and label are derived from the
    # full dataset list and n_components rather than from the last loop value
    # of dname.
    with open(fname, 'w') as f:
        f.write(res_runtimes.to_latex(caption = f"Runtime for {', '.join(datasets)} data, {n_components} components", label = f"tab:runtime_{n_components}"))
    print(res_runtimes)

dump(results_dict, os.path.join(time_dir, "runtime_dict.joblib"))


----------------------------------------
Dataset: sorlie, n_components: 5
non zero % target: 0.19210526315789472
lower: 0, upper: 1824, cur: 912.0
percentage nonzero: 0.0
----------------------------------------
lower: 0, upper: 912.0, cur: 456.0
percentage nonzero: 0.038157894736842106
----------------------------------------
lower: 0, upper: 456.0, cur: 228.0
percentage nonzero: 0.21842105263157896
----------------------------------------
lower: 228.0, upper: 456.0, cur: 342.0
percentage nonzero: 0.11359649122807018
----------------------------------------
lower: 228.0, upper: 342.0, cur: 285.0
percentage nonzero: 0.1631578947368421
----------------------------------------
lower: 228.0, upper: 285.0, cur: 256.5
percentage nonzero: 0.1881578947368421
----------------------------------------
Timing pca...
Timing spca...
Timing gene spca...
----------------------------------------
Dataset: christensen, n_components: 5
non zero % target: 0.1578202406227884
lower: 0, upper: 5652, cur: 282

  f.write(res_runtimes.to_latex(caption = f"Runtime for {dname} data, {n_components} components", label = f"tab:runtime_{dname}"))


['./logs/times//runtime_dict.joblib']