### Try to find out how SPCA in sklearn works

In [1]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os

# import optuna
import optuna
# Raise optuna's log threshold to WARNING so lower-severity messages are not
# printed into the notebook output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames, LoadingsSPCA

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

from joblib import dump, load


# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file (looked up relative to the current working directory)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here instead of running into an
# opaque KeyError on config['PATH'] further down.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read from " + os.getcwd()
    )

# Make the configured project root the working directory
os.chdir(config['PATH']['ROOT_DIR'])

In [3]:
# Load the raw train/test tables and the ground-truth labels from the
# locations listed in the PATH section of the config
paths = config['PATH']
raw_train = pd.read_csv(paths['RAW_TRAIN_DATA'])
raw_test = pd.read_csv(paths['RAW_TEST_DATA'])
actuals = pd.read_csv(paths['ACTUALS'])

# Pull the run parameters from the PARAMS section
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')
# NOTE: the configured value is deliberately replaced by a fixed 15 for this
# notebook; the lookup above still validates that the option exists.
N_COMPONENTS = 15

In [4]:
# Preprocessing steps, applied in the order listed:
#   1. 'drop_features' - remove the "cancer" column (the target) from the inputs
#   2. 'drop_constant' - DropConstantFeatures with tol=0.98
#   3. 'scaler'        - standardize the features, as scaling is a requirement
#                        for the variance maximization procedure of PCA
# NOTE: no duplicate-feature removal step is part of this pipeline.
preprocessing_steps = [
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    ('scaler', StandardScaler()),
]
preprocessing_pipe = Pipeline(steps=preprocessing_steps)

In [5]:
# Bring both raw tables into an accessible format and attach the actual labels
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# Pool the two sets and draw a fresh, seeded 80/20 split
full_df = pd.concat([train, test])
train80, test20 = train_test_split(full_df, test_size=0.2, random_state=SEED)

# Fit the preprocessing on the 80% part only, then apply it to the 20% part
X_train = preprocessing_pipe.fit_transform(train80)
X_test = preprocessing_pipe.transform(test20)

# Targets for the 80/20 split
y_train = train80['cancer']
y_test = test20['cancer']

In [6]:
# Dense PCA fitted on the preprocessed training data.
# The number of components comes from the shared N_COMPONENTS setting rather
# than a separate hard-coded literal, so all decompositions stay in sync.
pca = PCA(n_components=N_COMPONENTS, random_state=SEED)
pca.fit(X_train)

# Project the training data onto the fitted principal components
Xpca = pca.transform(X_train)

In [7]:
# Sparse PCA via the project's LoadingsSPCA helper, fitted on the same
# preprocessed training data as the dense PCA.
# The number of components comes from the shared N_COMPONENTS setting rather
# than a separate hard-coded literal.
lspca = LoadingsSPCA(
    n_components=N_COMPONENTS,
    random_state=SEED,
    max_iter=100,
    alpha=1,
)
lspca.fit(X_train)

# Transform the training data and show the resulting shape
Xlspca = lspca.transform(X_train)
print(Xlspca.shape)

(57, 15)


In [8]:
# Build a second sparse-PCA model through get_gene_spca, fit it on the
# training data and transform that same data.
# NOTE(review): get_gene_spca is neither imported nor defined in the visible
# part of this notebook, and the recorded run of this cell fails with a
# NameError. Presumably it lives in one of the helpers modules - confirm and
# add the import before relying on this cell.
# NOTE(review): n_components is a literal 15 here; it matches the current
# N_COMPONENTS value but is not tied to it.
gspca = get_gene_spca(l1=10, n_components=15)
gspca.fit(X_train)

Xgspca = gspca.transform(X_train)

NameError: name 'get_gene_spca' is not defined

In [None]:
# Report the `zero` attribute of both sparse models, one per line
# (presumably the number of zero-valued loadings - TODO confirm).
print(
    f"gscpa zero: {gspca.zero}",
    f"lscpa zero: {lspca.zero}",
    sep="\n",
)

gscpa zero: 350
lscpa zero: 68061
