In [83]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
import tarfile

# import sweetviz

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config

# Joblib
from joblib import dump, load

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

# Progress bar
from tqdm import tqdm

In [26]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here instead of with a confusing
# KeyError on config['PATH'] a few lines further down.
if not config.read('src/config.ini'):
    raise FileNotFoundError(
        f"Could not read 'src/config.ini' (current working directory: {os.getcwd()})"
    )

# Switch the working directory to the configured root; relative paths used
# afterwards resolve against it. NOTE: because of this chdir, re-running the
# cell looks for 'src/config.ini' relative to ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

# Parameters shared by the rest of the notebook
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

In [104]:
# Globally configure scikit-learn so that transform / fit_transform return
# pandas DataFrames (with column names) instead of NumPy arrays.
set_config(transform_output='pandas')

def get_PCA(n_components = 20):
    """Build an unfitted pipeline: constant-feature removal -> scaling -> PCA.

    Parameters
    ----------
    n_components : int, default 20
        Number of components handed to ``PCA``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'drop_constant'``, ``'scaler'`` and ``'pca'``,
        in that order. ``PCA`` is seeded with the notebook-level ``SEED``.
    """
    steps = [
        # Step 0: drop constant / quasi-constant features, i.e. features
        # dominated by a single value (tol=0.98).
        ('drop_constant', DropConstantFeatures(tol=0.98)),
        # Step 1: standardise the features; PCA maximises variance, so
        # unscaled features would dominate the components.
        ('scaler', StandardScaler()),
        # Step 2: apply PCA.
        ('pca', PCA(n_components=n_components, random_state=SEED)),
    ]
    return Pipeline(steps)

def get_SPCA(n_components = 20):
    """Build an unfitted pipeline: constant-feature removal -> scaling -> SparsePCA.

    Parameters
    ----------
    n_components : int, default 20
        Number of sparse components handed to ``SparsePCA``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'drop_constant'``, ``'scaler'`` and ``'spca'``,
        in that order. ``SparsePCA`` is seeded with the notebook-level ``SEED``.
    """
    steps = [
        # Step 0: drop constant / quasi-constant features, i.e. features
        # dominated by a single value (tol=0.98).
        ('drop_constant', DropConstantFeatures(tol=0.98)),
        # Step 1: standardise the features so that no single feature
        # dominates the decomposition.
        ('scaler', StandardScaler()),
        # Step 2: apply Sparse PCA (alpha=1, at most 400 iterations,
        # n_jobs=-1 to run in parallel).
        ('spca', SparsePCA(n_components=n_components, random_state=SEED, alpha=1, max_iter=400, n_jobs = -1)),
    ]
    return Pipeline(steps)

def get_none_pipeline():
    """Build the unfitted baseline pipeline without dimensionality reduction.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'drop_constant'`` and ``'scaler'``, in that order.
    """
    steps = [
        # Step 0: drop constant / quasi-constant features, i.e. features
        # dominated by a single value (tol=0.98).
        ('drop_constant', DropConstantFeatures(tol=0.98)),
        # Step 1: scale features, as most methods used downstream benefit
        # from scaling so that no one feature dominates.
        ('scaler', StandardScaler()),
    ]
    return Pipeline(steps)

In [105]:
# Construct set of unique dataset names.
# Datasets are read further down as the file pair <name>_inputs.csv /
# <name>_outputs.csv, so the names are derived from the input files only.
# Slicing at the first '_' (the previous approach) chopped the last character
# off any file name without an underscore (str.find returns -1) and truncated
# dataset names that themselves contain an underscore.
dnames = os.listdir(config['PATH']['MICR_CSV'])
dset = {
    fname.removesuffix('_inputs.csv')
    for fname in dnames
    if fname.endswith('_inputs.csv')
}

In [108]:
# Construct the data dictionary:
#   data_full[dataset][variant] -> {'X_train', 'X_test', 'y_train', 'y_test'}
# where variant is one of 'none', 'pca' or 'spca'.
data_full = {}
data_full_fname = os.path.join(config['PATH']['DATA_DIR'], 'micro_dict.lib')

# One factory per preprocessing variant, so that every dataset gets a fresh,
# unfitted pipeline. The number of components comes from config.ini
# (N_COMPONENTS) instead of the factories' hard-coded default.
pipeline_factories = {
    'none': get_none_pipeline,
    'pca': lambda: get_PCA(N_COMPONENTS),
    'spca': lambda: get_SPCA(N_COMPONENTS),
}

# Iterate in sorted order so the processing order is the same on every run
# (iteration order of a set of strings is not stable across interpreter runs).
for name in tqdm(sorted(dset)):
    csv_prefix = os.path.join(config['PATH']['MICR_CSV'], name)
    X_cur = pd.read_csv(csv_prefix + '_inputs.csv', header = None)
    y_cur = pd.read_csv(csv_prefix + '_outputs.csv', header = None)
    X_train, X_test, y_train, y_test = train_test_split(X_cur, y_cur, test_size = 0.33, random_state=SEED)

    data_full[name] = {}
    for key, build_pipe in pipeline_factories.items():
        pipe = build_pipe()
        data_full[name][key] = {
            # Fit on the training split only; the test split is transformed
            # with the already-fitted pipeline.
            'X_train': pipe.fit_transform(X_train),
            'X_test': pipe.transform(X_test),
            'y_train': y_train,
            'y_test': y_test,
        }

# Persist the complete dictionary with joblib.
dump(data_full, data_full_fname)

# Show a compact summary (dataset -> available variants) rather than dumping
# every DataFrame held in the dictionary.
{name: list(variants) for name, variants in data_full.items()}




  5%|▍         | 1/22 [11:32<4:02:26, 692.70s/it]

In [53]:
# List the names of the datasets currently held in data_full, one per line
for dataset_name in data_full:
    print(dataset_name)

pomeroy
gordon
nakayama
singh
sorlie


In [103]:
# Inspect one entry: training features of the 'singh' dataset produced by the 'none' pipeline
data_full['singh']['none']['X_train']

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x12590,x12591,x12592,x12593,x12594,x12595,x12596,x12597,x12598,x12599
57,0.232325,1.339437,-0.193075,0.425004,-0.087190,-1.475168,0.276293,0.961381,-0.068136,-0.440988,...,0.817534,0.001142,-0.480134,0.039998,0.460538,-0.447139,1.043544,2.430438,-1.013976,-0.302381
64,0.502070,-0.034789,0.026109,-0.632799,0.175733,0.467637,0.276293,-0.453056,0.722186,0.610579,...,-0.489823,0.287217,-0.119370,-0.424364,-0.638289,-0.676354,0.275805,-0.287937,0.594603,-0.185982
10,0.412155,-0.427425,-0.412258,-0.354429,-0.087190,0.592979,0.176144,-0.453056,0.574000,0.566764,...,-0.472392,0.573292,-0.600389,0.902383,-0.590514,-0.217923,-0.382257,-0.065524,0.262397,0.454213
101,0.502070,-0.623742,-0.412258,-0.688472,0.175733,0.969006,0.276293,-0.453056,0.722186,0.873471,...,-0.960472,0.450688,-0.600389,-0.888726,-0.829390,-0.676354,-0.162903,-0.411499,0.804418,0.221415
42,0.771816,-0.525584,-0.412258,-0.521451,0.570118,0.655651,0.476591,-0.655118,0.623395,1.004917,...,-0.803589,0.981970,-0.359880,0.968721,-0.829390,-0.332531,-0.601611,-0.386787,0.856872,-0.069583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,-0.217251,-0.623742,-0.412258,-0.465777,-0.481575,0.342295,0.176144,0.287840,0.524605,0.347687,...,-0.245783,0.532424,0.241393,-0.756051,0.126112,0.011292,0.495159,-0.065524,0.384789,-0.069583
55,-1.565978,0.750483,0.902843,-0.020386,-1.138882,-0.660443,-0.625046,0.422548,-1.451199,-0.747696,...,1.026711,-0.284933,-0.600389,0.637034,0.747188,-0.103316,2.030637,-1.029312,-0.856615,-0.884376
70,-1.835724,1.830232,3.094678,2.317914,2.147656,-3.167288,-3.529362,3.520838,-1.846360,-2.456493,...,2.264343,-0.121462,0.602157,0.371684,-0.303864,3.793349,1.043544,3.913188,-2.028080,-0.884376
25,0.322240,-0.427425,-0.631442,-0.521451,0.175733,0.154282,0.176144,-0.318347,0.623395,0.479133,...,-0.350372,0.736763,0.241393,0.172672,-0.399414,-0.561747,-0.711288,-0.362074,0.419758,0.454213


In [38]:
# Scratch check: display the header-less input CSV of the 'alon' dataset.
data = {}  # NOTE(review): not referenced in the visible cells — candidate for removal

pd.read_csv(config['PATH']['MICR_CSV'] + '/alon_inputs.csv', header = None)

In [97]:
# Ad-hoc check of the PCA pipeline (despite its name, none_test holds the
# get_PCA pipeline, not the 'none' one).
# NOTE(review): X_train is not defined in this cell; it appears to be whatever
# the dataset loop assigned last, so the result depends on execution order.
none_test = get_PCA()
X_train_test = none_test.fit_transform(X_train)

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca11,pca12,pca13,pca14,pca15,pca16,pca17,pca18,pca19
30,20.440101,-3.811074,0.594319,-6.557925,-7.442944,-1.131467,0.844079,3.275581,1.188397,7.103636,4.455097,-1.954079,0.372631,0.477591,0.716407,1.165442,-2.881774,0.236966,0.605114,-2.616789
19,3.450773,2.798767,-8.513769,4.393991,3.727832,-0.277508,-1.492326,-4.010592,0.213973,2.038897,-2.454903,-0.030035,-3.914302,-0.842849,0.472705,1.436303,1.523705,1.943343,-3.850913,-2.118215
12,6.41583,6.622438,-6.985503,-7.269321,-1.664058,0.103458,-3.2301,-0.216137,4.524762,-3.566562,-1.02482,0.185461,-0.831278,-0.876454,1.7266,2.571256,0.123579,-0.467447,-3.413303,-1.300387
5,4.542911,10.155496,7.807916,-4.803216,4.463096,0.894511,0.382868,-4.014326,0.436491,-0.144044,-1.265117,2.864059,-4.362784,-0.152467,-3.177127,-3.062784,1.177835,-3.914148,-3.877683,0.562964
31,19.636808,-5.635958,0.342275,1.307415,-0.02693,-4.449698,0.082468,-1.044644,-6.234148,-1.150459,-1.685922,-1.917363,2.124244,0.311599,-0.715003,1.013579,0.214803,-1.320486,-1.037413,-0.107384
60,-10.183159,-4.270386,3.672666,-5.823921,-7.385792,-6.848818,-3.533494,-0.12957,5.152181,5.400424,-4.482848,-5.983783,-7.933341,6.624475,-3.271114,5.54294,3.704898,0.972332,3.172313,3.516079
11,-3.322548,18.51659,-3.217664,-5.443171,-5.678812,-3.388143,6.972107,0.893458,-4.383499,-1.816557,0.987788,-4.075346,-0.113141,-0.467158,6.429724,-2.189994,0.654213,-1.022618,6.636449,-1.438491
74,-2.021994,1.713425,2.319863,4.056108,3.923745,4.78559,10.159957,-4.969023,1.139351,4.576543,1.750361,-4.119599,1.154245,4.228843,-1.16626,0.959015,-1.784226,-0.754563,1.146164,3.555079
59,0.747171,-8.175727,-4.063865,-3.26063,9.527007,-5.639706,3.64293,1.956506,4.679027,-6.113483,3.335957,-1.216782,-0.344053,3.613459,0.401817,0.45201,-3.688869,-1.903845,4.9612,-3.212863
57,-4.260461,-2.982526,-6.083386,-1.260116,1.697067,0.689665,2.142687,4.061497,5.106904,-5.787631,6.856139,-1.456636,1.390879,-6.145746,1.628397,4.143848,-0.948616,0.429839,1.95176,6.429444


In [63]:
# Display the training split currently bound to X_train.
# NOTE(review): presumably left over from the dataset loop — confirm which dataset this is.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,446,447,448,449,450,451,452,453,454,455
30,0.92,-0.817,2.63,-1.63,-0.21,-1.02,-1.17,-0.79,0.04,1.23,...,-0.63,0.4,-1.0,-0.46,-0.34,-0.76,-1.34,-3.76,-1.12,-3.18
19,2.38,0.82,0.86,3.66,-0.61,-0.71,0.33,-0.17,-1.41,-0.04,...,0.95,0.6,0.56,0.06,0.27,0.66,-0.148,-1.69,0.18,2.15
12,1.6,0.22,0.28,2.81,-1.71,-1.58,-1.05,-0.87,0.17,-0.07,...,-0.82,-0.66,1.81,0.42,0.41,0.16,-0.69,-0.43,-0.25,0.3
5,-0.36,-0.15,-0.67,-0.91,-2.85,-2.4,-1.67,1.97,0.15,0.11,...,0.74,0.87,-2.77,-2.01,-2.41,-1.68,-1.52,-0.29,-0.88,-0.2
31,0.48,0.04,-0.5,-0.43,-0.02,-0.69,-0.93,0.09,0.67,1.6,...,-0.36,0.0,0.86,1.0,1.38,0.32,1.64,1.98,0.18,1.73
60,-0.76,1.86,2.83,0.87,-1.56,-1.41,-0.55,-1.22,-0.486,-0.06,...,-0.24,-0.73,4.39,3.71,3.51,5.28,0.414,0.36,-0.73,-0.418
11,0.55,1.43,-0.233,1.83,-2.47,-2.32,-0.97,-0.314,-0.312,0.67,...,-2.15,-5.26,-1.71,-0.77,-0.85,-2.37,0.43,0.35,0.053,0.27
74,-0.198,0.0,-1.03,-1.86,-0.71,-0.68,-0.39,0.3,-0.79,-0.69,...,-8.02,-0.82,-1.67,-0.78,-0.9,-1.33,0.12,0.0,-0.56,-0.29
59,-0.55,-0.44,-0.08,1.2,0.18,-0.24,-0.1,1.08,0.78,-0.6,...,-1.06,-0.4,-1.5,-2.39,-1.53,0.582,0.65,1.16,-1.48,-0.91
57,-0.256,-0.52,-0.748,-0.89,-0.49,0.06,0.43,0.34,-0.261,-0.47,...,-0.36,0.08,1.92,-0.46,0.77,2.85,0.73,0.54,0.13,-0.228
