In [115]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
import tarfile

# import sweetviz

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import set_config

# Joblib
from joblib import dump, load

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

# Progress bar
from tqdm import tqdm

In [116]:
# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
os.chdir(config['PATH']['ROOT_DIR'])

SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

In [131]:
set_config(transform_output='pandas')

def get_PCA(n_components = 20):
    return Pipeline([
    # Imputing missing values
        ('imputer', SimpleImputer(missing_values=np.nan, strategy = 'mean')),

    # # Step 0:
    #     # Drop constant and duplicate features
        ('drop_constant', DropConstantFeatures(tol=0.98)),

    # # Step 1:
    #     # Apply scaling to data as it is a requirement for the variance maximization procedure of PCA
        ('scaler', StandardScaler()),
    # Step 2:
        # Apply PCA
        ('pca', PCA(n_components=n_components, random_state=SEED)),
    ])

def get_SPCA(n_components = 20):
    return Pipeline([
    
    # Imputing missing values
        ('imputer', SimpleImputer(missing_values=np.nan, strategy = 'mean')),

    # # Step 0:
    #     # Drop constant and duplicate features
        ('drop_constant', DropConstantFeatures(tol=0.98)),

    # Step 1:
        # Apply scaling to data as it is a requirement for the variance maximization procedure of PCA
        ('scaler', StandardScaler()),
    # Step 2:
        # Apply PCA
        ('spca', SparsePCA(n_components=n_components, random_state=SEED, alpha=1, max_iter=400, n_jobs = -1)),
    ])

def get_none_pipeline():
    return Pipeline([
        # Imputing missing values
        ('imputer', SimpleImputer(missing_values=np.nan, strategy = 'mean')),

        # Step 0:
            # Drop constant and duplicate features
        ('drop_constant', DropConstantFeatures(tol=0.98)),

        # Step 1:
            # Scale features as most methods utilized benefit from scaling s.t. no one feature dominates.
        ('scaler', StandardScaler()),
    ])

In [136]:
# Construct set of unique dataset names
dnames = os.listdir(config['PATH']['MICR_CSV'])
dset = set()
for fname in dnames: dset.add(fname[:fname.find('_')])


In [137]:
# construct data dictionary.
data_full = {}
data_full_fname = config['PATH']['DATA_DIR'] + '/micro_dict.lib'

for i, name in enumerate(tqdm((dset))):
    print(name)
    data_full[name] = {}
    X_cur = pd.read_csv(config['PATH']['MICR_CSV'] + '/' + name + '_inputs.csv', header = None)
    y_cur = pd.read_csv(config['PATH']['MICR_CSV'] + '/' + name + '_outputs.csv', header = None)
    X_train, X_test, y_train, y_test = train_test_split(X_cur, y_cur, test_size = 0.33, random_state=SEED)
    
    for key in ['none','pca']:
        match key:
            case 'none': pipe = get_none_pipeline()
            case 'pca': pipe = get_PCA()
            case 'spca': pipe = get_SPCA()

        data_full[name][key] = {}
        data_full[name][key]['X_train'] = pipe.fit_transform(X_train)
        data_full[name][key]['X_test'] = pipe.transform(X_test)
        data_full[name][key]['y_train'] = y_train
        data_full[name][key]['y_test'] = y_test
    
dump(data_full, data_full_fname)




  0%|          | 0/22 [00:00<?, ?it/s]

pomeroy


  5%|▍         | 1/22 [00:09<03:15,  9.33s/it]

gordon


  9%|▉         | 2/22 [00:26<04:39, 13.95s/it]

nakayama


 14%|█▎        | 3/22 [00:55<06:36, 20.84s/it]

singh


 18%|█▊        | 4/22 [01:11<05:41, 18.99s/it]

sorlie


 23%|██▎       | 5/22 [01:12<03:30, 12.37s/it]

shipp


 27%|██▋       | 6/22 [01:21<03:01, 11.37s/it]

chowdary


 32%|███▏      | 7/22 [01:50<04:15, 17.01s/it]

west


 36%|███▋      | 8/22 [01:59<03:22, 14.49s/it]

khan


 41%|████      | 9/22 [02:02<02:21, 10.92s/it]

borovecki


 45%|████▌     | 10/22 [02:29<03:08, 15.74s/it]

burczynski


 50%|█████     | 11/22 [02:57<03:36, 19.72s/it]

subramanian


 55%|█████▍    | 12/22 [03:10<02:56, 17.65s/it]

yeoh


 59%|█████▉    | 13/22 [03:28<02:39, 17.68s/it]

golub


 64%|██████▎   | 14/22 [03:38<02:02, 15.25s/it]

chin


 68%|██████▊   | 15/22 [04:07<02:15, 19.37s/it]

alon


 73%|███████▎  | 16/22 [04:09<01:25, 14.31s/it]

su


 77%|███████▋  | 17/22 [04:16<01:00, 12.19s/it]

gravier


 82%|████████▏ | 18/22 [04:20<00:39,  9.76s/it]

tian


 86%|████████▋ | 19/22 [04:37<00:35, 11.91s/it]

chiaretti


 91%|█████████ | 20/22 [04:54<00:26, 13.21s/it]

christensen


 95%|█████████▌| 21/22 [04:56<00:09,  9.85s/it]

sun


100%|██████████| 22/22 [06:16<00:00, 17.13s/it]


{'pomeroy': {'none': {'X_train':           x0        x1        x2        x3        x4        x5        x6  \
   26  0.624296 -0.160397 -0.147273 -0.815697  0.002599  0.009966  1.108944   
   16  0.248573  1.171522  0.044048 -1.365656 -1.108389 -1.220355 -0.176027   
   27  1.136926  1.243774  1.457310  0.281124  2.692620  0.770732 -0.725244   
   40  0.856173 -0.589873  1.732708  1.466928 -1.483029  0.775028 -0.657582   
   2  -0.137869  0.714059 -0.453599  0.301996  1.671698  1.367755  0.503724   
   58 -0.313656 -0.829255  0.874226  2.842749  1.367426  1.248076  1.524937   
   20  0.837117  1.380932  0.915222 -0.278764  0.955401  1.524925  0.337223   
   46  2.137369  0.653783  0.211631 -0.588089 -0.294220  0.258672  0.253278   
   31  0.846025  1.612250  0.656495  1.490719  0.536986  1.543162  2.526739   
   21  0.767054  1.073887  0.121346 -0.088695  0.287594 -1.112417 -0.269043   
   48  0.320735 -0.177105  0.367484 -0.340330 -0.061655  0.100678 -0.039097   
   45  0.256764 -0.170