# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```

In [1]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load
import datetime as dt
import json
import copy


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA
from helpers.config.hyperparameters import OptunaOptimzation
from helpers.config.hyperparameters import PCA_LGBM_CFG, SPCA_LGBM_CFG, GSPCA_LGBM_CFG, PCA_LR_CFG, SPCA_LR_CFG, GSPCA_LR_CFG


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing config.ini fails
# here with a clear message instead of a later KeyError on config['PATH'].
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini not found in working directory {os.getcwd()!r}"
    )

# NOTE: this changes the working directory of the kernel, so every relative
# path used afterwards (including 'config.ini' on a re-run of this cell) is
# resolved against ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

OPTUNA_DIR = config['LOGGING']['OPTUNA_DIR']
DATA_DIR = config['PATH']['DATA_DIR']
# DATASETS is stored in the ini file as a JSON-encoded value and decoded here.
DATASETS = json.loads(config.get('PARAMS', 'DATASETS'))
PIPE_DIR = config['LOGGING']['PIPE_DIR']

# Scoring the tuned models

In [26]:
def parse_best_params_from_csv(path) -> dict:
    """Return the hyperparameters of the best-scoring row of a trials CSV.

    The CSV must contain a ``value`` column; the row with the largest
    ``value`` is taken as the best one. Every column whose name contains
    ``params`` is treated as a hyperparameter column.

    Parameters
    ----------
    path
        Anything accepted by ``pandas.read_csv`` (file path or buffer).

    Returns
    -------
    dict
        Maps each hyperparameter name (the column name with its first
        underscore-separated token removed, e.g. ``params_n_components`` ->
        ``n_components``) to the value found in the best row.
    """
    df = pd.read_csv(path)

    # Locate the best row once (it does not depend on the column) and by
    # label, so that it is consistent with the label-based ``.loc`` below.
    best_label = df["value"].idxmax()

    # Index column by column rather than extracting the whole row: a row
    # Series would upcast mixed int/float columns to a common dtype and turn
    # integer hyperparameters into floats.
    params = {
        "_".join(col.split("_")[1:]): df.loc[best_label, col]
        for col in df.columns
        if "params" in col
    }

    return params

def parse_name_from_csv(path) -> str:
    """Extract the configuration name encoded in a CSV file name.

    The file name (last path component) is split on underscores and its
    third and fourth tokens are joined back with an underscore, e.g.
    ``a_b_PCA_LGBM_x.csv`` -> ``"PCA_LGBM"``.

    Parameters
    ----------
    path : str or os.PathLike
        Path to the CSV file; only its last component is inspected.

    Returns
    -------
    str
        The two-token name taken from the file name.
    """
    # os.path.basename honours the platform's path separators (the paths are
    # built with os.path.join) and also accepts pathlib.Path objects, unlike
    # a hard-coded split on "/".
    filename = os.path.basename(path)
    return "_".join(filename.split("_")[2:4])

def init_hyperparameter_configs():
    """Create a new hyperparameter-config instance for every pipeline name.

    Returns
    -------
    dict
        Maps each pipeline name (e.g. ``"PCA_LGBM"``) to a freshly
        constructed instance of the matching ``*_CFG`` class.
    """
    # (name, config class) pairs; each class is instantiated in this order.
    config_classes = (
        ("PCA_LGBM", PCA_LGBM_CFG),
        ("SPCA_LGBM", SPCA_LGBM_CFG),
        ("GSPCA_LGBM", GSPCA_LGBM_CFG),
        ("PCA_LR", PCA_LR_CFG),
        ("SPCA_LR", SPCA_LR_CFG),
        ("GSPCA_LR", GSPCA_LR_CFG),
    )
    return {label: config_cls() for label, config_cls in config_classes}

In [27]:
# One independent dict per dataset. dict.fromkeys(DATASETS, {}) would bind
# every key to the *same* dict object, so a write for one dataset would show
# up under all of them.
fitted_pipelines = {dataset: {} for dataset in DATASETS}

In [28]:
# Load data library
# NOTE(security): joblib.load unpickles the file; only load libraries from a
# trusted source.
data = load(DATA_DIR + '/microarray-data-dict.lib')

hyperparameter_configs = init_hyperparameter_configs()
# One independent result dict per dataset. dict.fromkeys(DATASETS, {}) would
# make every dataset share the same dict, so the pipelines fitted on one
# dataset would overwrite (and be reported for) all the others.
fitted_pipelines = {dataset: {} for dataset in DATASETS}

for dataset in DATASETS:
    X_train, X_test = data[dataset]["none"]["X_train"], data[dataset]["none"]["X_test"]
    y_train, y_test = data[dataset]["none"]["y_train"], data[dataset]["none"]["y_test"]

    # Every CSV in the dataset's Optuna directory describes one tuned
    # (decomposition, model) combination.
    for file in os.listdir(os.path.join(OPTUNA_DIR, dataset)):
        if not file.endswith(".csv"):
            continue

        path = os.path.join(OPTUNA_DIR, dataset, file)

        # The file name selects the config; the file content gives the best
        # hyperparameters found for it.
        name = parse_name_from_csv(path)
        best_params = parse_best_params_from_csv(path)
        cfg = hyperparameter_configs[name]

        pipe = Pipeline([
            ("pca", get_pca_pipeline(cfg.get_params()["pca"]["method"], **best_params)),
            ("model", get_model(cfg.get_model(static=True), **cfg.get_params()["static"], **best_params)),
        ])

        pipe.fit(X_train, y_train)

        # Save pipeline in dictionary
        fitted_pipelines[dataset][name] = copy.deepcopy(pipe)

# Make sure the output directory exists so the fitted pipelines are not lost
# to a missing-directory error after the (expensive) fitting loop.
os.makedirs(PIPE_DIR, exist_ok=True)

# dump to joblib
dump(fitted_pipelines, os.path.join(PIPE_DIR, 'fitted-pipelines.lib'))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


['./logs/pipelines/fitted-pipelines.lib']

In [31]:
# Print each dataset's name followed by the shape of its training matrix.
for datasets, splits in data.items():
    X_train = splits["none"]["X_train"]
    print(datasets, X_train.shape, sep="\n")

yeoh
(166, 12625)
nakayama
(70, 22283)
golub
(48, 7129)
khan
(42, 2308)
west
(32, 7129)
alon
(41, 2000)
subramanian
(33, 10100)
burczynski
(85, 22283)
chin
(79, 22215)
borovecki
(20, 22283)
shipp
(51, 7129)
tian
(115, 12625)
gordon
(121, 12533)
chiaretti
(85, 12625)
sorlie
(56, 456)
chowdary
(69, 22283)
sun
(120, 54613)
pomeroy
(40, 7128)
gravier
(112, 2905)
su
(68, 5556)
christensen
(145, 1413)
singh
(68, 12600)
