# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```

In [2]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load
import datetime as dt


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA
from helpers.config.hyperparameters import OptunaOptimzation
from helpers.config.hyperparameters import PCA_LGBM_CFG, SPCA_LGBM_CFG, GSPCA_LGBM_CFG, PCA_LR_CFG, SPCA_LR_CFG, GSPCA_LR_CFG


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini was not found.
# Fail here with a clear message instead of a later KeyError on a section name.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini not found in {os.getcwd()!r}; "
        "run this cell from the directory that contains it"
    )

# Switch the working directory to the configured project root.
os.chdir(config['PATH']['ROOT_DIR'])

OPTUNA_DIR = config['LOGGING']['OPTUNA_DIR']
DATA_DIR = config['PATH']['DATA_DIR']
# configparser returns every option value as a string, so DATASETS is the raw
# text from the ini file, not a list.
# NOTE(review): parse it (e.g. with ast.literal_eval) before iterating over
# dataset names — iterating the string yields single characters.
DATASETS = config['PARAMS']['DATASETS']


['[',
 "'",
 'c',
 'h',
 'i',
 'n',
 "'",
 ',',
 ' ',
 "'",
 'c',
 'h',
 'o',
 'w',
 'd',
 'a',
 'r',
 'y',
 "'",
 ',',
 ' ',
 "'",
 'g',
 'r',
 'a',
 'v',
 'i',
 'e',
 'r',
 "'",
 ',',
 ' ',
 "'",
 'w',
 'e',
 's',
 't',
 "'",
 ']']

In [None]:
# Display the raw DATASETS entry of the [PARAMS] section; configparser returns
# option values as strings.
config.get('PARAMS', 'DATASETS')

In [None]:
# Load data library
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load archives that come from a trusted source.
data = load(os.path.join(DATA_DIR, 'microarray-data-dict.lib'))

# Scoring the tuned models

In [6]:
def parse_best_params_from_csv(path) -> dict:
    """Return the hyperparameters of the best-scoring row of a results CSV.

    The CSV must contain a ``value`` column (the score of each row) and one
    ``params_<name>`` column per hyperparameter. The "best" row is the one
    with the largest ``value``.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        Mapping from hyperparameter name (the column name without its leading
        ``params_``) to the value found in the best row.

    Raises:
        ValueError: If the CSV has no rows, since there is no maximum to pick.
    """
    prefix = "params_"
    df = pd.read_csv(path)

    # idxmax returns an index *label*, which is what .loc expects, and it is
    # computed once rather than once per column.
    best_idx = df["value"].idxmax()

    # Only columns that *start* with the prefix are hyperparameters; a plain
    # substring test would also pick up columns such as
    # "user_attrs_params_x". Values are read per column (not via a row
    # Series) so each keeps its own column dtype.
    return {
        col[len(prefix):]: df.loc[best_idx, col]
        for col in df.columns
        if col.startswith(prefix)
    }

def parse_name_from_csv(path) -> str:
    """Extract the configuration name embedded in a results CSV file name.

    The file name is split on ``_`` and the third and fourth fields are
    returned joined by ``_``; for example
    ``20230131_0103_GSPCA_LR_optuna_run.csv`` gives ``GSPCA_LR``.

    Args:
        path: Path to the CSV file; any directory part is ignored.

    Returns:
        The two name fields joined by ``_``, or a shorter (possibly empty)
        string when the file name has fewer than four fields.
    """
    # os.path.basename honours the platform's path separators, whereas
    # splitting on "/" leaves the directory attached on Windows.
    filename = os.path.basename(path)
    return "_".join(filename.split("_")[2:4])

In [7]:
# One hyperparameter-config object per tuned combination, keyed by the name
# that parse_name_from_csv extracts from each results file name.
hyperparameter_configs = {
    'PCA_LGBM': PCA_LGBM_CFG(),
    'SPCA_LGBM': SPCA_LGBM_CFG(),
    'GSPCA_LGBM': GSPCA_LGBM_CFG(),
    'PCA_LR': PCA_LR_CFG(),
    'SPCA_LR': SPCA_LR_CFG(),
    'GSPCA_LR': GSPCA_LR_CFG(),
}

# os.listdir returns entries in arbitrary order; sorting makes the printed
# scores (and the values left in `cfg`/`best_params` after the loop) the same
# on every run.
for file in sorted(os.listdir(OPTUNA_DIR)):
    if not file.endswith(".csv"):
        continue

    print(file)
    path = os.path.join(OPTUNA_DIR, file)

    # The file name selects the config; the file contents supply the best
    # hyperparameters. A name without a matching config raises KeyError.
    name = parse_name_from_csv(path)
    best_params = parse_best_params_from_csv(path)
    cfg = hyperparameter_configs[name]

    # The full set of best parameters is passed to both helpers.
    # NOTE(review): presumably each helper picks out the parameters it needs —
    # verify against helpers.helper_functions.
    pipe = Pipeline([
        ("pca", get_pca_pipeline(cfg.get_params()["pca"]["method"], **best_params)),
        ("model", get_model(cfg.get_model(static=True), **cfg.get_params()["static"], **best_params)),
    ])

    # NOTE(review): X_train, y_train, X_test and y_test are not defined in the
    # cells visible here; they must already exist in the kernel. Confirm where
    # the train/test split is created so the notebook survives Restart & Run All.
    pipe.fit(X_train, y_train)
    print(pipe.score(X_test, y_test))

20230131_0103_GSPCA_LR_optuna_run.csv
1.0
20230131_0309_PCA_LGBM_optuna_run.csv
0.7333333333333333
20230131_0103_SPCA_LR_optuna_run.csv
0.0
0.7333333333333333
20230131_0103_SPCA_LGBM_optuna_run.csv
0.0
0.7333333333333333
20230131_0103_GSPCA_LGBM_optuna_run.csv
0.7333333333333333
20230131_0103_PCA_LGBM_optuna_run.csv
0.7333333333333333
20230131_0103_PCA_LR_optuna_run.csv
1.0




In [13]:
# Rebuild a pipeline outside the scoring loop. `cfg` and `best_params` are not
# assigned in this cell, so they hold whatever the last iteration of that loop
# left behind.
# NOTE(review): the file the loop processed last determines the configuration
# used here — confirm it is the intended one, or select it explicitly by name.
pipe = Pipeline([
        ("pca", get_pca_pipeline(cfg.get_params()["pca"]["method"], **best_params)),
        ("model", get_model(cfg.get_model(static=True), **cfg.get_params()["static"], **best_params)),
])
    
# Fitting and scoring are left disabled in this cell.
# pipe.fit(X_train, y_train)
# print(pipe.score(X_test, y_test))