# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Set up the `config.ini` file
- Install necessary packages
- Download and unpack data



In [1]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load
import datetime as dt
from tqdm import tqdm


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA
from helpers.config.hyperparameters import OptunaOptimzation
from helpers.config.hyperparameters import PCA_LGBM_CFG, SPCA_LGBM_CFG, GSPCA_LGBM_CFG, PCA_LR_CFG, SPCA_LR_CFG, GSPCA_LR_CFG


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so that a missing config.ini fails with
# a clear message instead of an opaque KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini could not be read from {os.getcwd()!r}; "
        "create it as described in the Setup section."
    )

# Switch the working directory to the configured project root.
# NOTE(review): presumably the directories read below are relative to this
# root — confirm. Because of this chdir, re-running the cell looks for
# config.ini inside ROOT_DIR rather than the notebook's start directory.
os.chdir(config['PATH']['ROOT_DIR'])

# Directory settings used by the cells below.
OPTUNA_DIR = config['LOGGING']['OPTUNA_DIR']
DATA_DIR = config['PATH']['DATA_DIR']


In [3]:
# Load the data library: a nested mapping that later cells index as
# data[dataset]['none']['X_train'] / ['y_train'].
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load a .lib file obtained from a trusted source.
data = load(f"{DATA_DIR}/microarray-data-dict.lib")

# Hyperparameter Optimization

In [4]:
def init_hyperparameter_configs():
    """Build a new mapping of model name -> hyperparameter config object.

    Every call instantiates each config class again, so the returned
    objects are not shared between calls.

    Returns
    -------
    dict
        Keys are 'PCA_LGBM', 'SPCA_LGBM', 'GSPCA_LGBM', 'PCA_LR', 'SPCA_LR'
        and 'GSPCA_LR' (in that order); values are new instances of the
        matching ``*_CFG`` class.
    """
    # (name, config class) pairs; order here is the order of the result.
    config_factories = (
        ('PCA_LGBM', PCA_LGBM_CFG),
        ('SPCA_LGBM', SPCA_LGBM_CFG),
        ('GSPCA_LGBM', GSPCA_LGBM_CFG),
        ('PCA_LR', PCA_LR_CFG),
        ('SPCA_LR', SPCA_LR_CFG),
        ('GSPCA_LR', GSPCA_LR_CFG),
    )
    return {label: factory() for label, factory in config_factories}

def run_all_optimizations(X_train, y_train, hyperparameter_configs, dataset, n_trials=50):
    """Run one Optuna optimization per hyperparameter config and save each study.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; passed unchanged to ``OptunaOptimzation``.
    hyperparameter_configs : dict
        Maps a model name to its hyperparameter config object. One
        optimization is run per entry, in dict order.
    dataset : str
        Dataset name; used as the sub-directory of ``OPTUNA_DIR`` that the
        study files are written to.
    n_trials : int, default 50
        Number of trials handed to each ``OptunaOptimzation``.

    Returns
    -------
    dict
        Maps each model name to the ``study`` attribute of its optimizer.
    """
    study_dict = {}
    # One timestamp per call, so every file written by this call shares a prefix.
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M")

    for name, cfg in hyperparameter_configs.items():
        # Build the path with os.path.join so it is correct whether or not
        # OPTUNA_DIR ends with a separator, and create the directory *before*
        # the search so a bad path fails fast rather than after a long run.
        # Done inside the loop so an empty config mapping touches nothing on disk.
        output_dir = os.path.join(OPTUNA_DIR, dataset)
        os.makedirs(output_dir, exist_ok=True)

        optimizer = OptunaOptimzation(
            X_train,
            y_train,
            n_trials=n_trials,
            hyperparameter_config=cfg,
            name=name,
        )
        optimizer.run()

        # Save study object
        optimizer.save_study(
            path=os.path.join(output_dir, f"{timestamp}_{name}_optuna_run.csv")
        )
        study_dict[name] = optimizer.study

    return study_dict

In [5]:
dataset_list = ['chin', 'chowdary', 'gravier', 'west']

# NOTE(review): 5 trials is well below run_all_optimizations' default of 50 —
# presumably a quick trial run; confirm before using the results.
N_TRIALS = 5

# Keep the studies returned for each dataset so they stay available in the
# kernel after the loop, instead of only existing as files on disk.
studies_by_dataset = {}

# NOTE(review): the saved output of this cell stops on the first dataset
# (X_train shape (79, 22215)) with
# "TypeError: _ArrayMemoryError.__init__() missing 1 required positional argument: 'dtype'".
# That looks like a NumPy out-of-memory error that could not be re-created
# after being passed between processes — TODO confirm, and reduce memory use
# or parallelism in the helpers if so.
for dataset in tqdm(dataset_list):
    print(f"Dataset: {dataset}")
    X_train = data[dataset]['none']['X_train']
    # Convert the labels to a flat 1-D NumPy array.
    y_train = data[dataset]['none']['y_train'].to_numpy().ravel()
    print(f"X_train shape: {X_train.shape}")

    studies_by_dataset[dataset] = run_all_optimizations(
        X_train,
        y_train,
        init_hyperparameter_configs(),
        dataset,
        n_trials=N_TRIALS,
    )

  0%|          | 0/4 [00:00<?, ?it/s]

Dataset: chin
X_train shape: (79, 22215)


  0%|          | 0/4 [08:47<?, ?it/s]


TypeError: _ArrayMemoryError.__init__() missing 1 required positional argument: 'dtype'