# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Setup config.ini file
- Install necessary packages
- Download and unpack data



In [1]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load
import datetime as dt


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA
from helpers.config.hyperparameters import OptunaOptimzation
from helpers.config.hyperparameters import PCA_LGBM_CFG, SPCA_LGBM_CFG, GSPCA_LGBM_CFG, PCA_LR_CFG, SPCA_LR_CFG, GSPCA_LR_CFG


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Without this check a missing config.ini would
# only surface as an opaque KeyError on the ['PATH'] lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini could not be read from the working directory {os.getcwd()!r}"
    )

# Make ROOT_DIR the working directory, so any relative paths used after this
# point are resolved against it.
os.chdir(config['PATH']['ROOT_DIR'])

In [3]:
# Load the three CSV inputs whose locations are listed in the [PATH] section
raw_train, raw_test, actuals = (
    pd.read_csv(config['PATH'][path_key])
    for path_key in ('RAW_TRAIN_DATA', 'RAW_TEST_DATA', 'ACTUALS')
)

# Integer run parameters from the [PARAMS] section
SEED, N_COMPONENTS = (
    config.getint('PARAMS', option)
    for option in ('SEED', 'N_COMPONENTS')
)

# Location for the saved Optuna runs, from the [LOGGING] section
OPTUNA_DIR = config.get('LOGGING', 'OPTUNA_DIR')

### Setup preprocessing, PCA, and SparsePCA

In [4]:
# Preprocessing pipeline, applied in this order:
#   drop_features - removes the "cancer" column (the target) from the feature matrix
#   drop_constant - removes constant / quasi-constant features (tol=0.98)
#   scaler        - standardizes the remaining features; scaling is a requirement
#                   for the variance maximization procedure of PCA
preprocessing_pipe = Pipeline(steps=[
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    ('scaler', StandardScaler()),
])

### Data preparation (80/20 train/test split)

In [5]:
# Transform both raw sets to an accessible format and attach the actuals
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# Pool the two sets and re-split them 80/20 (seeded for reproducibility)
full_df = pd.concat([train, test])
train80, test20 = train_test_split(full_df, test_size=0.2, random_state=SEED)

# Features: fit the preprocessing on the training part only, then apply it to the test part
X_train = preprocessing_pipe.fit_transform(train80)
X_test = preprocessing_pipe.transform(test20)

# Targets: the 'cancer' column of each part
y_train = train80['cancer']
y_test = test20['cancer']

# Hyperparameter Optimization

## LightGBM and Logistic Regression

In [6]:
# One hyperparameter configuration object per combination of
# decomposition (PCA / SPCA / GSPCA) and model (LGBM / LR), keyed by its label
hyperparameter_configs = dict(
    PCA_LGBM=PCA_LGBM_CFG(),
    SPCA_LGBM=SPCA_LGBM_CFG(),
    GSPCA_LGBM=GSPCA_LGBM_CFG(),
    PCA_LR=PCA_LR_CFG(),
    SPCA_LR=SPCA_LR_CFG(),
    GSPCA_LR=GSPCA_LR_CFG(),
)

def run_all_optimizations(X_train, y_train, hyperparameter_configs, n_trials=50, output_dir=None):
    """Run one optimization per hyperparameter configuration and save each run.

    For every ``(name, cfg)`` pair an ``OptunaOptimzation`` is built, executed
    with ``run()``, saved with ``save_study()`` and its ``study`` attribute is
    collected. An exception raised by any run is not caught here, so it aborts
    the remaining configurations.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed through to ``OptunaOptimzation``.
    hyperparameter_configs : dict
        Mapping of run name to hyperparameter configuration object.
    n_trials : int, default 50
        Number of trials, passed through to ``OptunaOptimzation``.
    output_dir : str or None, default None
        Directory for the saved runs. ``None`` falls back to the notebook-level
        ``OPTUNA_DIR``.

    Returns
    -------
    dict
        Mapping of run name to the optimizer's ``study`` attribute.
    """
    if output_dir is None:
        output_dir = OPTUNA_DIR

    # Create the target directory up front, so a missing directory is detected
    # before any optimization time is spent. An empty string means "current
    # directory" and must not be passed to makedirs.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # A single timestamp shared by all files written during this call.
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M")

    study_dict = {}
    for name, cfg in hyperparameter_configs.items():
        optimizer = OptunaOptimzation(
            X_train,
            y_train,
            n_trials=n_trials,
            hyperparameter_config=cfg,
            name=name,
        )
        optimizer.run()

        # os.path.join yields a correct path whether or not the directory
        # setting ends with a path separator.
        optimizer.save_study(
            path=os.path.join(output_dir, f"{timestamp}_{name}_optuna_run.csv")
        )
        study_dict[name] = optimizer.study

    return study_dict

In [7]:
# Short run with only 3 trials per configuration. The returned studies are
# bound to a name so they can be inspected later without re-running.
study_dict = run_all_optimizations(X_train, y_train, hyperparameter_configs, n_trials=3)

[33m[W 2023-01-31 00:33:48,247][0m Trial 0 failed with parameters: {'num_leaves': 615, 'max_depth': 14, 'min_data_in_leaf': 7900, 'min_gain_to_split': 0.26618813263057983, 'n_components': 10} because of the following error: TypeError("EnetSPCA.__init__() got an unexpected keyword argument 'n_components'").[0m
Traceback (most recent call last):
  File "/Users/jacco/Documents/repos/vu-case-study-eds/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/Users/jacco/Documents/repos/vu-case-study-eds/src/helpers/config/hyperparameters.py", line 61, in <lambda>
    lambda trial: self._objective(
  File "/Users/jacco/Documents/repos/vu-case-study-eds/src/helpers/config/hyperparameters.py", line 39, in _objective
    pca = get_pca_pipeline(**params.get("pca"))
  File "/Users/jacco/Documents/repos/vu-case-study-eds/src/helpers/helper_functions.py", line 142, in get_pca_pipeline
    "spca": EnetSPCA(
TypeError: EnetSPC

TypeError: EnetSPCA.__init__() got an unexpected keyword argument 'n_components'

In [None]:
# NOTE(review): disabled evaluation cell. It would fit a PCA + model pipeline on
# the training split and score it on the held-out split. `cfg` and `best_params`
# are not defined at notebook level in the visible cells, so it cannot run as
# written; any output shown beneath this cell is presumably stale - TODO confirm.
# pipe = Pipeline([
#     ("pca", get_pca_pipeline(cfg.get_params()["pca"]["method"], **best_params)),
#     ("model", get_model(cfg.get_model(), **cfg.get_params()["static"], **best_params)),
# ])

# pipe.fit(X_train, y_train)
# pipe.score(X_test, y_test)

1.0