# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Set up the `config.ini` file
- Install necessary packages
- Download and unpack data



In [1]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA
from helpers.config.hyperparameters import LGBMHyperparameterConfig, LRHyperparameterConfig, OptunaOptimzation


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. An empty result therefore means config.ini was
# not found relative to the current working directory. Fail here with a clear
# message instead of an opaque KeyError on config['PATH'] below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini not found in {os.getcwd()!r}. Create it (see Setup), or "
        "restart the kernel if this cell already changed the working directory."
    )

# Switch the working directory to the configured project root.
# NOTE(review): the other PATH entries are presumably relative to ROOT_DIR — confirm.
os.chdir(config['PATH']['ROOT_DIR'])

In [3]:
# Load the raw train/test tables and the actuals file listed in the PATH section
path_section = config['PATH']
raw_train = pd.read_csv(path_section['RAW_TRAIN_DATA'])
raw_test = pd.read_csv(path_section['RAW_TEST_DATA'])
actuals = pd.read_csv(path_section['ACTUALS'])

# Integer run parameters from the PARAMS section
params_section = 'PARAMS'
SEED = config.getint(params_section, 'SEED')
N_COMPONENTS = config.getint(params_section, 'N_COMPONENTS')

### Set up preprocessing, PCA, and SparsePCA

In [4]:
preprocessing_steps = [
    # Step 0: remove columns that must not reach the model.
    #   - "cancer" is the target column (it is read back as y further down),
    #     so it is dropped from the feature matrix.
    #   - DropConstantFeatures with tol=0.98 also removes quasi-constant
    #     columns, i.e. those dominated by a single value (see the
    #     feature_engine docs for the exact threshold semantics).
    #   No duplicate-feature removal is applied in this pipeline.
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    ('drop_constant', DropConstantFeatures(tol=0.98)),

    # Step 1: standardise every remaining feature. PCA maximises variance,
    # so unscaled features with large ranges would dominate the components.
    ('scaler', StandardScaler()),
]

preprocessing_pipe = Pipeline(preprocessing_steps)

### Optuna objects

In [5]:
# Bring both raw tables into an accessible format and attach the actuals
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# Pool the provided train and test sets, then re-split them 80/20 with a fixed seed
full_df = pd.concat([train, test])
train80, test20 = train_test_split(full_df, test_size=0.2, random_state=SEED)

# Features: fit the preprocessing on the 80% part only, then apply the
# already-fitted pipeline to the 20% hold-out
X_train = preprocessing_pipe.fit_transform(train80)
X_test = preprocessing_pipe.transform(test20)

# Targets: the "cancer" column of each part
y_train = train80['cancer']
y_test = test20['cancer']

# Hyperparameter Optimization

## Logistic Regression

In [6]:
# `model` holds the estimator *class*, not an instance; it is instantiated
# later with model() when the final pipeline is built.
model = LogisticRegression
cfg = LRHyperparameterConfig(model=model)

# Hyperparameter search on the preprocessed training split only.
# NOTE(review): n_trials=3 is presumably the number of Optuna trials, which
# looks like a smoke-test setting — confirm and raise it for a real search.
optimizer = OptunaOptimzation(
    X_train,
    y_train,
    n_trials=3,
    hyperparameter_config=cfg,
    name="LR",
)

# Last expression of the cell: the recorded output shows an optuna Study object.
optimizer.run()



<optuna.study.study.Study at 0x1770ab7c0>

In [7]:
# Take the study from the optimizer and keep the parameters of its best trial
# (Optuna's Study.best_params); they are unpacked into the final pipeline below.
study = optimizer.study
best_params = study.best_params

In [8]:
# Build the dimensionality-reduction step from the configured PCA method,
# passing the tuned parameters through.
pca_step = get_pca_pipeline(cfg.get_params()["pca"]["method"], **best_params)

# Instantiate the estimator and configure it with the static settings from
# the config plus the tuned parameters.
model_step = get_model(model(), **cfg.get_params()["static"], **best_params)

pipe = Pipeline([
    ("pca", pca_step),
    ("model", model_step),
])

# Fit on the training split, then score on the hold-out.
# Pipeline.score delegates to the final step's score().
# NOTE(review): for a classifier this is typically mean accuracy — confirm
# what get_model returns before interpreting the number.
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



1.0