# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Setup config.ini file
- Install necessary packages
- Download and unpack data



In [11]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
from joblib import dump, load
import datetime as dt
from tqdm import tqdm
import json


# import sweetviz
import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# helper functions
from helpers.helper_functions import transform_data, add_actuals, get_pca_pipeline, get_model
from helpers.helper_classes import AddFeatureNames, GeneSPCA, EnetSPCA
from helpers.config.hyperparameters import OptunaOptimzation
from helpers.config.hyperparameters import PCA_LGBM_CFG, SPCA_LGBM_CFG, GSPCA_LGBM_CFG, PCA_LR_CFG, SPCA_LR_CFG, GSPCA_LR_CFG


# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

In [12]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing file fails with a clear
# message instead of a bare KeyError on the first section lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"Could not read 'config.ini' from {os.getcwd()}; "
        "create it as described in the Setup section."
    )

# NOTE(review): this changes the process-wide working directory, and
# 'config.ini' above is resolved relative to the current directory, so
# re-running this cell looks for the file inside ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

DATA_DIR = config['PATH']['DATA_DIR']
# DATASETS is stored in the ini file as a JSON-encoded value and decoded here.
DATASETS = json.loads(config.get('PARAMS', 'DATASETS'))
PIPE_DIR = config["LOGGING"]["PIPE_DIR"]
LOG_DIR = config["LOGGING"]["LOG_DIR"]
# Filename prefix of the fitted-pipelines artifact that is loaded from PIPE_DIR.
TIMESTAMP_PIPE = '20230201_0938_'

In [13]:
# Load data library
# NOTE(review): joblib.load unpickles arbitrary Python objects; only point
# these paths at artifacts produced by this project.
# Both paths are built with os.path.join so they are assembled the same way.
data = load(os.path.join(DATA_DIR, 'microarray-data-dict.lib'))
fitted_pipelines = load(os.path.join(PIPE_DIR, TIMESTAMP_PIPE + 'fitted-pipelines.lib'))

In [14]:
# Empty results table: one row per dataset and one column per
# (PCA variant, model) combination; cells are filled in by the scoring loop.
multicolumn = pd.MultiIndex.from_product(
    [
        ['PCA', 'SPCA', 'GSPCA'],  # first column level
        ['LGBM', 'LR'],            # second column level
    ]
)
res = pd.DataFrame(columns=multicolumn, index=DATASETS)
res

Unnamed: 0_level_0,PCA,PCA,SPCA,SPCA,GSPCA,GSPCA
Unnamed: 0_level_1,LGBM,LR,LGBM,LR,LGBM,LR
sorlie,,,,,,
khan,,,,,,
christensen,,,,,,
alon,,,,,,
gravier,,,,,,


In [15]:
def load_data(dataset, type="none", test=True):
    """Fetch the train (and optionally test) split of a dataset from ``data``.

    Parameters
    ----------
    dataset : hashable
        Top-level key into the module-level ``data`` dictionary.
    type : hashable, default "none"
        Second-level key under ``data[dataset]`` selecting which variant of
        the data to return. This argument used to be ignored ("none" was
        hard-coded); it is now honoured. The default keeps the old behaviour.
    test : bool, default True
        When truthy, the test split is returned as well.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``test`` is truthy,
        otherwise ``(X_train, y_train)``.

    Raises
    ------
    KeyError
        If ``dataset``, ``type`` or one of the required split keys is missing.
    """
    # `type` shadows the builtin inside this function; the name is kept so
    # that callers passing it by keyword continue to work.
    split = data[dataset][type]
    X_train = split["X_train"]
    y_train = split["y_train"]
    if not test:
        return X_train, y_train
    return X_train, y_train, split["X_test"], split["y_test"]

In [9]:
# Evaluate each fitted pipeline on its dataset's test split and store the
# result in the matching (pca_name, model_name) column of `res`.
for dataset in DATASETS:
    X_train, y_train, X_test, y_test = load_data(dataset)
    pipes = fitted_pipelines[dataset]

    for name in pipes:
        pipe = pipes[name]
        # Pipeline keys look like "<pca>_<model>"; the two parts address the
        # two levels of the column MultiIndex.
        pca_name, model_name = name.split("_")

        # NOTE(review): the metric is whatever the pipeline's `score` method
        # returns -- presumably mean accuracy for a classifier; confirm.
        score = pipe.score(X_test, y_test)
        res.loc[dataset, (pca_name, model_name)] = score

In [10]:
# Write the results table as LaTeX. The target folder is created first:
# writing to a path whose parent directory is missing would otherwise fail.
latex_dir = os.path.join(LOG_DIR, 'latex_tables')
os.makedirs(latex_dir, exist_ok=True)
res.to_latex(os.path.join(latex_dir, 'classification_results.tex'))

  res.to_latex(os.path.join(LOG_DIR, 'latex_tables', 'classification_results.tex'))


In [11]:
# Display the results table as filled in by the scoring loop.
res

Unnamed: 0_level_0,PCA,PCA,SPCA,SPCA,GSPCA,GSPCA
Unnamed: 0_level_1,LGBM,LR,LGBM,LR,LGBM,LR
sorlie,0.793103,0.862069,0.724138,0.862069,0.827586,0.793103
khan,,,,,,
gravier,,,,,,
christensen,,,,,,
alon,,,,,,


In [16]:
# Sanity check of the splits: for every dataset print the distinct labels and
# the number of samples, first for the training split, then for the test split.
for dataset in DATASETS:
    X_train, y_train, X_test, y_test = load_data(dataset)

    print(f"Dataset: {dataset}: {np.unique(y_train)} Samples: {len(y_train)}")
    # `percentage` is the share of the test split in train + test combined.
    # Fixed: "Sampels" typo and a stray ")" at the end of the message.
    print(f"Dataset: {dataset}: {np.unique(y_test)} Samples: {len(y_test)} percentage: {len(y_test)/(len(y_train) + len(y_test))}")
    

Dataset: sorlie: [1. 2. 3. 4. 5.] Samples: 56
Dataset: sorlie: [1. 2. 3. 4. 5.] Sampels: 29 percentage: 0.3411764705882353)
Dataset: khan: [1. 2. 3. 4.] Samples: 42
Dataset: khan: [1. 2. 3. 4.] Sampels: 21 percentage: 0.3333333333333333)
Dataset: christensen: [1. 2. 3.] Samples: 145
Dataset: christensen: [1. 2. 3.] Sampels: 72 percentage: 0.3317972350230415)
Dataset: alon: [1. 2.] Samples: 41
Dataset: alon: [1. 2.] Sampels: 21 percentage: 0.3387096774193548)
Dataset: gravier: [1. 2.] Samples: 112
Dataset: gravier: [1. 2.] Sampels: 56 percentage: 0.3333333333333333)


In [17]:
# Instantiate a LightGBM classifier with default settings, then set
# n_estimators to 200 on it.
model = LGBMClassifier()
# NOTE(review): set_params presumably returns the estimator itself
# (scikit-learn convention), so `m` and `model` would be the same object -- confirm.
m = model.set_params(n_estimators=200)