# VU Econometrics and Data Science: Case Study

In [2]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os

# import sweetviz

import matplotlib.pyplot as plt

# import optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# LightGBM
from lightgbm import LGBMClassifier

from joblib import dump, load


# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing file fails with a
# clear message instead of a confusing KeyError on the lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini could not be read from {os.getcwd()!r}; this cell changes "
        "the working directory, so re-running it looks for config.ini in ROOT_DIR"
    )

# Work relative to the configured project root from here on
os.chdir(config['PATH']['ROOT_DIR'])

In [5]:
# Load the dictionary of datasets stored under the configured data directory.
# Later cells index it as data[author][transform][split].
data_path = f"{config['PATH']['DATA_DIR']}/microarray-data-dict.lib"
data = load(data_path)

# Integer parameters taken from the [PARAMS] section of the config
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

In [9]:
# Report the total number of missing cells per dataset
for author, transforms in data.items():
    print(f"Author: {author}")
    # Add up the NaN counts of every frame stored under the 'none' entry
    n_missing = sum(frame.isna().sum().sum() for frame in transforms['none'].values())
    print(f"Missing values: {n_missing}")

Author: yeoh
Missing values: 0
Author: nakayama
Missing values: 0
Author: golub
Missing values: 0
Author: khan
Missing values: 0
Author: west
Missing values: 0
Author: alon
Missing values: 0
Author: subramanian
Missing values: 0
Author: burczynski
Missing values: 0
Author: chin
Missing values: 0
Author: borovecki
Missing values: 0
Author: shipp
Missing values: 0
Author: tian
Missing values: 0
Author: gordon
Missing values: 0
Author: chiaretti
Missing values: 0
Author: sorlie
Missing values: 0
Author: chowdary
Missing values: 0
Author: sun
Missing values: 0
Author: pomeroy
Missing values: 0
Author: gravier
Missing values: 0
Author: su
Missing values: 0
Author: christensen
Missing values: 0
Author: singh
Missing values: 0


In [67]:
# Summarise each dataset: total sample count, feature count and number of classes.
for key, v in data.items():
    print(f"Author: {key}")
    raw = v['none']  # presumably the untransformed train/test split
    # Count the rows of the feature frames directly. Summing every entry and
    # halving only works while 'none' holds exactly matching X/y pairs.
    samples = raw['X_train'].shape[0] + raw['X_test'].shape[0]
    features = raw['X_train'].shape[1]
    # Classes are the distinct labels seen in either split (labels are in column 0).
    classes = np.union1d(raw['y_train'][0].unique(), raw['y_test'][0].unique())
    print(f"Samples: {int(samples)}, features: {features}, classes: {len(classes)}")


Author: yeoh
Samples: 248, features: 12625, classes: 6
Author: nakayama
Samples: 105, features: 22283, classes: 10
Author: golub
Samples: 72, features: 7129, classes: 2
Author: khan
Samples: 63, features: 2308, classes: 4
Author: west
Samples: 49, features: 7129, classes: 2
Author: alon
Samples: 62, features: 2000, classes: 2
Author: subramanian
Samples: 50, features: 10100, classes: 2
Author: burczynski
Samples: 127, features: 22283, classes: 3
Author: chin
Samples: 118, features: 22215, classes: 2
Author: borovecki
Samples: 31, features: 22283, classes: 2
Author: shipp
Samples: 77, features: 7129, classes: 2
Author: tian
Samples: 173, features: 12625, classes: 2
Author: gordon
Samples: 181, features: 12533, classes: 2
Author: chiaretti
Samples: 128, features: 12625, classes: 6
Author: sorlie
Samples: 85, features: 456, classes: 5
Author: chowdary
Samples: 104, features: 22283, classes: 2
Author: sun
Samples: 180, features: 54613, classes: 4
Author: pomeroy
Samples: 60, features: 7128

In [29]:
# List the class labels present in each dataset's training split.
for key, v in data.items():
    print(f"Author: {key}")
    # Look up the training labels directly (column 0 of the y_train frame)
    # rather than scanning every entry of the 'none' dict to find them.
    print(sorted(v['none']['y_train'][0].unique()))

Author: yeoh
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Author: nakayama
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
Author: golub
[1.0, 2.0]
Author: khan
[1.0, 2.0, 3.0, 4.0]
Author: west
[1.0, 2.0]
Author: alon
[1.0, 2.0]
Author: subramanian
[1.0, 2.0]
Author: burczynski
[1.0, 2.0, 3.0]
Author: chin
[1.0, 2.0]
Author: borovecki
[1.0, 2.0]
Author: shipp
[1.0, 2.0]
Author: tian
[1.0, 2.0]
Author: gordon
[1.0, 2.0]
Author: chiaretti
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Author: sorlie
[1.0, 2.0, 3.0, 4.0, 5.0]
Author: chowdary
[1.0, 2.0]
Author: sun
[1.0, 2.0, 3.0, 4.0]
Author: pomeroy
[1.0, 2.0]
Author: gravier
[1.0, 2.0]
Author: su
[1.0, 2.0, 3.0, 4.0]
Author: christensen
[1.0, 2.0, 3.0]
Author: singh
[1.0, 2.0]


In [50]:
# Quick sanity check: default logistic regression on the khan dataset,
# comparing the SPCA and PCA representations.
for transform_name in ('spca', 'pca'):
    khan_split = data['khan'][transform_name]
    clf = LogisticRegression(random_state=SEED)
    clf.fit(khan_split['X_train'], khan_split['y_train'][0])
    score = clf.score(khan_split['X_test'], khan_split['y_test'][0])
    # The label names the classifier actually fitted (logistic regression);
    # the name is padded so both scores line up in the output.
    print(f"LR score {transform_name + ':':<5} {score}")

KNN score spca: 0.9523809523809523
KNN score pca:  0.9523809523809523


In [43]:
from sklearn.utils.multiclass import type_of_target

# Ask scikit-learn how it categorises the khan SPCA training labels.
khan_spca_labels = data['khan']['spca']['y_train'][0]
type_of_target(khan_spca_labels)

'multiclass'

## Evaluation on all datasets and transformations with default classifier settings
Currently only accuracy is reported:

In [60]:
from sklearn.base import clone

# Default-configured classifiers to benchmark. Each one is cloned before
# fitting, so these template instances stay unfitted.
clf_dict = {
    'lr': LogisticRegression(random_state=SEED, max_iter=1000),
    'lgbm': LGBMClassifier(random_state=SEED),
    'svc': SVC(random_state=SEED),
    'knn': KNeighborsClassifier(),
}

# Fit every classifier on every transformation of every dataset and print the
# test-set accuracy.
for key, dataset in data.items():
    print("-" * 80)
    print(f"Author: {key}")
    print("-" * 80)
    for clf_name, clf_obj in clf_dict.items():
        print(f"classifier: {clf_name}")
        print('-' * 30)
        # The loop variable must not be named `transform_data`: that name is the
        # helper imported from helpers.helper_functions, and rebinding it here
        # would break any later call to the helper in the same kernel.
        for transform_name, splits in dataset.items():
            # Fresh unfitted copy per fit so no state carries between runs
            clf = clone(clf_obj)
            clf.fit(splits['X_train'], splits['y_train'][0])
            acc = clf.score(splits['X_test'], splits['y_test'][0])
            print(f"{transform_name}, accuracy: {acc}")
        

--------------------------------------------------------------------------------
Author: yeoh
--------------------------------------------------------------------------------
classifier: lr
------------------------------
none, accuracy: 0.9512195121951219
pca, accuracy: 0.926829268292683
spca, accuracy: 0.9146341463414634
classifier: lgbm
------------------------------
none, accuracy: 0.8902439024390244
pca, accuracy: 0.8414634146341463
spca, accuracy: 0.9146341463414634
classifier: svc
------------------------------
none, accuracy: 0.8292682926829268
pca, accuracy: 0.8536585365853658
spca, accuracy: 0.8780487804878049
classifier: knn
------------------------------
none, accuracy: 0.7682926829268293
pca, accuracy: 0.7682926829268293
spca, accuracy: 0.7926829268292683
--------------------------------------------------------------------------------
Author: nakayama
--------------------------------------------------------------------------------
classifier: lr
----------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


spca, accuracy: 0.6
classifier: lgbm
------------------------------
none, accuracy: 0.5428571428571428
pca, accuracy: 0.5714285714285714
spca, accuracy: 0.5428571428571428
classifier: svc
------------------------------
none, accuracy: 0.42857142857142855
pca, accuracy: 0.5714285714285714
spca, accuracy: 0.6285714285714286
classifier: knn
------------------------------
none, accuracy: 0.5142857142857142
pca, accuracy: 0.5428571428571428
spca, accuracy: 0.5428571428571428
--------------------------------------------------------------------------------
Author: golub
--------------------------------------------------------------------------------
classifier: lr
------------------------------
none, accuracy: 0.9583333333333334
pca, accuracy: 0.9583333333333334
spca, accuracy: 0.9583333333333334
classifier: lgbm
------------------------------
none, accuracy: 0.9583333333333334
pca, accuracy: 0.875
spca, accuracy: 0.9583333333333334
classifier: svc
------------------------------
none, accurac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


spca, accuracy: 0.5813953488372093
classifier: lgbm
------------------------------
none, accuracy: 0.7674418604651163
pca, accuracy: 0.6511627906976745
spca, accuracy: 0.6976744186046512
classifier: svc
------------------------------
none, accuracy: 0.627906976744186
pca, accuracy: 0.6511627906976745
spca, accuracy: 0.6046511627906976
classifier: knn
------------------------------
none, accuracy: 0.5116279069767442
pca, accuracy: 0.4883720930232558
spca, accuracy: 0.4418604651162791
--------------------------------------------------------------------------------
Author: sorlie
--------------------------------------------------------------------------------
classifier: lr
------------------------------
none, accuracy: 0.896551724137931
pca, accuracy: 0.8275862068965517
spca, accuracy: 0.8275862068965517
classifier: lgbm
------------------------------
none, accuracy: 0.7241379310344828
pca, accuracy: 0.8275862068965517
spca, accuracy: 0.8275862068965517
classifier: svc
------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


pca, accuracy: 0.6833333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


spca, accuracy: 0.7
classifier: lgbm
------------------------------
none, accuracy: 0.7333333333333333
pca, accuracy: 0.6333333333333333
spca, accuracy: 0.7166666666666667
classifier: svc
------------------------------
none, accuracy: 0.6833333333333333
pca, accuracy: 0.7166666666666667
spca, accuracy: 0.7166666666666667
classifier: knn
------------------------------
none, accuracy: 0.65
pca, accuracy: 0.6666666666666666
spca, accuracy: 0.6666666666666666
--------------------------------------------------------------------------------
Author: pomeroy
--------------------------------------------------------------------------------
classifier: lr
------------------------------
none, accuracy: 0.55
pca, accuracy: 0.7
spca, accuracy: 0.65
classifier: lgbm
------------------------------
none, accuracy: 0.65
pca, accuracy: 0.6
spca, accuracy: 0.6
classifier: svc
------------------------------
none, accuracy: 0.6
pca, accuracy: 0.55
spca, accuracy: 0.55
classifier: knn
-----------------------