# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Setup config.ini file
- Install necessary packages
- Download and unpack data



In [10]:
# import utility modules
import pandas as pd
import numpy as np
import configparser
import os
# import sweetviz

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# LightGBM
from lightgbm import LGBMClassifier

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

In [2]:
# Read config.ini file
CONFIG_PATH = 'src/config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config was not found
# (e.g. the notebook was started from, or already chdir'ed into, another directory).
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read config file '{CONFIG_PATH}' (current directory: {os.getcwd()})"
    )

# All remaining paths in the config are resolved relative to the project root
os.chdir(config['PATH']['ROOT_DIR'])

In [3]:
# Load the raw train/test sets and the actual labels from the configured CSV paths
paths = config['PATH']
raw_train = pd.read_csv(paths['RAW_TRAIN_DATA'])
raw_test = pd.read_csv(paths['RAW_TEST_DATA'])
actuals = pd.read_csv(paths['ACTUALS'])

# Load the integer parameters shared by all pipelines
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')
SEED = config.getint('PARAMS', 'SEED')

### Data exploration

In [4]:
# Count missing entries over every cell of each dataset (per-column sums, then summed)
n_missing_train = raw_train.isna().sum().sum()
n_missing_test = raw_test.isna().sum().sum()
n_missing_actuals = actuals.isna().sum().sum()

print(f"Missing values in training set: {n_missing_train}")
print(f"Missing values in test set: {n_missing_test}")
print(f"Missing values in actuals: {n_missing_actuals}")

Missing values in training set: 0
Missing values in test set: 0
Missing values in actuals: 0


### Preprocessing

In [5]:
# Bring each raw frame into an accessible format, then attach the actual labels
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# Target variable for each split
y_train, y_test = train["cancer"], test["cancer"]

In [6]:
# Shared preprocessing: drop the target, drop (quasi-)constant columns, then scale
preprocessing_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
])

# Example usage (kept disabled). NOTE(review): the test set should go through
# `transform`, not `fit_transform`, so it reuses what was fitted on the training set.
# X_train = preprocessing_pipe.fit_transform(train)
# X_test = preprocessing_pipe.transform(test)

### PCA and SparsePCA

In [7]:
# PCA feature pipeline: drop target -> drop (quasi-)constant columns -> scale -> PCA -> name outputs
pca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS principal components
    ('pca', PCA(n_components=N_COMPONENTS, random_state=SEED)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
])

# Sparse PCA feature pipeline: drop target -> drop (quasi-)constant columns -> scale -> SPCA -> name outputs
spca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS sparse components
    # NOTE(review): max_iter=50 here, while the classifier pipelines use max_iter=100 - confirm this is intended
    ('spca', SparsePCA(n_components=N_COMPONENTS, random_state=SEED, alpha=10, max_iter=50)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
])

In [8]:
# PCA: fit on the training set only, then apply the fitted pipeline to the test set
X_train_pca = pca_pipe.fit_transform(train)
X_test_pca = pca_pipe.transform(test)

# Sparse PCA: same procedure
X_train_spca = spca_pipe.fit_transform(train)
X_test_spca = spca_pipe.transform(test)

### Model fitting
#### Implemented models
- Logistic regression
- LightGBM (Gradient Tree Boosting)
- SVC (Support Vector Classifier)

#### Logistic Regression

In [12]:
# Logistic regression on PCA components
lr_pca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS principal components
    ('pca', PCA(n_components=N_COMPONENTS, random_state=SEED)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
    # Final estimator: logistic regression classifier
    ('logistic_regression', LogisticRegression(random_state=SEED)),
])

# Logistic regression on sparse PCA components
lr_spca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS sparse components
    ('spca', SparsePCA(n_components=N_COMPONENTS, random_state=SEED, alpha=10, max_iter=100)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
    # Final estimator: logistic regression classifier
    ('logistic_regression', LogisticRegression(random_state=SEED)),
])

In [14]:
# PCA + Logistic Regression: fit on train, report mean accuracy on the held-out test set
lr_pca_score = lr_pca_pipe.fit(train, y_train).score(test, y_test)
print(f"{'Score with Logistic Regression and PCA:':<50} {lr_pca_score}")

# SPCA + Logistic Regression: same procedure
lr_spca_score = lr_spca_pipe.fit(train, y_train).score(test, y_test)
print(f"{'Score with Logistic Regression and SPCA:':<50} {lr_spca_score}")

Score with Logistic Regression and PCA:            0.7941176470588235
Score with Logistic Regression and SPCA           : 0.5882352941176471


#### LightGBM (Gradient Tree Boosting)

In [16]:
# LightGBM classifier on PCA components
lgbm_pca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS principal components
    ('pca', PCA(n_components=N_COMPONENTS, random_state=SEED)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
    # Final estimator: LightGBM gradient-boosted tree classifier
    ('lgbm_classifier', LGBMClassifier(random_state=SEED)),
])

# LightGBM classifier on sparse PCA components
lgbm_spca_pipe = Pipeline(steps=[
    # Remove the target column so it is never used as a feature
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    # Remove constant / quasi-constant columns (tol=0.98)
    ('drop_constant', DropConstantFeatures(tol=0.98)),
    # Standardize the features; required by the variance maximization procedure of PCA
    ('scaler', StandardScaler()),
    # Reduce to N_COMPONENTS sparse components
    ('spca', SparsePCA(n_components=N_COMPONENTS, random_state=SEED, alpha=10, max_iter=100)),
    # Project helper; presumably labels the component columns with the "cmpnt_" prefix
    ('add_features_names', AddFeatureNames(prefix="cmpnt_")),
    # Final estimator: LightGBM gradient-boosted tree classifier
    ('lgbm_classifier', LGBMClassifier(random_state=SEED)),
])

In [19]:
# PCA + LGBM: fit on train, report mean accuracy on the held-out test set
lgbm_pca_score = lgbm_pca_pipe.fit(train, y_train).score(test, y_test)
print(f"{'Score with LGBM and PCA:':<50} {lgbm_pca_score}")

# SPCA + LGBM: same procedure
lgbm_spca_score = lgbm_spca_pipe.fit(train, y_train).score(test, y_test)
print(f"{'Score with LGBM and SPCA:':<50} {lgbm_spca_score}")

Score with LGBM and PCA:                           0.5882352941176471
Score with LGBM and SPCA                          : 0.5882352941176471
