# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Setup config.ini file
- Install necessary packages
- Download and unpack data



In [1]:
# import packages
import pandas as pd
import numpy as np
import configparser
import os
import sweetviz

# helper functions
from helpers.helper_functions import transform_data, add_actuals

# sklearn
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
# The path is relative to the current working directory of the kernel.
CONFIG_PATH = 'src/config.ini'

config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Without this check a missing config only shows
# up on the next line as an opaque KeyError: 'PATH'.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_PATH}' from working directory '{os.getcwd()}'."
    )

# All remaining paths in the notebook are resolved relative to ROOT_DIR.
os.chdir(config['PATH']['ROOT_DIR'])

In [12]:
# Read parameters
# Seed used by the stochastic estimators further down the notebook.
SEED = int(config.get('PARAMS', 'SEED'))

# Read data
# The three CSV locations come from the [PATH] section of config.ini and are
# loaded in the order: training set, test set, actuals.
raw_train, raw_test, actuals = (
    pd.read_csv(config['PATH'][path_key])
    for path_key in ('RAW_TRAIN_DATA', 'RAW_TEST_DATA', 'ACTUALS')
)

### Data exploration

In [4]:
# Check total missing values
# isna().sum() counts missing cells per column; the second sum() adds the
# per-column counts into a single total for the whole frame.
n_missing_train = raw_train.isna().sum().sum()
n_missing_test = raw_test.isna().sum().sum()
n_missing_actuals = actuals.isna().sum().sum()

print(f"Missing values in training set: {n_missing_train}")
print(f"Missing values in test set: {n_missing_test}")
print(f"Missing values in actuals: {n_missing_actuals}")

Missing values in training set: 0
Missing values in test set: 0
Missing values in actuals: 0


### Preprocessing

In [5]:
# Transform data to an accessible format and add actuals
# Both splits go through the same two project helpers: transform_data first,
# then add_actuals with the shared `actuals` table.
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# get target variable
# "cancer" is the column used as the target for both splits.
y_train, y_test = train["cancer"], test["cancer"]

In [10]:
preprocessing_pipe = Pipeline([
    # Step 0:
    #   Drop the target column and (quasi-)constant features.
    #   Note: no duplicate-feature step is applied here.
    ('drop_features', DropFeatures(features_to_drop=["cancer"])),
    ('drop_constant', DropConstantFeatures(tol=0.98)),

    # Step 1:
    #   Apply scaling to data as it is a requirement for the variance
    #   maximization procedure of PCA
    ('scaler', StandardScaler()),
])

# Fit on the training set only, then apply the *fitted* steps to the test set.
# Re-fitting on the test set would leak test statistics into the scaler and
# could drop a different set of columns than was dropped for the training set.
X_train = preprocessing_pipe.fit_transform(train)
X_test = preprocessing_pipe.transform(test)

### PCA and SparsePCA

In [13]:
pca_pipe = Pipeline([
    # Step 0 & 1:
    #   Preprocessing. This is the same preprocessing_pipe object (not a copy),
    #   so fitting pca_pipe also re-fits it.
    ('preprocessing', preprocessing_pipe),
    # Step 2:
    #   Apply PCA
    ('pca', PCA(n_components=20, random_state=SEED)),
])

# Learn the preprocessing and the principal components on the training set,
# then project the test set onto those same components. Calling fit_transform
# on the test set would learn a different basis, making the train and test
# components incomparable.
X_train_pca = pca_pipe.fit_transform(train)
X_test_pca = pca_pipe.transform(test)

In [14]:
spca_pipe = Pipeline([
    # Step 0 & 1:
    #   Preprocessing. This is the same preprocessing_pipe object (not a copy),
    #   so fitting spca_pipe also re-fits it.
    ('preprocessing', preprocessing_pipe),
    # Step 2:
    #   Apply SPCA (the step keeps the name 'pca' so existing lookups by step
    #   name continue to work)
    ('pca', SparsePCA(n_components=20, random_state=SEED)),
])

# Learn the sparse components on the training set, then project the test set
# with the fitted SparsePCA pipeline. The result is stored in X_test_spca;
# previously this line re-ran pca_pipe and overwrote X_test_pca instead.
X_train_spca = spca_pipe.fit_transform(train)
X_test_spca = spca_pipe.transform(test)