# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Set up the config.ini file
- Install necessary packages
- Download and unpack data



In [2]:
# import packages
# standard library
import configparser
import os

# third-party
import numpy as np
import pandas as pd
import sweetviz

# sklearn
from sklearn.base import clone
from sklearn.decomposition import PCA, SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures

# helper functions
from helpers.helper_functions import transform_data, add_actuals
from helpers.helper_classes import AddFeatureNames

In [3]:
# Read config.ini file
# The path is relative to the current working directory. Because this cell
# changes the working directory at the end, re-running it from the new
# directory may no longer find the file.
CONFIG_PATH = 'src/config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means nothing was loaded. Fail
# here with a clear message instead of a later KeyError on config['PATH'].
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_PATH}' from working directory '{os.getcwd()}'"
    )

# All paths in the config are resolved relative to the project root.
os.chdir(config['PATH']['ROOT_DIR'])

In [4]:
# Load the three CSV files whose locations are listed in the PATH section
path_cfg = config['PATH']
raw_train, raw_test, actuals = (
    pd.read_csv(path_cfg[key])
    for key in ('RAW_TRAIN_DATA', 'RAW_TEST_DATA', 'ACTUALS')
)

# Integer parameters from the PARAMS section
SEED = config.getint('PARAMS', 'SEED')
N_COMPONENTS = config.getint('PARAMS', 'N_COMPONENTS')

### Data exploration

In [5]:
# Report the total number of missing cells in each loaded frame
for label, frame in (
    ("training set", raw_train),
    ("test set", raw_test),
    ("actuals", actuals),
):
    n_missing = frame.isna().sum().sum()
    print(f"Missing values in {label}: {n_missing}")

Missing values in training set: 0
Missing values in test set: 0
Missing values in actuals: 0


### Preprocessing

In [6]:
# Reshape each raw frame with transform_data, then attach the labels from
# `actuals` (the exact reshaping is defined in helpers.helper_functions).
train = add_actuals(transform_data(raw_train), actuals)
test = add_actuals(transform_data(raw_test), actuals)

# Target variable: the "cancer" column of each prepared frame
y_train, y_test = train["cancer"], test["cancer"]

In [7]:
# Preprocessing steps applied before the decomposition.
# NOTE(review): DropDuplicateFeatures is imported, but no duplicate-dropping
# step is configured here; only the target and (quasi-)constant columns go.
preprocessing_steps = [
    # Step 0:
        # Drop the target column and (quasi-)constant features (tol=0.98)
        ('drop_features', DropFeatures(features_to_drop=["cancer"])),
        ('drop_constant', DropConstantFeatures(tol=0.98)),

    # Step 1:
        # Apply scaling to data as it is a requirement for the variance maximization procedure of PCA
        ('scaler', StandardScaler()),
]

# The pipeline is not fitted in this cell; it is used as a step of pca_pipe.
preprocessing_pipe = Pipeline(steps=preprocessing_steps)

### PCA and SparsePCA

In [14]:
# Both decomposition pipelines follow the same layout:
#   preprocessing -> decomposition -> AddFeatureNames
# AddFeatureNames is a project helper; presumably it labels the output
# columns with the given prefix (verify in helpers.helper_classes).

pca_pipe = Pipeline([
    # Step 0 & 1:
        # Preprocessing
        ('preprocessing', preprocessing_pipe),
    # Step 2:
        # Apply PCA
        ('pca', PCA(n_components=N_COMPONENTS, random_state=SEED)),
    # Step 3:
        # Add feature names
        ('add_features_names', AddFeatureNames(prefix="cmpnt_"))
])

spca_pipe = Pipeline([
    # Step 0 & 1:
        # Preprocessing. Without this step SparsePCA would receive the raw
        # frame, i.e. unscaled features plus the "cancer" target column.
        # clone() gives this pipeline its own unfitted copy, so fitting one
        # pipeline never overwrites the preprocessing state of the other.
        ('preprocessing', clone(preprocessing_pipe)),
    # Step 2:
        # Apply SPCA (the step keeps the name 'pca' so existing lookups by
        # step name continue to work)
        ('pca', SparsePCA(n_components=N_COMPONENTS, random_state=SEED, alpha=10, max_iter=50)),
    # Step 3:
        # Add feature names
        ('add_features_names', AddFeatureNames(prefix="cmpnt_"))
])

In [15]:
# Fit every step on the training data only.
X_train_pca = pca_pipe.fit_transform(train)
X_train_spca = spca_pipe.fit_transform(train)

# Project the test data with the pipelines fitted above. Calling
# fit_transform on the test set would re-estimate the dropped columns, the
# scaling and the components from test data, so the test components would not
# be comparable with the training components.
X_test_pca = pca_pipe.transform(test)
X_test_spca = spca_pipe.transform(test)

Unnamed: 0,cmpnt_0,cmpnt_1,cmpnt_2,cmpnt_3,cmpnt_4,cmpnt_5,cmpnt_6,cmpnt_7,cmpnt_8,cmpnt_9,cmpnt_10,cmpnt_11,cmpnt_12,cmpnt_13,cmpnt_14,cmpnt_15,cmpnt_16,cmpnt_17,cmpnt_18,cmpnt_19
0,18.544801,5.82687,-19.443323,-11.275064,5.303888,19.096058,-9.46335,0.898541,-8.573092,-7.674149,-12.406816,9.995095,11.381889,-10.674955,-10.44054,22.102128,33.897149,28.674269,1.289886,6.629434
1,-7.341402,10.008684,11.796506,3.926174,-13.995669,-18.637315,4.694233,8.825182,1.340178,-11.829394,5.382486,2.508407,24.481645,-10.880732,-14.362732,-18.493988,-16.759757,9.510449,6.829374,-8.784589
2,52.852158,11.390629,-29.548822,-38.801693,-22.872324,-9.453886,-22.870619,20.96875,4.007808,-5.265807,3.831523,13.609795,-4.88485,-18.376255,-5.670233,28.947614,-17.926458,-24.685422,-2.722255,-15.013396
3,13.688532,-6.246782,-22.623827,-3.969949,-1.357629,13.095538,29.529216,-10.502208,-12.481158,21.905405,-11.474461,5.58018,-3.6409,-1.676164,-3.998947,1.130807,5.096364,-5.509603,-3.968852,4.951907
4,-36.790037,32.79985,5.247333,-2.792859,-6.608762,10.701026,2.392849,-4.497653,-9.658597,1.721688,-7.219925,19.56038,-6.015493,8.701053,3.318299,-3.777469,-6.481432,-1.41895,-4.303865,-4.012877
5,-9.64086,-20.992184,-22.472341,-11.614369,-25.308581,-11.162471,13.615825,-4.065295,-7.248612,-1.619774,-2.438451,-19.702704,-4.534137,-7.638221,-13.030895,-0.641578,-5.366304,-0.952896,-1.829491,10.356335
6,21.808873,-16.258285,-25.14708,-5.261994,1.392175,16.220026,22.148409,-8.804109,-11.379484,22.686653,-7.229026,-2.769615,-12.997735,-4.793064,-4.527577,-13.65976,7.291552,-11.552895,26.313034,-14.412134
7,56.000387,-21.416516,-35.862099,-9.368432,33.817848,3.706168,23.476598,-21.621031,36.433822,-39.125193,-13.451277,-4.907473,15.788694,18.627011,13.613555,-3.44716,-5.523232,-10.202432,-4.026228,0.794914
8,21.618693,30.854494,-8.311056,-36.349749,-40.065881,-23.872468,-4.405527,2.596107,6.138127,1.491745,0.519398,-9.006292,-7.491069,32.230413,13.924844,-6.514209,25.902651,12.328486,9.466665,-5.082247
9,-22.874989,-8.697872,-3.905816,1.85502,-13.127886,-9.818199,-8.310402,11.531171,-7.305953,-4.220144,9.937371,-8.805238,-0.412762,10.000722,3.706028,1.997551,-3.230501,-13.534038,1.928945,7.101667
