# VU Econometrics and Data Science: Case Study
```
Author(s): Jacco Broere
```


### Setup
- Set up the config.ini file
- Install necessary packages
- Download and unpack data



In [1]:
# import packages
import pandas as pd
import numpy as np
import configparser
import os
import sweetviz

# helper functions
from helpers.helper_functions import transform_data, add_actuals

# sklearn
from sklearn.decomposition import PCA, SparsePCA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the config file was not found.
# Fail here with a clear message instead of a KeyError on config['PATH'] below.
if not config.read('src/config.ini'):
    raise FileNotFoundError(
        "Could not read 'src/config.ini' (resolved relative to "
        f"{os.getcwd()!r}); set up config.ini before running this notebook."
    )
# Switch the working directory to the configured project root; later cells
# write to relative locations such as 'output/...'.
# NOTE(review): 'src/config.ini' is itself a relative path, so re-running this
# cell after the chdir only works if ROOT_DIR also contains src/config.ini.
os.chdir(config['PATH']['ROOT_DIR'])

In [3]:
# Load the three CSV inputs whose locations are listed under [PATH] in the
# config: raw training data, raw test data and the actuals.
path_section = config['PATH']
raw_train, raw_test, actuals = (
    pd.read_csv(path_section[option])
    for option in ('RAW_TRAIN_DATA', 'RAW_TEST_DATA', 'ACTUALS')
)

### Data exploration

In [4]:
# Generate sweetviz reports on the raw train/test data (currently disabled)
# report_train = sweetviz.analyze(raw_train)
# report_test = sweetviz.analyze(raw_test)

In [5]:
# Write the raw-data sweetviz reports to HTML (currently disabled)
# report_train.show_html('output/sweetviz_train.html')
# report_test.show_html('output/sweetviz_test.html')

### Preprocessing

In [6]:
# Preprocess the raw training data: run it through transform_data, then pass
# the result to add_actuals together with the actuals frame.
train = (
    raw_train
    .pipe(transform_data)
    .pipe(add_actuals, actuals)
)

# Show the first three rows as the cell output
train.head(3)

Unnamed: 0_level_0,cancer,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,-214,-153,-58,88,-295,-558,199,-176,252,...,185,511,-125,389,-37,793,329,36,191,-37
2,1,-139,-73,-1,283,-264,-400,-330,-168,101,...,169,837,-36,442,-17,782,295,11,76,-14
3,1,-76,-49,-307,309,-376,-650,33,-367,206,...,315,1199,33,168,52,1138,777,41,228,-41


In [7]:
# Build a sweetviz report for the preprocessed training frame, with "cancer"
# as the target feature and pairwise analysis switched off.
analyze_options = {
    "source": train,
    "pairwise_analysis": "off",
    "target_feat": "cancer",
}
report = sweetviz.analyze(**analyze_options)

Feature: Z78285_f_at                         |██████████| [100%]   01:02 -> (00:00 left)  


In [8]:
# Write the sweetviz report to an HTML file.
report_path = "output/sweetviz_train.html"
# Create the target directory first so the cell also succeeds on a fresh
# checkout where 'output/' does not exist yet (no-op if it already exists).
os.makedirs(os.path.dirname(report_path), exist_ok=True)
report.show_html(report_path)

Report output/sweetviz_train.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
