In [1]:
import os
import sys

import yaml
import pandas as pd


# Initial configurations
config_filepath = 'config.yml'

# Switch to the parent directory (where `config.yml` is expected) only when the
# config file is not already reachable from the current directory. An
# unconditional `os.chdir('..')` climbs one more level every time this cell is
# re-run, which breaks every relative path used afterwards.
if not os.path.exists(config_filepath):
    os.chdir('..')

# Make the working directory importable without stacking duplicate entries on
# `sys.path` when the cell is executed more than once.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

# `safe_load` only builds plain Python objects from the YAML document.
with open(config_filepath, 'r') as f:
    config = yaml.safe_load(f)

In [2]:
# The location of the raw dataset comes from the `filepaths.data` entry of the config.
raw_data_path = config['filepaths']['data']
data = pd.read_csv(raw_data_path)

# Data Preparation

In [3]:
# Build the analytics base table on a copy so the loaded `data` frame is left untouched.
abt = data.copy()

# Carry-over from data consistency pipeline:
# round Age to whole numbers, then store it with pandas' nullable integer dtype.
rounded_age = data['Age'].round()
abt.loc[:, 'Age'] = rounded_age

data_types = dict(Age=pd.Int64Dtype())
abt = abt.astype(data_types)

## 1 Features to Use

In [4]:
# Columns kept in the analytics base table, grouped by how they are treated.
numerical_cols = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
categorical_cols = [
    'Survived',
    'Pclass',
    'Sex',
    'Embarked',
]

In [5]:
# Restrict the table to the chosen features: numerical columns first, then categorical.
selected_cols = [*numerical_cols, *categorical_cols]
abt = abt.loc[:, selected_cols]

## 2 Handling Missing Values

### Dropping Rows

In [6]:
# Keep only the rows in which Age is present.
has_age = abt['Age'].notna()
abt = abt.loc[has_age]

In [7]:
# Count the remaining missing values per column.
abt.isna().sum()

Age         0
SibSp       0
Parch       0
Fare        0
Survived    0
Pclass      0
Sex         0
Embarked    2
dtype: int64

### Imputing Missing Values

In [8]:
# Fill missing Embarked values with the most frequent value of the column
# (the first entry returned by `mode()`).
most_frequent_embarked = abt['Embarked'].mode().iloc[0]
abt.loc[:, 'Embarked'] = abt['Embarked'].fillna(most_frequent_embarked)

In [9]:
# Re-check the per-column missing-value counts after imputation.
abt.isna().sum()

Age         0
SibSp       0
Parch       0
Fare        0
Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

## 3 Handling Outliers

In [10]:
# Cap Fare from above: every value greater than the cap is replaced by the cap itself.
fare_cap = 50
fare_above_cap = abt['Fare'] > fare_cap
abt.loc[fare_above_cap, 'Fare'] = fare_cap

## 4 Normalization

No normalization is applied in this notebook; the features are saved unscaled.

## 5 Saving the Analytics Base Table (ABT)

In [31]:
# Persist the prepared table; the DataFrame index is not written to the file.
abt_output_path = 'data/abt.csv'
abt.to_csv(abt_output_path, index=False)