# Data wrangling

Import the data wrangler parent class.

In [1]:
from data_wranglers import DataWrangler

Define a dictionary of parameters to be attributed to the data object.

In [3]:
data_args = {
    # --- Bookkeeping -------------------------------------------------------
    # Label identifying this parameter set.
    'name': 'Boston housing prices (batch)',

    # --- Data --------------------------------------------------------------
    # Path of the file holding the data.
    'file': './Boston/data/data.csv',
    # Column(s) used as targets.
    'targets': ['price'],
    # Columns used as input features.
    'usecols': [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ],
    # Number of rows to load.
    'nrows': None,
    # Number of rows to skip.
    'skiprows': 0,
}

If need be, save the data parameters as JSON under `./Boston/params/data.json`.

In [4]:
from utilities import rw_data

# Path of the JSON file meant to hold the data parameters.
data_args_file = './Boston/params/data.json'
# Hand the path and the parameter dictionary to ``rw_data``. Presumably this
# writes ``data_args`` to ``data_args_file`` as JSON -- confirm against
# ``utilities.rw_data``.
rw_data(data_args_file, data_args)

## Define the default object.

Initialize the Boston data wrangler with its default parameters. Cf. `data_wranglers.DataWrangler`.

In [5]:
class BostonDataWrangler(DataWrangler):
    """Data wrangler preconfigured for the Boston housing prices data set.

    The constructor merges a set of baseline arguments with any keyword
    overrides (via ``utilities.parse_args``) and forwards the result to
    ``DataWrangler.__init__``.
    """

    def __init__(self, default_args=None, **kwargs):
        """Initialize the wrangler.

        Parameters
        ----------
        default_args : optional
            Baseline arguments passed as the first argument of
            ``utilities.parse_args``. When omitted (``None``), the Boston
            defaults below are used. The defaults are rebuilt on every call
            so that no dictionary or list is shared between instances.
            NOTE(review): elsewhere in this notebook a path to a JSON
            parameter file is passed here positionally; how that is
            interpreted is up to ``parse_args`` -- confirm there.
        **kwargs
            Arguments overriding the baseline ones.
        """
        from utilities import parse_args

        if default_args is None:
            # Built inside the call (not in the signature) to avoid a mutable
            # default argument shared across all instantiations.
            default_args = dict(
                # Bookkeeping
                name='Boston housing prices (default)',
                # File name
                file='./Boston/data/data.csv',
                # Target column
                targets=['price'],
                # Feature names
                usecols=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                         'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'],
                # Number of rows to load
                nrows=None,
                # Number of rows to skip
                skiprows=0)

        # Overwrite the default arguments with any new arguments passed via ``kwargs``.
        kwargs = parse_args(default_args, kwargs)

        super().__init__(**kwargs)

## Extract the data.

Extract the raw data (i.e., "human-readable", as opposed to "machine-readable") into `self.raw` and create a dictionary `self.specs` which describes the different variables.

In [7]:
def human_readable(self):
    """Load the raw data into ``self.raw`` and describe it in ``self.specs``.

    Reads ``self.file`` through ``utilities.rw_data``, keeping only the
    feature columns (``self.usecols``) and target columns (``self.targets``).
    The result is split into ``self.raw.input`` and ``self.raw.output``, and
    ``self.specs`` records a ``{'type': float}`` entry for every input and
    output column.
    """
    from utilities import dict_to_dot, rw_data

    # Row 0 is never skipped; the ``self.skiprows`` rows after it are.
    # The upper bound is ``skiprows + 1`` because ``range`` excludes its end:
    # ``range(1, n)`` would skip only ``n - 1`` rows (and none for ``n == 1``).
    # NOTE(review): assumes ``rw_data`` forwards ``parameters`` to
    # ``pandas.read_csv`` and that row 0 is the header line -- confirm.
    data = rw_data(
        self.file,
        parameters=dict(
            nrows=self.nrows,
            skiprows=range(1, self.skiprows + 1),
            usecols=self.usecols + self.targets))

    self.raw = dict_to_dot(
        {'input': data[self.usecols],
         'output': data[self.targets]})

    # Every variable is declared as a float.
    self.specs = dict_to_dot(
        {'input': {feature: {'type': float} for feature in self.raw.input.columns},
         'output': {target: {'type': float} for target in self.raw.output.columns}})


# Attach the function to the class as a method.
BostonDataWrangler.human_readable = human_readable

Create the default object.

In [8]:
data = BostonDataWrangler()

The input features are

In [9]:
data.raw.input.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


and the corresponding output targets are

In [10]:
data.raw.output.head()

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


## Specifying the parameters.

We can already create a data object in three ways. The first is by using the default parameters.

In [11]:
data = BostonDataWrangler()
data.name

'Boston housing prices (default)'

The second is by explicitly specifying certain parameters which overwrite the defaults.

In [12]:
data = BostonDataWrangler(name='Boston housing prices (CLI)')
data.name

'Boston housing prices (CLI)'

The third is by specifying an external file containing all the parameters. Cf. the dictionary defined above and saved at `data_args_file`.

In [14]:
# Format the message explicitly: printing the pieces with ``sep=''`` dropped
# the space between "at" and the path.
print('Use the data parameters at {}.'.format(data_args_file))
# Pass the parameter file path as the only positional argument.
data = BostonDataWrangler(data_args_file)
data.name

Use the data parameters at./Boston/params/data.json.


'Boston housing prices (batch)'

## Process the data

The actual data fed into the machine learning model is processed and stored as `self.{input, output}`, not the raw data `self.raw.{input, output}`. So far, however, the raw data has not been processed. 

In [16]:
from numpy import array_equal
# ``True`` only when both ``data.input`` and ``data.output`` are still equal,
# element for element, to their raw counterparts.
array_equal(data.input, data.raw.input) and array_equal(data.output, data.raw.output)

True

Define the pipeline which transforms the raw data to a format that is processed for ingestion by the machine learning algorithm. In the example below, the input is

* selected (as per scikit-learn's `SelectKBest` whereby `kBest` features are selected),
* reduced (by PCA to `n_components`), and finally
* normalized (by a standard scaler if `scaler` is not `None`).

The output, on the other hand, is only normalized.

The parameters `scaler`, `kBest` and `n_components` need to be passed to the data object. If any of them is `None`, then the corresponding operation is not performed. Cf. the methods `select`, `reduce`, and `normalize` of `data_wranglers.DataWrangler`.

In [17]:
def pipe(self):
    """Set up the processed arrays and the transformation pipelines.

    ``self.input`` becomes a copy of the raw input values and ``self.output``
    a copy of the first target column, reshaped into a single column.
    ``self.pipeline`` then holds two scikit-learn ``Pipeline`` objects:

    * ``input``: the steps returned by ``self.select()``, ``self.reduce()``
      and ``self.normalize(self.scaler.input)``, in that order;
    * ``output``: the single step returned by
      ``self.normalize(self.scaler.output)``.
    """
    from sklearn.pipeline import Pipeline
    from utilities import dict_to_dot

    # Work on copies so the raw data is left untouched.
    raw_features = self.raw.input.values
    self.input = raw_features.copy()

    # Only the first target is used; it is stored as an (n, 1) column.
    first_target = self.raw.output[self.targets[0]]
    self.output = first_target.values.copy().reshape(-1, 1)

    input_steps = [
        ('select', self.select()),
        ('reduce', self.reduce()),
        ('normalize', self.normalize(self.scaler.input)),
    ]
    input_pipeline = Pipeline(input_steps)

    output_steps = [('normalize', self.normalize(self.scaler.output))]
    output_pipeline = Pipeline(output_steps)

    self.pipeline = dict_to_dot(
        {'input': input_pipeline, 'output': output_pipeline})


# Attach the function to the class as a method.
BostonDataWrangler.pipe = pipe

### Select the natural features.

Let's select the top five features.

In [18]:
from utilities import dict_to_dot

# Keep the five best features; dimensionality reduction and scaling are
# switched off by passing ``None``.
data = BostonDataWrangler(
    kBest=5, 
    n_components=None, 
    scaler=dict_to_dot({'input': None, 'output': None}))

Selecting...
Reducing...
Normalizing...
Normalizing...


  y = column_or_1d(y, warn=True)


The input is now reduced to five features.

In [20]:
data.input.shape[1]

5

These features are

In [21]:
# Pair each feature name with the selector's boolean support mask, obtained
# through the public ``get_support()`` accessor rather than the private
# ``_get_support_mask()``, and keep the names whose mask entry is true.
[feature for feature, kept in zip(data.usecols, data.pipeline.input.named_steps['select'].get_support()) if kept]

['INDUS', 'RM', 'TAX', 'PTRATIO', 'LSTAT']

### Apply dimensionality reduction.

Similarly, we can perform PCA to condense the features into, say, a three-dimensional space.

In [22]:
from utilities import dict_to_dot

# Reduce the input to three components; feature selection and scaling are
# switched off by passing ``None``.
data = BostonDataWrangler(
    kBest=None, 
    n_components=3, 
    scaler=dict_to_dot({'input': None, 'output': None}))

# Number of columns of the processed input.
data.input.shape[1]

Selecting...
Reducing...
Normalizing...
Normalizing...


  y = column_or_1d(y, warn=True)


3

### Normalize the data.

In this final example, the output is normalized. Let's start by examining the first five target outputs before normalization.

In [23]:
data.output[:5]

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2]])

After normalization, they become:

In [24]:
from utilities import dict_to_dot

# Only the output scaler is set (``True``); selection, reduction and input
# scaling are switched off by passing ``None``.
data = BostonDataWrangler(
    kBest=None, 
    n_components=None, 
    scaler=dict_to_dot({'input': None, 'output': True}))

# First five processed target values.
data.output[:5]

Selecting...
Reducing...
Normalizing...
Normalizing...


  y = column_or_1d(y, warn=True)


array([[ 0.15968566],
       [-0.10152429],
       [ 1.32424667],
       [ 1.18275795],
       [ 1.48750288]])