## Setup

In [49]:
from enum import Enum
class COLUMN_TYPES(Enum):
    """Kinds of predictor columns the notebook can be configured to use."""
    # No trailing comma after the literal: `'numerical',` would make the member's
    # value the one-element tuple ('numerical',) instead of the string.
    NUMERICAL = 'numerical'
    CATEGORICAL = 'categorical'

## Config

In [50]:
# Relative path of the Excel workbook that is loaded into `ehresp_2014`
FILE_PATH = '../atus/ehresp_2014.xlsx'
# Which group of predictor columns the rest of the notebook works with
# (a COLUMN_TYPES member).
# NOTE(review): the saved outputs further down (26 feature columns and the
# One Hot Encoding message) look like they came from a CATEGORICAL run --
# confirm and re-run the notebook so outputs match this setting.
SELECTED_COLUMNS = COLUMN_TYPES.NUMERICAL

## Load dataset
- Separate the dataset into categorical, numerical, and target DataFrames.
    - Having the categorical columns isolated will facilitate performing One Hot Encoding, whereas numerical columns do not need this manner of transformation.
    - However, since `erbmi` is our target column, we will omit it from both of these DataFrames. Instead, we'll create a separate `dataset_target` DataFrame, which we'll then merge with either the `dataset_categorical` or `dataset_numerical`.

In [51]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the ehresp_2014 DataFrame from the Excel workbook at FILE_PATH
ehresp_2014 = pd.read_excel(FILE_PATH)

# Predictor columns that are treated as categorical
categorical_columns = [
    'eeincome1',
    'erhhch',
    'erincome',
    'erspemch',
    'ethgt',
    'etwgt',
    'eudietsoda',
    'eudrink',
    'eueat',
    'euexercise',
    'eufastfd',
    'euffyday',
    'eufdsit',
    'eusnap',
    'eugenhth',
    'eugroshp',
    'euinclvl',
    'euincome2',
    'eumeat',
    'eumilk',
    'euprpmel',
    'eusoda',
    'eustores',
    'eustreason',
    'eutherm',
    'euwic',
]

# Predictor columns that are treated as numerical.
# `erbmi` is deliberately left out because the target is derived from it.
numerical_columns = [
    'ertpreat',
    'ertseat',
    'euexfreq',
    'eufastfdfrq',
    'euhgt',
    'euwgt',
]

# Kept as a one-element list so that selecting it yields a DataFrame
target_column = ['erbmi']

# Slice the raw frame into one DataFrame per column group
dataset_numerical = ehresp_2014.loc[:, numerical_columns]
dataset_categorical = ehresp_2014.loc[:, categorical_columns]
dataset_target = ehresp_2014.loc[:, target_column]

## Select categorical or numerical data and merge with target
- Select whether we want to work with categorical or numerical data.
- Merge our chosen dataset (either categorical or numerical) with our target column, relying on the indices of each dataframe to perform the merge.
    - [Pandas DataFrame Merge](https://stackoverflow.com/a/36539295)

In [52]:
# Choose which dataset we want to work with
if SELECTED_COLUMNS == COLUMN_TYPES.CATEGORICAL:
    dataset = dataset_categorical.copy()
elif SELECTED_COLUMNS == COLUMN_TYPES.NUMERICAL:
    dataset = dataset_numerical.copy()
else:
    # Fail loudly. Without this branch `dataset` is undefined on a fresh kernel
    # (NameError), or a stale `dataset` left over from an earlier run would be
    # merged with the target a second time.
    raise ValueError(
        'SELECTED_COLUMNS must be a COLUMN_TYPES member, got {!r}'.format(SELECTED_COLUMNS)
    )

# Merge our chosen dataset with the target column, aligning rows on the index.
# The target ends up as the last column of `dataset`.
dataset = dataset.merge(dataset_target,
                        how='outer',
                        left_index=True,
                        right_index=True)

## Get X & y and Train & Test
- Switch from DataFrames to matrices
- Pull out our predictors and target
- Calculate target classes
- Split and normalize data

In [53]:
# Split the merged frame into predictors (every column but the last)
# and the target (the last column), both as plain arrays
column_count = dataset.shape[1]
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# One Hot Encode categorical data.
# Opt-in toggle: it stays off by default so that `X` keeps its original column
# layout unless encoding is explicitly requested here.
ENABLE_ONE_HOT_ENCODING = False

if SELECTED_COLUMNS == COLUMN_TYPES.CATEGORICAL:
    if ENABLE_ONE_HOT_ENCODING:
        print('X.shape before OHE:', X.shape)

        # pd.get_dummies expands every listed column into one indicator column
        # per distinct value. It replaces the earlier LabelEncoder/OneHotEncoder
        # loop, which could not work: the result of np.append was discarded,
        # a pandas Series has no usable `reshape`, and OneHotEncoder's
        # `categorical_features` argument is no longer accepted by scikit-learn.
        X_frame = pd.DataFrame(X)
        X = pd.get_dummies(X_frame, columns=list(X_frame.columns)).values.astype(float)

        print('X.shape after OHE:', X.shape)
    else:
        print('One Hot Encoding skipped (ENABLE_ONE_HOT_ENCODING is False)')


# Prepare our target
# Define a function to return BMI class
def get_bmi_class(bmi):
    """Map a BMI value to a weight-class label.

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        "Excellent"     for 0 < bmi < 18.5
        "Normal Weight" for 18.5 <= bmi < 24.9
        "Overweight"    for 24.9 <= bmi < 29.9
        "Obese"         for bmi >= 29.9
        "Unknown"       for anything else (zero/negative placeholders, NaN)

    Notes
    -----
    Two gaps in the earlier version are closed here: values in [29.9, 30] used
    to fall through to "Unknown", and non-positive values used to be labelled
    "Excellent".
    """
    # A BMI cannot be zero or negative, so such values are placeholders rather
    # than measurements. NaN also fails `bmi > 0` and lands here.
    if not bmi > 0:
        return "Unknown"
    # NOTE(review): "Excellent" is kept byte-for-byte because other cells may
    # match on it; confirm whether "Underweight" was the intended label.
    if bmi < 18.5:
        return "Excellent"
    if bmi < 24.9:
        return "Normal Weight"
    if bmi < 29.9:
        return "Overweight"
    return "Obese"

# Replace each continuous BMI value in `y` with its class label
bmi_labels = pd.Series([get_bmi_class(bmi) for bmi in y])
y = np.asarray(bmi_labels)

# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


# Normalize numerical data
if SELECTED_COLUMNS == COLUMN_TYPES.NUMERICAL:
    from sklearn.preprocessing import StandardScaler
    # Fit the scaler on the training rows only, then apply the same
    # transformation to both splits
    sc_X = StandardScaler().fit(X_train)
    X_train = sc_X.transform(X_train)
    X_test = sc_X.transform(X_test)

# Examine the shape of the X & y Train & Test data
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

One Hot Encoding not yet implemented
(7848, 26)
(3364, 26)
(7848,)
(3364,)


In [54]:
# Examine our training set: wrap the array in a DataFrame and show its first rows
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,3,1,-1,0,0,-1,1,1,2,...,5,-1,-1,-1,2,2,-1,-1,-1,2
1,1,3,1,5,0,0,-1,1,1,1,...,5,-1,1,2,3,2,1,3,2,2
2,1,3,1,-1,0,0,-1,1,1,1,...,5,-1,1,2,1,2,1,1,2,-1
3,1,3,1,-1,0,0,-1,1,1,1,...,5,-1,-1,-1,2,2,-1,-1,-1,2
4,1,3,1,-1,0,0,-1,1,1,1,...,5,-1,1,1,1,2,1,2,2,2
