In [13]:
# Import libraries
import pandas as pd

# Relative path of the Excel workbook to load
FILE_PATH = '../atus/ehresp_2014.xlsx'

# Read the workbook into the ehresp_2014 DataFrame
ehresp_2014 = pd.read_excel(FILE_PATH)

from enum import Enum
class COLUMN_TYPES(Enum):
    """Kinds of column a preparation helper can be asked to return.

    Every member's value is a plain string, so members can also be looked up
    by value, e.g. ``COLUMN_TYPES('numerical')``.
    """
    # No trailing comma after the literal: `'numerical',` would make the
    # value the one-element tuple ('numerical',) instead of a string.
    NUMERICAL = 'numerical'
    CATEGORICAL = 'categorical'

# Columns treated as numerical features
numerical_columns = [
    'ertpreat', 'ertseat', 'euexfreq',
    'eufastfdfrq', 'euhgt', 'euwgt',
]

# Columns treated as categorical features
categorical_columns = [
    'eeincome1', 'erhhch', 'erincome', 'erspemch',
    'ethgt', 'etwgt',
    'eudietsoda', 'eudrink', 'eueat', 'euexercise',
    'eufastfd', 'euffyday', 'eufdsit', 'eusnap',
    'eugenhth', 'eugroshp', 'euinclvl', 'euincome2',
    'eumeat', 'eumilk', 'euprpmel', 'eusoda',
    'eustores', 'eustreason', 'eutherm', 'euwic',
]

# Column holding the value to predict (kept as a one-element list)
target_column = ['erbmi']

In [19]:
def split_data(dataset, train_fraction=0.7, seed=0):
    """Shuffle the rows of ``dataset`` and split them into train and test parts.

    The shuffle comes from a ``RandomState`` seeded with ``seed``, so any two
    inputs with the same number of rows are split on exactly the same row
    indices. The other helpers rely on this to keep features and targets
    aligned after they are split separately.

    Parameters
    ----------
    dataset : array-like
        Data whose first axis indexes samples. It is converted with
        ``np.asarray`` so that integer-array indexing always selects rows.
    train_fraction : float, default 0.7
        Fraction of the rows placed in the training part; the row count is
        truncated toward zero.
    seed : int, default 0
        Seed of the permutation used for shuffling.

    Returns
    -------
    list
        ``[dataset_train, dataset_test]`` as numpy arrays.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside the closed interval [0, 1].
    """
    import numpy as np

    if not 0 <= train_fraction <= 1:
        raise ValueError(
            'train_fraction must be between 0 and 1, got %r' % (train_fraction,))

    # Positional row indexing below requires an ndarray (a DataFrame would
    # interpret the integer array as column labels).
    dataset = np.asarray(dataset)

    length = dataset.shape[0]
    train_index_stop = int(length * train_fraction)
    permutation = np.random.RandomState(seed).permutation(length)

    # Rows before the stop index form the training part, the rest the test part
    train_permutation = permutation[:train_index_stop]
    test_permutation = permutation[train_index_stop:]

    dataset_train = dataset[train_permutation]
    dataset_test = dataset[test_permutation]

    return [dataset_train, dataset_test]

def prepare_categorical_columns(file, columns):
    """Encode the given categorical columns as dummy variables and split them.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame containing the columns named in ``columns``.
    columns : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    list
        ``[dataset_train, dataset_test]`` arrays of dummy columns, as
        produced by ``split_data``.
    """
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    # Pull the requested columns out of the frame as a plain array
    raw_values = file[columns].values

    # Label-encode every column independently
    # (LabelEncoder on multiple columns: https://stackoverflow.com/a/31939145)
    encoder = LabelEncoder()
    encoded_values = pd.DataFrame(raw_values).apply(encoder.fit_transform).values

    # Expand each encoded column into one dummy column per label
    # OHE for multiple columns using pd.get_dummies: https://stackoverflow.com/a/44601764
    # Another get_dummies example: http://queirozf.com/entries/one-hot-encoding-a-feature-on-a-pandas-dataframe-an-example
    # OHE vs. get_dummies: https://medium.com/@guaisang/handling-categorical-features-get-dummies-onehotencoder-and-multicollinearity-f9d473a40417
    encoded_frame = pd.DataFrame(encoded_values, columns=columns)
    one_hot_values = pd.get_dummies(encoded_frame, columns=columns).values

    # TODO: Determine if there are extraneous
    # dummy columns we want to drop after this.

    # Split into training and testing rows
    train_part, test_part = split_data(one_hot_values)
    return [train_part, test_part]

def prepare_numerical_columns(file, columns):
    """Split the given numerical columns and standardize them.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame containing the columns named in ``columns``.
    columns : list of str
        Names of the numerical columns to extract.

    Returns
    -------
    list
        ``[X_train, X_test]`` scaled arrays.
    """
    # Pull the requested columns out of the frame as a plain array
    raw_values = file[columns].values

    # Split before scaling so the scaler never sees the test rows while fitting
    train_raw, test_raw = split_data(raw_values)

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    # Fit on the training rows, then apply the same transform to the test rows
    train_scaled = scaler.fit_transform(train_raw)
    test_scaled = scaler.transform(test_raw)

    return [train_scaled, test_scaled]

# Define a function to return BMI class
def get_bmi_class(bmi):
    """Return the class label for a BMI value.

    Ranges (lower bound inclusive, upper bound exclusive):

    * below 18.5      -> "Excellent"
    * 18.5 up to 24.9 -> "Normal Weight"
    * 24.9 up to 29.9 -> "Overweight"
    * 29.9 and above  -> "Obese"

    Anything that satisfies none of the comparisons (e.g. NaN) is "Unknown".

    NOTE(review): values below 18.5 are labelled "Excellent"; that label is
    kept unchanged here because other code may depend on it - confirm
    whether "Underweight" was intended.
    """
    if bmi < 18.5:
        return "Excellent"
    if bmi < 24.9:
        return "Normal Weight"
    if bmi < 29.9:
        return "Overweight"
    # Starts exactly where "Overweight" ends; the earlier `bmi > 30` test left
    # every value in [29.9, 30] classified as "Unknown".
    if bmi >= 29.9:
        return "Obese"
    # Only reached when every comparison above is False (NaN input)
    return "Unknown"

def prepare_target_column(file, column, return_column_type):
    """Extract the target column, optionally map it to BMI classes, and split it.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame containing the target column.
    column : str or list of str
        Selector passed to ``file[...]`` to pick the target values.
    return_column_type : COLUMN_TYPES
        When equal to ``COLUMN_TYPES.CATEGORICAL`` every value is replaced by
        the label returned by ``get_bmi_class``; otherwise the raw values are
        kept.

    Returns
    -------
    list
        ``[dataset_train, dataset_test]`` as produced by ``split_data``.
    """
    import numpy as np
    import pandas as pd

    target_values = file[column].values

    wants_labels = return_column_type == COLUMN_TYPES.CATEGORICAL
    if wants_labels:
        # Map each BMI value to its class label
        labels = pd.Series(get_bmi_class(value) for value in target_values)
        target_values = np.asarray(labels)

    # Split into training and testing rows
    train_part, test_part = split_data(target_values)
    return [train_part, test_part]


def prepare_data(file,
                 categorical_columns,
                 numerical_columns,
                 target_column,
                 target_column_return_type):
    """Build train/test feature matrices and target arrays from ``file``.

    Categorical and numerical features are prepared separately, then placed
    side by side (numerical columns first). The shape of every returned array
    is printed as it becomes available.

    Parameters
    ----------
    file : pandas.DataFrame
        Source data.
    categorical_columns : list of str
        Columns handed to ``prepare_categorical_columns``.
    numerical_columns : list of str
        Columns handed to ``prepare_numerical_columns``.
    target_column : str or list of str
        Column selector handed to ``prepare_target_column``.
    target_column_return_type : COLUMN_TYPES
        Requested form of the target, forwarded to ``prepare_target_column``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]``.
    """
    import pandas as pd

    def _join_features(numerical_part, categorical_part):
        # Index-aligned outer merge: numerical columns on the left,
        # categorical columns on the right, returned as a plain array.
        numerical_frame = pd.DataFrame(numerical_part)
        categorical_frame = pd.DataFrame(categorical_part)
        joined = numerical_frame.merge(categorical_frame,
                                       how='outer',
                                       left_index=True,
                                       right_index=True)
        return joined.values

    categorical_train, categorical_test = \
        prepare_categorical_columns(file, categorical_columns)
    numerical_train, numerical_test = \
        prepare_numerical_columns(file, numerical_columns)

    X_train = _join_features(numerical_train, categorical_train)
    print('X_train shape:', X_train.shape)

    X_test = _join_features(numerical_test, categorical_test)
    print('X_test shape:', X_test.shape)

    y_train, y_test = prepare_target_column(file,
                                            target_column,
                                            target_column_return_type)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    return [X_train, X_test, y_train, y_test]

# Build the train/test features and the categorical target
X_train, X_test, y_train, y_test = prepare_data(
    file=ehresp_2014,
    categorical_columns=categorical_columns,
    numerical_columns=numerical_columns,
    target_column=target_column,
    target_column_return_type=COLUMN_TYPES.CATEGORICAL,
)

X_train shape: (7848, 142)
X_test shape: (3364, 142)
y_train shape: (7848,)
y_test shape: (3364,)




In [22]:
X_train

array([[ 0.83149402, -0.33369081,  0.83500393, ...,  1.        ,
         0.        ,  0.        ],
       [-1.16922465, -0.33369081, -0.98177202, ...,  0.        ,
         0.        ,  1.        ],
       [-0.11621482, -0.33369081,  0.53220794, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.34787698, -0.17628703, -0.98177202, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.77920286, -0.33369081, -0.37618003, ...,  0.        ,
         0.        ,  1.        ],
       [-0.22151581, -0.33369081,  0.22941195, ...,  1.        ,
         0.        ,  0.        ]])

In [21]:
y_train

array(['Overweight', 'Overweight', 'Excellent', ..., 'Overweight',
       'Normal Weight', 'Normal Weight'], dtype=object)