## Import a variety of libraries
- `numpy`
- `pyplot`
- `pandas`

In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load the dataset

In [3]:
# Load the ehresp_2014 DataFrame from the Excel workbook.
# FILE_PATH is relative, so it is resolved against the current working directory.
FILE_PATH = '../atus/ehresp_2014.xlsx'
ehresp_2014 = pd.read_excel(FILE_PATH)

## Explore the dataset

In [None]:
# Start exploring ehresp_2014: preview the first rows of the DataFrame
ehresp_2014.head()

In [None]:
# Summary statistics for ehresp_2014
ehresp_2014.describe()

## Select a subset of columns
- Rearrange the columns
    - Place BMI (`erbmi`), our target, in the final column.
    - Remove household ID (`tucaseid`) and respondent ID (`tulineno`), since those are not important for the present analysis.

In [None]:
# Columns to keep, in the order they should appear in `dataset`.
# The target (`erbmi`) is listed last; `tucaseid` and `tulineno` are left out.
selected_columns = [
    'eeincome1',
    'erhhch',
    'erincome',
    'erspemch',
    'ertpreat',
    'ertseat',
    'ethgt',
    'etwgt',
    'eudietsoda',
    'eudrink',
    'eueat',
    'euexercise',
    'euexfreq',
    'eufastfd',
    'eufastfdfrq',
    'euffyday',
    'eufdsit',
    'eufinlwgt',
    'eusnap',
    'eugroshp',
    'euhgt',
    'euinclvl',
    'euincome2',
    'eumeat',
    'eumilk',
    'euprpmel',
    'eusoda',
    'eustores',
    'eustreason',
    'eutherm',
    'euwgt',
    'euwic',
    'exincome1',
    'eugenhth',
    'erbmi',
]
dataset = ehresp_2014[selected_columns]

In [None]:
# Split the table by position: every column except the last one is a
# feature (X); the last column is the target (y).
column_count = len(dataset.columns)
X = dataset.iloc[:, :column_count - 1]
y = dataset.iloc[:, column_count - 1]

In [None]:
# Verify the shape of our data: features on the first line, target on the second
print(X.shape, y.shape, sep='\n')

## Classifying BMI
- NIH classifies BMI in the following ranges
    - Underweight: <18.5
    - Normal weight: 18.5–24.9 
    - Overweight: 25–29.9 
    - Obesity: BMI of 30 or greater

In [None]:
# Define a function to return BMI class
def get_bmi_class(bmi):
    """Return the NIH weight category for a BMI value.

    The categories follow the NIH ranges:

    - ``"Underweight"``: below 18.5
    - ``"Normal Weight"``: 18.5 up to, but not including, 25
    - ``"Overweight"``: 25 up to, but not including, 30
    - ``"Obese"``: 30 or greater

    The ranges are contiguous, so every positive BMI maps to exactly one
    category (no gaps at the 25 and 30 boundaries).

    Args:
        bmi: Body mass index as a number.

    Returns:
        One of the category labels above, or ``"Unknown"`` when ``bmi`` is
        NaN or not a positive number.
    """
    # `not bmi > 0` is True for zero, negatives and NaN (every comparison
    # with NaN is False). None of these can be a real BMI.
    # NOTE(review): the survey data presumably encodes missing BMI as a
    # negative sentinel -- confirm against the dataset's codebook.
    if not bmi > 0:
        return "Unknown"
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal Weight"
    if bmi < 30:
        return "Overweight"
    return "Obese"
    
# List comprehension: https://hackernoon.com/list-comprehension-in-python-8895a785550b
# Convert List to Series: https://stackoverflow.com/questions/21646791/convert-python-list-to-pandas-series
# Label every BMI value, wrap the labels in a Series, and turn the Series into a numpy array
bmi_labels = [get_bmi_class(bmi) for bmi in y]
y = np.asarray(pd.Series(bmi_labels))

## Split and normalize the dataset

In [None]:
# Hold out 30% of the rows as a testing set; the rest is the training set.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# Verify the shape of our split data: training features, then testing labels
print(X_train.shape, y_test.shape, sep='\n')

In [None]:
# Normalize the features
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# The scaler is fitted on the training set only; the test set is then
# transformed with that same fitted scaler rather than being re-fitted.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

## Use Logistic Regression to predict BMI

In [None]:
# Fit Logistic Regression to our training set
# (scaled features in X_train, BMI class labels in y_train)
from sklearn.linear_model import LogisticRegression
classifierObj = LogisticRegression(random_state=0)
classifierObj.fit(X_train, y_train)

In [None]:
# Make predictions on our test set: one predicted BMI class per test row
y_pred = classifierObj.predict(X_test)

## Score the model

In [None]:
# Accuracy: fraction of test rows whose predicted class matches the true class
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix: rows are the true classes, columns are the predicted classes
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

## Next steps
- Build visualizations
    - Data exploration and data profiling
    - Model evaluation
- Try different algorithms and hyperparameters
    - Classification
    - Regression
    - Clustering
- Ask a different question
    - Classification
        - Predict health status (`Excellent`, `Very Good`, `Good`, `Fair`, `Poor`)
    - Regression
        - Predict BMI (`erbmi`)
    - Clustering
        - E.g., behavioral, diet, activity, and health status or BMI category
- Try feature selection and dimensionality reduction
    - From a public health and policy standpoint, what behaviors and characteristics seem most related to a healthy BMI?
- Build pipelines
    - Automate feature selection, hyperparameter tuning, etc.