## Import a variety of libraries
- `numpy`
- `pyplot`
- `pandas`

In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load the dataset

In [2]:
# Read the Excel workbook at FILE_PATH (a path relative to the notebook's
# working directory) into the ehresp_2014 DataFrame
FILE_PATH = 'atus/ehresp_2014.xlsx'
ehresp_2014 = pd.read_excel(io=FILE_PATH)

## Explore the dataset

In [3]:
# First look at the data: preview the first five rows of ehresp_2014
ehresp_2014.head(n=5)

Unnamed: 0,tucaseid,tulineno,eeincome1,erbmi,erhhch,erincome,erspemch,ertpreat,ertseat,ethgt,...,eumeat,eumilk,euprpmel,eusoda,eustores,eustreason,eutherm,euwgt,euwic,exincome1
0,20140101140007,1,-2,33.200001,1,-1,-1,30,2,0,...,1,2,1,-1,2,1,2,170,1,2
1,20140101140011,1,1,22.700001,3,1,-1,45,14,0,...,1,2,1,-1,1,2,2,128,2,0
2,20140101140028,1,2,49.400002,3,5,-1,60,0,0,...,-1,-1,2,2,-1,-1,-1,270,2,12
3,20140101140063,1,-2,-1.0,3,-1,-1,0,0,0,...,2,2,1,1,2,6,-1,-2,2,2
4,20140101140168,1,2,31.0,3,5,-1,65,0,0,...,1,2,1,2,1,1,2,210,1,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
ehresp_2014.describe()

Unnamed: 0,tucaseid,tulineno,eeincome1,erbmi,erhhch,erincome,erspemch,ertpreat,ertseat,ethgt,...,eumeat,eumilk,euprpmel,eusoda,eustores,eustreason,eutherm,euwgt,euwic,exincome1
count,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,...,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0,11212.0
mean,20140660000000.0,1.0,1.293525,26.29125,2.885212,2.036479,1.872547,65.678113,16.7599,-0.003122,...,0.529344,1.157867,1.464592,0.738494,0.788887,1.366572,0.844006,168.182572,0.51213,4.475027
std,345673500.0,0.0,0.845532,8.737788,0.443716,1.694437,2.915099,48.080541,50.643598,0.182376,...,0.985731,1.338405,0.691768,1.285256,1.301327,1.872375,1.408404,56.893436,1.481951,17.634628
min,20140100000000.0,1.0,-3.0,-1.0,1.0,-1.0,-1.0,0.0,-3.0,-1.0,...,-2.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-5.0,-3.0,-1.0
25%,20140300000000.0,1.0,1.0,23.0,3.0,1.0,-1.0,30.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,140.0,-1.0,0.0
50%,20140610000000.0,1.0,1.0,26.5,3.0,1.0,1.0,60.0,3.0,0.0,...,1.0,2.0,1.0,1.0,1.0,1.0,2.0,168.0,1.0,0.0
75%,20140910000000.0,1.0,2.0,30.4,3.0,3.0,5.0,90.0,15.0,0.0,...,1.0,2.0,2.0,2.0,1.0,2.0,2.0,200.0,2.0,0.0
max,20141210000000.0,1.0,3.0,73.599998,3.0,5.0,5.0,508.0,990.0,2.0,...,2.0,2.0,3.0,2.0,5.0,6.0,2.0,340.0,2.0,87.0


## Select a subset of columns
- Rearrange the columns
    - Place BMI (`erbmi`), our target, in the final column.
    - Remove household ID (`tucaseid`) and respondent ID (`tulineno`), since those are not important for the present analysis.

In [5]:
# Keep the columns used in the analysis, in this order. The household and
# respondent IDs (`tucaseid`, `tulineno`) are left out, and the target
# (`erbmi`) is deliberately listed last so it ends up in the final column.
dataset = ehresp_2014[[
    'eeincome1', 'erhhch', 'erincome', 'erspemch', 'ertpreat',
    'ertseat', 'ethgt', 'etwgt', 'eudietsoda', 'eudrink',
    'eueat', 'euexercise', 'euexfreq', 'eufastfd', 'eufastfdfrq',
    'euffyday', 'eufdsit', 'eufinlwgt', 'eusnap', 'eugroshp',
    'euhgt', 'euinclvl', 'euincome2', 'eumeat', 'eumilk',
    'euprpmel', 'eusoda', 'eustores', 'eustreason', 'eutherm',
    'euwgt', 'euwic', 'exincome1', 'eugenhth',
    'erbmi',
]]

In [6]:
# Separate the features (X) from the target (y)
column_count = len(dataset.columns)
y = dataset.iloc[:, -1]                    # the final column
X = dataset.iloc[:, :column_count - 1]     # every column before the final one

In [7]:
# Sanity check: X and y should have the same number of rows
print(X.shape, y.shape, sep='\n')

(11212, 34)
(11212,)


## Classifying BMI
- NIH classifies BMI in the following ranges
    - Underweight: <18.5
    - Normal weight: 18.5–24.9 
    - Overweight: 25–29.9 
    - Obesity: BMI of 30 or greater

In [8]:
# Define a function to return BMI class
def get_bmi_class(bmi):
    """Return the NIH weight category for a single BMI value.

    Categories follow the NIH ranges:
        - below 18.5          -> "Underweight"
        - 18.5 up to (not including) 25 -> "Normal Weight"
        - 25 up to (not including) 30   -> "Overweight"
        - 30 or greater       -> "Obese"

    Parameters
    ----------
    bmi : float
        Body mass index.

    Returns
    -------
    str
        One of "Underweight", "Normal Weight", "Overweight", "Obese",
        or "Unknown" when `bmi` is NaN or not a positive number.
    """
    # `erbmi` contains -1.0 entries (presumably a missing-data code), and a
    # real BMI is always positive. `not bmi > 0` also catches NaN, because
    # every comparison against NaN is False.
    if not bmi > 0:
        return "Unknown"
    # The cut-offs are contiguous, so every positive value lands in exactly
    # one category (no gap between 29.9 and 30, and 30 itself is "Obese").
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal Weight"
    if bmi < 30:
        return "Overweight"
    return "Obese"
    
# Build the class labels from the BMI column of `dataset` (its final column)
# rather than from `y` itself. Reading from the untouched source keeps this
# cell idempotent: re-running it no longer tries to classify strings that were
# already classified.
# Series.map applies get_bmi_class to each value; np.asarray converts the
# resulting Series into a numpy array.
y = np.asarray(dataset.iloc[:, -1].map(get_bmi_class))

## Split and normalize the dataset

In [9]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [10]:
# Sanity check the split: training features and test labels
print(X_train.shape, y_test.shape, sep='\n')

(7848, 34)
(3364,)


In [None]:
# Standardize the features. The scaler is fitted on the training rows only and
# then applied to both the training and the test rows.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

## Use Logistic Regression to predict BMI

In [None]:
# Train a logistic-regression classifier on the training set
from sklearn.linear_model import LogisticRegression

classifierObj = LogisticRegression(random_state=0)
classifierObj.fit(X=X_train, y=y_train)

In [None]:
# Predict a BMI class for every row of the test set
y_pred = classifierObj.predict(X=X_test)

## Score the model

In [None]:
# Accuracy: fraction of test rows whose predicted class matches the true class
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

## Next steps
- Build visualizations
    - Data exploration and data profiling
    - Model evaluation
- Try different algorithms and hyperparameters
    - Classification
    - Regression
    - Clustering
- Ask a different question
    - Classification
        - Predict health status (`Excellent`, `Very Good`, `Good`, `Fair`, `Poor`)
    - Regression
        - Predict BMI (`erbmi`)
    - Clustering
        - E.g., behavioral, diet, activity, and health status or BMI category
- Try feature selection and dimensionality reduction
    - From a public health and policy standpoint, what behaviors and characteristics seem most related to a healthy BMI?
- Build pipelines
    - Automate feature selection, hyperparameter tuning, etc.