In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Load and clean the data

In [2]:
# Load the data

df = pd.read_csv('500_Person_Gender_Height_Weight_Index.csv').set_index('Gender')

# check for null values

for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")
    

Column Height has 0 null values
Column Weight has 0 null values
Column Index has 0 null values


In [3]:
# identify unique values for 'index' column
df_index = df['Index'].unique()
df_index

array([4, 2, 3, 5, 1, 0])

In [4]:
# Convert the target column values to the following groups based on their values

# Index : 0 - Extremely Weak 1 - Weak 2 - Normal 3 - Overweight 4 - Obesity 5 - Extreme Obesity

# replace 2 (normal)

x = {2: 'Normal'}   
df = df.replace(x)

# replace the rest; everything not 'normal' generalized as at_risk to group subjects into two categories. 

x = dict.fromkeys([0,1,3,4,5], 'At_Risk')    
df = df.replace(x)


In [5]:
# Create our features
X = pd.get_dummies(df.drop(columns='Index')) # get_dummies to encode values of the columns

# Create our target
y = df['Index']

In [24]:
X

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,174,96
Male,189,87
Female,185,110
Female,195,104
Male,149,61
...,...,...
Female,150,153
Female,184,121
Female,141,136
Male,150,95


In [6]:
df.head()

Unnamed: 0_level_0,Height,Weight,Index
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,174,96,At_Risk
Male,189,87,Normal
Female,185,110,At_Risk
Female,195,104,At_Risk
Male,149,61,At_Risk


In [7]:
df.describe()

Unnamed: 0,Height,Weight
count,500.0,500.0
mean,169.944,106.0
std,16.375261,32.382607
min,140.0,50.0
25%,156.0,80.0
50%,170.5,106.0
75%,184.0,136.0
max,199.0,160.0


In [8]:
# count the values, at_risk is the majority and normal is the minority

y.value_counts()

At_Risk    431
Normal      69
Name: Index, dtype: int64

In [9]:
# import train, test, split module
# create test and train variables 
# deccribe the shape

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(375, 2)

### Ensemble Learners
In this section, I compared two ensemble algorithms to determine which algorithm results in the best performance. I trained a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. The following steps were completed:

Train the model using the training data.
Calculate the balanced accuracy score from sklearn.metrics.
Print the confusion matrix from sklearn.metrics.
Generate a classication report using the imbalanced_classification_report from imbalanced-learn.

For the Balanced Random Forest Classifier, display feature importance sorted in descending order (most important to least important feature that influences the target) along with the feature score.

Note: A random state of 1 is used for each algorithm to ensure consistency between tests


#### Balanced Random Forest Classifier¶

In [10]:
# Resample the training data with the BalancedRandomForestClassifier


from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

brfc_model = BalancedRandomForestClassifier(n_estimators =128, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=128, random_state=1)

In [11]:
# Calculated the balanced accuracy score

y_pred_brfc = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brfc)

0.8671023965141612

In [12]:
# Display the confusion matrix

confusion_matrix(y_test, y_pred_brfc)

array([[92, 16],
       [ 2, 15]])

In [16]:
# imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.98      0.85      0.88      0.91      0.87      0.75       108
     Normal       0.48      0.88      0.85      0.62      0.87      0.75        17

avg / total       0.91      0.86      0.88      0.87      0.87      0.75       125



In [17]:
# List the features sorted in descending order by feature importance

feature_importance_brfc = pd.Series(data=brfc_model.feature_importances_,index=X.columns)
feature_importance_brfc.sort_values(ascending=False,inplace=True)
print(feature_importance_brfc)

Weight    0.752989
Height    0.247011
dtype: float64


#### Easy Ensemble AdaBoost Classifier


In [19]:
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=128, random_state=1)
ee_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=128, random_state=1)

In [20]:
# calculate the balanced accuracy score

y_pred_ee = ee_model.predict(X_test)

In [21]:
# confusion matrix

confusion_matrix(y_pred_ee, y_test)

array([[87,  2],
       [21, 15]])

In [23]:
# imbalanced classification report

print(classification_report_imbalanced(y_pred_ee, y_test))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.81      0.98      0.42      0.88      0.64      0.43        89
     Normal       0.88      0.42      0.98      0.57      0.64      0.38        36

avg / total       0.83      0.82      0.58      0.79      0.64      0.42       125

