In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Load and clean the data

In [2]:
# load the CSV file

df = pd.read_csv('500_Person_Gender_Height_Weight_Index.csv').set_index('Gender')
df.head()

Unnamed: 0_level_0,Height,Weight,Index
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,174,96,4
Male,189,87,2
Female,185,110,4
Female,195,104,3
Male,149,61,3


In [3]:
# check for null values

for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")


Column Height has 0 null values
Column Weight has 0 null values
Column Index has 0 null values


In [4]:
# check the unique values in "Index" column

df['Index'].unique()

array([4, 2, 3, 5, 1, 0])

In [5]:
# separate the indices into two categories: 'Normal' and 'At_risk'

x = {2: 'Normal'}
df = df.replace(x)


x = dict.fromkeys([0,1,3,4,5], 'At_Risk')  # dict.fromkeys used here, otherwise "not-hashable" error appears
df = df.replace(x)


In [6]:
df

Unnamed: 0_level_0,Height,Weight,Index
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,174,96,At_Risk
Male,189,87,Normal
Female,185,110,At_Risk
Female,195,104,At_Risk
Male,149,61,At_Risk
...,...,...,...
Female,150,153,At_Risk
Female,184,121,At_Risk
Female,141,136,At_Risk
Male,150,95,At_Risk


In [7]:
# create features and target variables

X = pd.get_dummies(df.drop(columns = 'Index'))
y = df['Index']

In [8]:
X.head()

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,174,96
Male,189,87
Female,185,110
Female,195,104
Male,149,61


In [9]:
# describe df

df.describe()

Unnamed: 0,Height,Weight
count,500.0,500.0
mean,169.944,106.0
std,16.375261,32.382607
min,140.0,50.0
25%,156.0,80.0
50%,170.5,106.0
75%,184.0,136.0
max,199.0,160.0


In [10]:
# y value counts

y.value_counts()

At_Risk    431
Normal      69
Name: Index, dtype: int64

In [11]:
# train all the variables (x/y, test/train)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
 
X_train.shape

(375, 2)

### Oversampling
In this section, I compared two oversampling algorithms to determine which algorithm results in the best performance. I oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, the folliowing steps were completed:

View the count of the target classes using Counter from the collections library. Use the resampled data to train a logistic regression model. Calculate the balanced accuracy score from sklearn.metrics. Print the confusion matrix from sklearn.metrics. Generate a classication report using the imbalanced_classification_report from imbalanced-learn. Note: A random state of 1 for each sampling algorithm is used to ensure consistency between tests.

#### Naive Random Oversampling 

In [12]:
# Resample the training data with the RandomOversampler

from imblearn.over_sampling import RandomOverSampler
naive_model = RandomOverSampler(random_state=1)
naive_X_resampled, naive_y_resampled = naive_model.fit_resample(X_train, y_train)
Counter(naive_y_resampled)  # no need for naive_model again after applied to the training variables

Counter({'At_Risk': 323, 'Normal': 323})

In [13]:
# train the logistic regression model with X/y resampled variables

from sklearn.linear_model import LogisticRegression 
naive_lr_model = LogisticRegression(solver='lbfgs', random_state=1)
naive_lr_model.fit(naive_X_resampled, naive_y_resampled)

LogisticRegression(random_state=1)

In [14]:
# predict

naive_y_pred = naive_lr_model.predict(X_test)

In [15]:
# calculate the balanced accuracy_score

naive_acc_score = balanced_accuracy_score(naive_y_pred, y_test)
naive_acc_score

0.7446236559139785

In [16]:
# confusion matrix

confusion_matrix(naive_y_pred, y_test)

array([[92,  1],
       [16, 16]])

In [17]:
# imbalanced classification report
print(classification_report_imbalanced(naive_y_pred, y_test))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.85      0.99      0.50      0.92      0.70      0.52        93
     Normal       0.94      0.50      0.99      0.65      0.70      0.47        32

avg / total       0.87      0.86      0.63      0.85      0.70      0.51       125



#### SMOTE Oversampling

In [18]:
# import SMOTE classifier
from imblearn.over_sampling import SMOTE

In [19]:
# define the classifier
smote_model = SMOTE(random_state=1)

In [20]:
# apply the classifier to X/y training variables
smote_X_resampled, smote_y_resampled = smote_model.fit_resample(X_train, y_train)

In [21]:
# fit LR model with resampled variables
from sklearn.linear_model import LogisticRegression
smote_lr_model = LogisticRegression(solver='lbfgs', random_state=1).fit(smote_X_resampled, smote_y_resampled)

In [22]:
# predict
smote_y_pred = smote_lr_model.predict(X_test)

In [23]:
# balanced accuracy score

smote_acc_score = balanced_accuracy_score(smote_y_pred, y_test)
smote_acc_score

0.7446236559139785

In [24]:
# confusion matrix 
confusion_matrix(smote_y_pred, y_test)

array([[92,  1],
       [16, 16]])

In [25]:
# imbalanced classification report

print(classification_report_imbalanced(smote_y_pred, y_test))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.85      0.99      0.50      0.92      0.70      0.52        93
     Normal       0.94      0.50      0.99      0.65      0.70      0.47        32

avg / total       0.87      0.86      0.63      0.85      0.70      0.51       125



### Undersampling
In this section, I tested an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. I undersampled the data using the RandomUnderSampler algorithm and complete the following steps:

Use the resampled data to train a logistic regression model.
Calculate the balanced accuracy score from sklearn.metrics.
Print the confusion matrix from sklearn.metrics.
Generate a classication report using the imbalanced_classification_report from imbalanced-learn.

Note: A random state of 1 for each sampling algorithm is used to ensure consistency between tests.

In [26]:
#import RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
#define the classifier
us_model = RandomUnderSampler(random_state=1)
# train the classifier
under_X_resampled, under_y_resampled = us_model.fit_resample(X_train, y_train)

In [27]:
# import Logistic Regression Model

from sklearn.linear_model import LogisticRegression

# define and train the LR model with the resampled variables 

under_lr_model = LogisticRegression(solver='lbfgs', random_state=1).fit(under_X_resampled, under_y_resampled)

In [28]:
# predict

under_y_pred = under_lr_model.predict(X_test)

In [29]:
# accuracy score

under_acc_score = balanced_accuracy_score(under_y_pred, y_test)
under_acc_score

0.75

In [30]:
# confusion matrix

confusion_matrix(under_y_pred, y_test)

array([[91,  0],
       [17, 17]])

In [31]:
# imbalanced classification report

print(classification_report_imbalanced(under_y_pred, y_test))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.84      1.00      0.50      0.91      0.71      0.53        91
     Normal       1.00      0.50      1.00      0.67      0.71      0.48        34

avg / total       0.89      0.86      0.64      0.85      0.71      0.51       125



### Combination (Over and Under) Sampling
In this section, I tested a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. I resampled the data using the SMOTEENN algorithm and complete the folliowing steps:

Use the resampled data to train a logistic regression model.
Calculate the balanced accuracy score from sklearn.metrics.
Print the confusion matrix from sklearn.metrics.
Generate a classication report using the imbalanced_classification_report from imbalanced-learn.

Note: A random state of 1 for each sampling algorithm isde to ensure consistency between tests.

In [32]:
# import smotteen
from imblearn.combine import SMOTEENN
# define the classifier
smoteenn_model = SMOTEENN(random_state=1)
# fit the classifier
comb_X_resampled, comb_y_resampled = smoteenn_model.fit_resample(X_train, y_train)

In [33]:
# import logistic regression model
from sklearn.linear_model import LogisticRegression
# define and fit the model
comb_lr_model = LogisticRegression(solver='lbfgs', random_state=1).fit(comb_X_resampled, comb_y_resampled)

In [34]:
# predict

comb_y_pred = comb_lr_model.predict(X_test)

In [35]:
# accuracy score

comb_acc_score = balanced_accuracy_score(comb_y_pred, y_test)
comb_acc_score

0.75

In [36]:
# confusion matrix

confusion_matrix(comb_y_pred, y_test)

array([[91,  0],
       [17, 17]])

In [37]:
# imbalanced classification report

print(classification_report_imbalanced(comb_y_pred, y_test))

                   pre       rec       spe        f1       geo       iba       sup

    At_Risk       0.84      1.00      0.50      0.91      0.71      0.53        91
     Normal       1.00      0.50      1.00      0.67      0.71      0.48        34

avg / total       0.89      0.86      0.64      0.85      0.71      0.51       125

